[med-svn] [kmer-tools] 01/02: Imported Upstream version 0~20141114
Afif Elghraoui
afif-guest at moszumanska.debian.org
Sat May 2 02:29:31 UTC 2015
This is an automated email from the git hooks/post-receive script.
afif-guest pushed a commit to branch master
in repository kmer-tools.
commit b7ab98c4f059a9199ea14ad0a9198c2c455a34c8
Author: عفيف الغراوي <afif at ghraoui.name>
Date: Fri May 1 19:18:35 2015 -0700
Imported Upstream version 0~20141114
---
ESTmapper GSAC.pdf | Bin 0 -> 167434 bytes
ESTmapper GSAC.ppt | Bin 0 -> 128000 bytes
ESTmapper LaTeX/ESTmapper.tex | 1203 +
ESTmapper LaTeX/blurb | 32 +
ESTmapper LaTeX/filter.eps | 242 +
ESTmapper LaTeX/filter.fig | 83 +
ESTmapper LaTeX/hash-tables.tex | 200 +
ESTmapper LaTeX/hit-filtering.tex | 358 +
ESTmapper LaTeX/mRNAfilt.eps | 239 +
ESTmapper LaTeX/mRNAfilt.fig | 80 +
ESTmapper LaTeX/mrna-filter.ps | 978 +
ESTmapper LaTeX/mrna-filter.tex | 164 +
ESTmapper/ESTmapper.pl | 1462 +
ESTmapper/Make.include | 21 +
ESTmapper/configureESTmapper.pl | 416 +
ESTmapper/mergeCounts.C | 52 +
ESTmapper/runConcurrently.pl | 16 +
ESTmapper/scheduler.pm | 103 +
ESTmapper/terminate.C | 160 +
Make.include | 30 +
Make.rules | 316 +
Makefile | 194 +
Makefile.wiki | 1400 +
README.compiling | 104 +
README.leaff | 205 +
README.sim4db | 312 +
atac-driver/Make.include | 17 +
atac-driver/alignOverlap/Make.include | 35 +
atac-driver/alignOverlap/findDifferentScaffold.pl | 129 +
atac-driver/alignOverlap/overlap-annoList.H | 66 +
atac-driver/alignOverlap/overlap-find.C | 259 +
atac-driver/alignOverlap/overlap-matchTree.C | 69 +
atac-driver/alignOverlap/overlap-matchTree.H | 38 +
atac-driver/alignOverlap/overlap-printAnno.C | 95 +
atac-driver/alignOverlap/overlap-process.C | 105 +
atac-driver/alignOverlap/overlap-sort.C | 65 +
atac-driver/alignOverlap/overlap-span.H | 94 +
atac-driver/alignOverlap/overlap-spanTree.H | 125 +
atac-driver/alignOverlap/overlap-stats.H | 96 +
atac-driver/alignOverlap/overlap.C | 181 +
atac-driver/alignOverlap/overlap.H | 82 +
atac-driver/alignOverlap/summarizeDisagree.pl | 100 +
atac-driver/atac.pl | 912 +
atac-driver/chainer/Make.include | 56 +
atac-driver/chainer/halign/halign.C | 556 +
atac-driver/chainer/halign/halign.H | 54 +
atac-driver/chainer/halign/halignDriver.C | 58 +
atac-driver/chainer/halign/halignmodule.C | 60 +
atac-driver/chainer/localalign/GF_ALN_dpaligner.C | 175 +
atac-driver/chainer/localalign/GF_ALN_local.C | 1189 +
atac-driver/chainer/localalign/GF_ALN_local.H | 274 +
.../chainer/localalign/GF_ALN_loverlapper.C | 599 +
atac-driver/chainer/localalign/GF_ALN_overlap.C | 848 +
atac-driver/chainer/localalign/GF_ALN_pieceOlap.C | 529 +
.../localalign/localAlignerInterfacemodule.C | 234 +
atac-driver/chainer/python/AtacDriver.py | 602 +
atac-driver/chainer/python/AtacDriver.txt | 217 +
atac-driver/chainer/python/AtacFile.py | 96 +
atac-driver/chainer/python/DNA.py | 52 +
atac-driver/chainer/python/IdxStore.py | 208 +
atac-driver/chainer/python/MatchRecord.py | 234 +
atac-driver/chainer/python/MyFile.py | 104 +
atac-driver/chainer/python/PerfectRuns.py | 254 +
atac-driver/chainer/python/TrimMatchOverlaps.py | 291 +
atac-driver/chainer/python/UniqueFilter.py | 305 +
atac-driver/chainer/python/dedashMatches.py | 144 +
atac-driver/chainer/python/fillIntraRunGaps.py | 362 +
atac-driver/chainer/python/mkstats.py | 76 +
atac-driver/chainer/python/squeezeIntraRunGaps.py | 512 +
atac-driver/chimera/Make.include | 16 +
atac-driver/chimera/happy-clones-span-clumps.C | 484 +
.../chimera/use-clumps-to-detect-chimera.pl | 138 +
atac-driver/clumpMaker/Make.include | 16 +
atac-driver/clumpMaker/clumpMaker.C | 385 +
atac-driver/config.py | 11 +
atac-driver/gapShifter/Make.include | 30 +
atac-driver/gapShifter/alignUnmapped.C | 176 +
atac-driver/gapShifter/cleanAtac.C | 168 +
atac-driver/gapShifter/coalesceMatches.C | 71 +
atac-driver/gapShifter/correctGaps.C | 301 +
atac-driver/gapShifter/extractSequence.C | 135 +
.../gapShifter/extractUnmapped-sim4dbFixer.pl | 63 +
atac-driver/gapShifter/extractUnmapped.C | 600 +
atac-driver/gapShifter/gapShifter.C | 787 +
.../projectFeatures-test-cases/test-rev.atac | 2 +
.../projectFeatures-test-cases/test.atac | 2 +
.../projectFeatures-test-cases/test.atac.log | 10 +
.../projectFeatures-test-cases/test.ataf | 23 +
.../gapShifter/projectFeatures-test-cases/test.log | 0
atac-driver/gapShifter/projectFeatures.C | 208 +
atac-driver/gapShifter/testAtac.C | 107 +
atac-driver/interscaffold-gaps.pl | 134 +
atac-driver/lengthFilter/Make.include | 16 +
atac-driver/lengthFilter/lengthFilter.C | 118 +
atac-driver/libatac/Make.include | 30 +
atac-driver/libatac/atac.H | 130 +
atac-driver/libatac/atacFeature.C | 123 +
atac-driver/libatac/atacFeature.H | 78 +
atac-driver/libatac/atacFeatureList.C | 119 +
atac-driver/libatac/atacFeatureList.H | 52 +
atac-driver/libatac/atacFile.C | 300 +
atac-driver/libatac/atacFileStreamMerge.C | 172 +
atac-driver/libatac/atacMatch.C | 154 +
atac-driver/libatac/atacMatch.H | 72 +
atac-driver/libatac/atacMatchList.C | 51 +
atac-driver/libatac/atacMatchList.H | 49 +
atac-driver/libatac/atacMatchOrder.C | 218 +
atac-driver/libatac/atacMatchOrder.H | 99 +
atac-driver/libatac/fasta-accessor-test.C | 259 +
atac-driver/libatac/fasta-accessor.H | 213 +
atac-driver/makeplot.pl | 430 +
atac-driver/matchExtender/Make.include | 16 +
atac-driver/matchExtender/match.H | 127 +
atac-driver/matchExtender/matchExtender-dump.C | 125 +
atac-driver/matchExtender/matchExtender-func.C | 551 +
atac-driver/matchExtender/matchExtender.C | 214 +
atac-driver/mismatchCounter/Make.include | 16 +
atac-driver/mismatchCounter/mismatchCounter.C | 208 +
atac-driver/relabel.pl | 125 +
atac-driver/rewriteUIDs.pl | 81 +
atac-driver/run-comparison.pl | 242 +
atac-driver/run-length-histogram.pl | 39 +
atac-driver/run-length-n50.pl | 52 +
atac-driver/runatac.pl | 50 +
atac-driver/statsGenerator/Make.include | 16 +
atac-driver/statsGenerator/stats-to-xls.pl | 551 +
atac-driver/statsGenerator/statsGenerator.C | 755 +
atac-driver/test/uf-test-1f.atac | 10 +
atac-driver/test/uf-test-1r.atac | 10 +
atac-driver/test/uf-test-2.atac | 16 +
atac-driver/test/uf-test-3.atac | 27 +
atac-driver/uniqueFilter/Make.include | 16 +
atac-driver/uniqueFilter/uniqueFilter.C | 842 +
configure.sh | 382 +
leaff/Make.include | 18 +
leaff/blocks.C | 50 +
leaff/dups.C | 154 +
leaff/fragmenter.C | 191 +
leaff/gc.C | 86 +
leaff/leaff.C | 809 +
leaff/partition.C | 208 +
leaff/simseq.C | 227 +
leaff/stats.C | 126 +
libbio/Make.include | 65 +
libbio/alphabet-acgtspace.c | 149 +
libbio/alphabet-colorspace.c | 121 +
libbio/alphabet-generate.c | 134 +
libbio/alphabet.c | 271 +
libbio/alphabet.h | 25 +
libbio/bio++.H | 23 +
libbio/bio.h | 47 +
libbio/halign.c | 467 +
libbio/kmer.C | 497 +
libbio/kmer.H | 160 +
libbio/kmerhuge.H | 396 +
libbio/kmeriface.H | 83 +
libbio/kmertiny.H | 147 +
libbio/merCovering.H | 353 +
libbio/merList.H | 94 +
libbio/mers.h | 63 +
libbio/reversecomplement.c | 44 +
libbio/test/Makefile | 98 +
libbio/test/halign-test.C | 48 +
libbio/test/test-bigmer-msf.C | 222 +
libbio/test/test-setbits.C | 28 +
libkmer/Make.include | 35 +
libkmer/driver-existDB.C | 227 +
libkmer/driver-posDB.C | 287 +
libkmer/existDB-create-from-fasta.C | 271 +
libkmer/existDB-create-from-meryl.C | 230 +
libkmer/existDB-create-from-sequence.C | 271 +
libkmer/existDB-state.C | 205 +
libkmer/existDB.C | 182 +
libkmer/existDB.H | 151 +
libkmer/kmer-mask.C | 716 +
libkmer/merTable.H | 76 +
libkmer/percentCovered.C | 66 +
libkmer/positionDB-access.C | 344 +
libkmer/positionDB-dump.C | 50 +
libkmer/positionDB-file.C | 211 +
libkmer/positionDB-mismatch.C | 388 +
libkmer/positionDB-sort.C | 150 +
libkmer/positionDB.C | 1125 +
libkmer/positionDB.H | 241 +
libkmer/test/Makefile | 115 +
libkmer/test/test-maskonly.C | 110 +
libkmer/test/test-mertable.C | 15 +
libkmer/test/test-rebuild.C | 50 +
libmeryl/Make.include | 15 +
libmeryl/libmeryl.C | 490 +
libmeryl/libmeryl.H | 185 +
libseq/Make.include | 34 +
libseq/fastaFile.C | 585 +
libseq/fastaFile.H | 66 +
libseq/fastaStdin.C | 265 +
libseq/fastaStdin.H | 56 +
libseq/fastqFile.C | 593 +
libseq/fastqFile.H | 66 +
libseq/fastqStdin.C | 276 +
libseq/fastqStdin.H | 60 +
libseq/merStream.C | 84 +
libseq/merStream.H | 99 +
libseq/selftest.C | 53 +
libseq/seqCache.C | 197 +
libseq/seqCache.H | 106 +
libseq/seqFactory.C | 60 +
libseq/seqFactory.H | 33 +
libseq/seqFile.H | 54 +
libseq/seqStore.C | 622 +
libseq/seqStore.H | 120 +
libseq/seqStream.C | 396 +
libseq/seqStream.H | 124 +
libseq/sffFile.C | 208 +
libseq/sffFile.H | 104 +
libseq/test-correctSequence.H | 151 +
libseq/test-merStream.C | 284 +
libseq/test-seqCache.C | 181 +
libseq/test-seqStream.C | 287 +
libseq/test/Makefile | 23 +
libseq/test/test-merstream-speed.C | 52 +
libsim4/Make.include | 78 +
libsim4/sim4.H | 9 +
libsim4/sim4core/CHANGES | 26 +
.../sim4core/GlimmerModels/acceptors.162.neg.icm | Bin 0 -> 44332 bytes
.../sim4core/GlimmerModels/acceptors.162.pos.icm | Bin 0 -> 44332 bytes
libsim4/sim4core/GlimmerModels/donors.162.neg.icm | Bin 0 -> 44332 bytes
libsim4/sim4core/GlimmerModels/donors.162.pos.icm | Bin 0 -> 44332 bytes
libsim4/sim4core/Make.include | 47 +
libsim4/sim4core/Xtend1.C | 574 +
libsim4/sim4core/align.C | 848 +
libsim4/sim4core/exon.H | 178 +
libsim4/sim4core/exon_cores.C | 134 +
libsim4/sim4core/extend.C | 331 +
libsim4/sim4core/glimmerSplice.C | 491 +
libsim4/sim4core/glimmerSplice.H | 36 +
libsim4/sim4core/greedy.C | 358 +
libsim4/sim4core/mspManager.C | 628 +
libsim4/sim4core/mspManager.H | 237 +
libsim4/sim4core/pluri_align.C | 324 +
libsim4/sim4core/poly.C | 571 +
libsim4/sim4core/sim4.H | 652 +
libsim4/sim4core/sim4b1-1.C | 118 +
libsim4/sim4core/sim4b1-2.C | 84 +
libsim4/sim4core/sim4b1-3.C | 116 +
libsim4/sim4core/sim4b1-4.C | 121 +
libsim4/sim4core/sim4b1.C | 333 +
libsim4/sim4core/sim4b1_s.C | 100 +
libsim4/sim4core/sim4b1_s.H | 32 +
libsim4/sim4core/sim4b1a.C | 102 +
libsim4/sim4core/sim4command.C | 282 +
libsim4/sim4core/sim4command.H | 148 +
libsim4/sim4core/sim4defines.H | 50 +
libsim4/sim4core/sim4parameters.C | 56 +
libsim4/sim4core/sim4parameters.H | 208 +
libsim4/sim4core/sim4string.C | 887 +
libsim4/sim4core/sites.C | 820 +
libsim4/sim4core/sites_acceptor.C | 2402 +
libsim4/sim4core/sites_acceptor.H | 11 +
libsim4/sim4core/sites_donor.C | 1537 +
libsim4/sim4core/sites_donor.H | 11 +
libsim4/sim4core/sites_score.C | 2572 ++
libsim4/sim4core/sites_score.H | 13 +
libsim4/sim4core/splice.C | 791 +
libsim4/sim4core/table.C | 174 +
libsim4/sim4core/util.C | 832 +
libsim4/sim4polish/Make.include | 38 +
libsim4/sim4polish/sim4polish-compare.C | 406 +
libsim4/sim4polish/sim4polish-copy.C | 129 +
libsim4/sim4polish/sim4polish-deleteexon.C | 112 +
libsim4/sim4polish/sim4polish-exons.C | 67 +
libsim4/sim4polish/sim4polish-polishtostring.C | 403 +
libsim4/sim4polish/sim4polish-read.C | 181 +
libsim4/sim4polish/sim4polish-stringtopolish.C | 444 +
libsim4/sim4polish/sim4polish-updatescores.C | 233 +
libsim4/sim4polish/sim4polish.C | 34 +
libsim4/sim4polish/sim4polish.H | 287 +
libsim4/sim4polish/sim4polish.pm | 254 +
libsim4/sim4polish/sim4polishBuilder.C | 264 +
libsim4/sim4polish/sim4polishBuilder.H | 43 +
libsim4/sim4polish/sim4polishFile.C | 317 +
libsim4/sim4polish/sim4polishFile.H | 107 +
libsim4/sim4polish/sim4polishList.C | 95 +
libsim4/sim4polish/sim4polishList.H | 37 +
libsim4/sim4polish/sim4polishReader.C | 102 +
libsim4/sim4polish/sim4polishReader.H | 29 +
libsim4/sim4polish/sim4polishWriter.C | 181 +
libsim4/sim4polish/sim4polishWriter.H | 62 +
libutil/Make.include | 62 +
libutil/NOTES | 10 +
libutil/bigQueue.C | 343 +
libutil/bigQueue.H | 150 +
libutil/bitOperations.h | 157 +
libutil/bitPackedArray.C | 100 +
libutil/bitPackedArray.H | 318 +
libutil/bitPackedFile.C | 473 +
libutil/bitPackedFile.H | 127 +
libutil/bitPacking.h | 510 +
libutil/bzipBuffer.C | 238 +
libutil/bzipBuffer.H | 92 +
libutil/eliasDeltaEncoding.h | 33 +
libutil/eliasGammaEncoding.h | 33 +
libutil/endianess.H | 64 +
libutil/fibonacciEncoding.h | 171 +
libutil/fibonacciNumbers.C | 108 +
libutil/file.c | 446 +
libutil/generalizedUnaryEncoding.h | 116 +
libutil/intervalList.H | 675 +
libutil/kazlib/Make.include | 27 +
libutil/kazlib/blast.pl | 33 +
libutil/kazlib/dict.c | 1238 +
libutil/kazlib/dict.h | 142 +
libutil/kazlib/docs/CHANGES | 290 +
libutil/kazlib/docs/MUST_READ | 25 +
libutil/kazlib/docs/README | 66 +
libutil/kazlib/docs/docs.ist | 4 +
libutil/kazlib/docs/docs.ltx | 4155 ++
libutil/kazlib/drivers/dict-main.c | 300 +
libutil/kazlib/drivers/except-main.c | 57 +
libutil/kazlib/drivers/hash-main.c | 187 +
libutil/kazlib/drivers/list-main.c | 152 +
libutil/kazlib/drivers/sfx-main.c | 41 +
libutil/kazlib/except.c | 347 +
libutil/kazlib/except.h | 147 +
libutil/kazlib/hash.c | 837 +
libutil/kazlib/hash.h | 238 +
libutil/kazlib/list.c | 766 +
libutil/kazlib/list.h | 152 +
libutil/kazlib/sfx.c | 1138 +
libutil/kazlib/sfx.h | 46 +
libutil/logMsg.H | 115 +
libutil/md5.c | 441 +
libutil/mt19937ar/Make.include | 23 +
libutil/mt19937ar/mt19937ar-test.c | 38 +
libutil/mt19937ar/mt19937ar.c | 189 +
libutil/mt19937ar/mt19937ar.h | 47 +
libutil/mt19937ar/mt19937ar.out | 736 +
libutil/mt19937ar/mt19937ar.readme | 74 +
libutil/mt19937ar/tt800.c | 64 +
libutil/palloc.c | 236 +
libutil/qsort_mt.c | 406 +
libutil/readBuffer.C | 284 +
libutil/readBuffer.H | 86 +
libutil/recordFile.C | 320 +
libutil/recordFile.H | 65 +
libutil/speedCounter.C | 61 +
libutil/speedCounter.H | 77 +
libutil/splitToWords.H | 117 +
libutil/sweatShop.C | 587 +
libutil/sweatShop.H | 81 +
libutil/test/Makefile | 106 +
libutil/test/atomic.C | 62 +
libutil/test/endianess.c | 124 +
libutil/test/order.C | 84 +
libutil/test/tcat.C | 86 +
libutil/test/test-bigQueue.C | 72 +
libutil/test/test-bitPackedArray.C | 152 +
libutil/test/test-bitPackedFile.C | 271 +
libutil/test/test-bitPacking.C | 337 +
libutil/test/test-bzipBuffer.C | 110 +
libutil/test/test-freeDiskSpace.c | 16 +
libutil/test/test-intervalList.C | 322 +
libutil/test/test-logMsg.C | 27 +
libutil/test/test-md5.c | 47 +
libutil/test/test-mmap.c | 80 +
libutil/test/test-palloc.c | 65 +
libutil/test/test-readBuffer.C | 135 +
libutil/test/test-recordFile.C | 64 +
libutil/test/test-types.c | 34 +
libutil/uint32List.H | 62 +
libutil/unaryEncoding.h | 76 +
libutil/unaryEncodingTester.C | 199 +
libutil/util++.H | 46 +
libutil/util.c | 115 +
libutil/util.h | 356 +
meryl/Make.include | 43 +
meryl/args.C | 589 +
meryl/asmMerQC-regions.C | 1023 +
meryl/asmMerQC.C | 396 +
meryl/asmMerQC.sh | 166 +
meryl/binaryOp.C | 176 +
meryl/build-threads.C | 94 +
meryl/build.C | 842 +
meryl/compare-counts.C | 233 +
meryl/dump.C | 156 +
meryl/estimate.C | 182 +
meryl/m-heap.H | 152 +
meryl/m.C | 118 +
meryl/mapMers-depth.C | 139 +
meryl/mapMers.C | 210 +
meryl/maskMers.C | 591 +
meryl/merge.C | 240 +
meryl/merge.listmerge.C | 447 +
meryl/merge.qsort.C | 471 +
meryl/mervin.C | 793 +
meryl/meryl.C | 72 +
meryl/meryl.H | 128 +
meryl/simple.C | 164 +
meryl/test/Makefile | 37 +
meryl/test/exhaustive.C | 171 +
meryl/test/kmerlite.H | 133 +
meryl/test/stupidcount.C | 38 +
meryl/test/test-seq1.fasta | 8 +
meryl/test/test-seq2.fasta | 18 +
meryl/test/test-seq3.fasta | 2 +
meryl/testPositionBias.C | 117 +
meryl/unaryOp.C | 59 +
seagen/Make.include | 54 +
seagen/aHit.C | 119 +
seagen/aHit.H | 30 +
seagen/analysis/dumpScores.pl | 50 +
seagen/analysis/plotScoresSingly.pl | 71 +
seagen/configuration.C | 345 +
seagen/configuration.H | 107 +
seagen/encodedQuery.C | 231 +
seagen/encodedQuery.H | 120 +
seagen/filterEST-complicated.C | 279 +
seagen/filterEST.C | 275 +
seagen/filterESTsimple.C | 71 +
seagen/filterMRNA.C | 94 +
seagen/filterNULL.C | 24 +
seagen/filtertest.C | 325 +
seagen/hitConverter.C | 77 +
seagen/hitMatrix-sort.C | 82 +
seagen/hitMatrix.C | 658 +
seagen/hitMatrix.H | 156 +
seagen/hitReader.C | 273 +
seagen/hitReader.H | 82 +
seagen/misc/dumpCrapSeqs.C | 42 +
seagen/misc/f.C | 137 +
seagen/misc/h.C | 105 +
seagen/posix.H | 0
seagen/searchGENOME.C | 119 +
seagen/searchGENOME.H | 29 +
seagen/searcherState.H | 31 +
seagen/sortHits.C | 297 +
seagen/test/encodedQueryTest.C | 48 +
seagen/test/intervalList-test.C | 110 +
seagen/thr-deadlock.C | 470 +
seagen/thr-loader.C | 25 +
seagen/thr-output.C | 71 +
seagen/thr-search.C | 76 +
seatac/Make.include | 41 +
seatac/configuration.C | 285 +
seatac/encodedQuery.C | 83 +
seatac/filter-heavychains.C | 315 +
seatac/filter-nop.C | 186 +
seatac/filterObj.H | 228 +
seatac/heavychains-driver.C | 183 +
seatac/heavychains.C | 191 +
seatac/heavychains.H | 462 +
seatac/hitMatrix-sort.C | 82 +
seatac/hitMatrix.C | 367 +
seatac/hitMatrix.H | 112 +
seatac/posix.H | 0
seatac/seatac.C | 269 +
seatac/seatac.H | 168 +
seatac/sharedObj.H | 72 +
seatac/statObj.H | 73 +
seatac/summarizeAtacStats.pl | 155 +
seatac/thr-deadlock.C | 77 +
seatac/thr-loader.C | 81 +
seatac/thr-search.C | 173 +
sim4db/Make.include | 17 +
sim4db/sim4th.C | 601 +
sim4dbutils/LOG | 186 +
sim4dbutils/Make.include | 120 +
sim4dbutils/README | 25 +
sim4dbutils/cleanPolishes-20020626.C | 302 +
.../dbEST-intronSize-histogram | 45484 +++++++++++++++++++
.../cleanPolishes-experiments/evalThresh-gnuplot | 23 +
.../cleanPolishes-experiments/evalThresh-plot.pl | 25 +
.../cleanPolishes-experiments/evalThresh.dat | 49 +
.../cleanPolishes-experiments/evalThresh.pl | 9 +
.../cleanPolishes-experiments/evalThresh.pl.out | 637 +
.../cleanPolishes-experiments/evalThresh.ps | 1217 +
.../cleanPolishes-experiments/intronstats.pl | 137 +
sim4dbutils/cleanPolishes.C | 503 +
sim4dbutils/comparePolishes.C | 524 +
sim4dbutils/convertPolishes.C | 57 +
sim4dbutils/convertToAtac.C | 334 +
sim4dbutils/convertToExtent.C | 132 +
sim4dbutils/coveragehack.C | 224 +
sim4dbutils/depthOfPolishes.C | 118 +
sim4dbutils/detectChimera.C | 172 +
sim4dbutils/doc.txt | 82 +
sim4dbutils/filterPolishes.C | 297 +
sim4dbutils/fixPolishesIID.C | 128 +
sim4dbutils/headPolishes.C | 63 +
sim4dbutils/mappedCoverage.C | 250 +
sim4dbutils/mergePolishes.C | 144 +
sim4dbutils/parseSNP.C | 592 +
sim4dbutils/pickBestPair.C | 599 +
sim4dbutils/pickBestPolish.C | 444 +
sim4dbutils/pickUniquePolish-nhgri.C | 713 +
sim4dbutils/pickUniquePolish.C | 382 +
sim4dbutils/plotCoverageVsIdentity.C | 48 +
sim4dbutils/plotIntronSize.C | 104 +
sim4dbutils/realignPolishes.C | 264 +
sim4dbutils/removeDuplicate.C | 142 +
sim4dbutils/removeRedundant.C | 265 +
sim4dbutils/reportAlignmentDifferences.C | 205 +
sim4dbutils/s4p_overlap.C | 41 +
sim4dbutils/s4p_overlap.H | 20 +
sim4dbutils/sortPolishes.C | 368 +
sim4dbutils/summarizePolishes.C | 253 +
sim4dbutils/test/parsesnp-correct-parsed | 8 +
sim4dbutils/test/parsesnp-gen.fasta | 10 +
sim4dbutils/test/parsesnp-snp.fasta | 32 +
sim4dbutils/trimExons.C | 210 +
sim4dbutils/trimSequencesBasedOnMatches.C | 138 +
sim4dbutils/uniqPolishes.C | 102 +
sim4dbutils/vennPolishes.C | 192 +
snapper/Make.include | 33 +
snapper/configuration.C | 275 +
snapper/eval/domap14.sh | 91 +
snapper/eval/filter-validate.pl | 60 +
snapper/hitMatrix-sort.C | 57 +
snapper/hitMatrix.C | 426 +
snapper/snapper2-sge.pl | 171 +
snapper/snapper2.C | 490 +
snapper/snapper2.H | 468 +
snapper/test/Makefile | 21 +
snapper/thr-filter.C | 126 +
snapper/thr-polish-dp.C | 489 +
snapper/thr-polish.C | 360 +
snapper/thr-search.C | 281 +
tapper/Make.include | 22 +
tapper/compare.pl | 266 +
tapper/tagger.C | 505 +
tapper/tapper.C | 1157 +
tapper/tapperAlignment.H | 46 +
tapper/tapperComputation.H | 221 +
tapper/tapperGlobalData.H | 217 +
tapper/tapperHit.H | 240 +
tapper/tapperResult.H | 580 +
tapper/tapperTag.H | 302 +
tapper/tapperThreadData.H | 73 +
tapper/tapperconvert.C | 101 +
tapper/tappererrorcorrect.C | 277 +
tapper/tappermerge.C | 60 +
tapper/tappersort.C | 308 +
trie/Make.include | 21 +
trie/trie.C | 330 +
543 files changed, 171413 insertions(+)
diff --git a/ESTmapper GSAC.pdf b/ESTmapper GSAC.pdf
new file mode 100644
index 0000000..6ff2c17
Binary files /dev/null and b/ESTmapper GSAC.pdf differ
diff --git a/ESTmapper GSAC.ppt b/ESTmapper GSAC.ppt
new file mode 100644
index 0000000..ebecfb6
Binary files /dev/null and b/ESTmapper GSAC.ppt differ
diff --git a/ESTmapper LaTeX/ESTmapper.tex b/ESTmapper LaTeX/ESTmapper.tex
new file mode 100644
index 0000000..74e2d22
--- /dev/null
+++ b/ESTmapper LaTeX/ESTmapper.tex
@@ -0,0 +1,1203 @@
+\documentclass[twoside,11pt]{book}
+\usepackage{amsmath,amssymb}
+\usepackage{moreverb}
+\usepackage{fancyheadings}
+\usepackage{ulem}
+\usepackage{parskip}
+\usepackage{calc,ifthen,epsfig}
+\sloppy
+
+%
+% a mathematician is a machine that transforms coffee into theorems
+% a software engineer is a machine that transforms sugar and caffeine into software
+% a manager is a machine that transforms people into power point
+%
+
+\usepackage{longtable}
+
+% A few float parameters
+%
+\renewcommand{\dbltopfraction}{0.9}
+\renewcommand{\dblfloatpagefraction}{0.9}
+%\renewcommand{\textfraction}{0.05}
+
+
+\begin{document}
+
+
+
+\pagestyle{fancy}
+
+\rhead[]{}
+\chead[ESTmapper]{ESTmapper}
+\lhead[\today]{\today}
+
+
+\newcommand{\ESTmapper}{{\sc ESTmapper}}
+
+\normalem
+
+%\title{ESTmapper documentation\\
+%{\small or, why algorithmists shouldn't write manuals}}
+%\author{
+%Liliana Florea\thanks{liliana.florea at celera.com},
+%Brian P. Walenz\thanks{brian.walenz at celera.com}}
+%
+%\maketitle
+
+\tableofcontents
+%\listoffigures
+%\listoftables
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Introduction}
+\label{chap:intro}
+
+\if false
+The ESTMapper is a software package designed to efficiently map large EST data
+sets to a target genome. For each cDNA (EST or full-length mRNA) sequence
+in the input set, it will determine a set of instances of the EST in the
+target genome in a three-stage process. Stage I, 'signal finding', is an
+efficient similarity search which identifies potential EST-containing regions
+in the reference genome. In Stage II, 'signal filtering', regions containing
+weak signals are removed based on the extent of the cDNA matched and the
+number of regions. Stage III, 'signal polishing' uses an
+enhanced version of Sim4 to produce spliced alignments between the
+query EST sequence and each of the remaining genomic regions.
+\fi
+
+
+\ESTmapper\ is a software package designed to efficiently map large
+cDNA data sets to a target genome.
+%
+A three-stage process is used to locate each cDNA sequence in the
+target genome.
+%
+% For each cDNA (EST or full-length mRNA) sequence
+%in the input set, \ESTmapper\ will locate the cDNA sequence in the
+%target genome in a three-stage process.
+%
+The first stage, {\em signal finding}, is an efficient sequence
+similarity search which identifies regions on the genome which could
+potentially contain the cDNA sequence.
+%potential EST-containing regions
+%in the reference genome.
+%
+The second stage, {\em signal filtering}, discards regions containing
+weak signals based on the extent of the cDNA matched and the number of
+candidate genomic regions.
+%
+The final stage, {\em signal polishing}, uses an enhanced version of
+the {\tt Sim4} program to produce spliced alignments between the
+cDNA sequences and their associated genomic regions.
+
+\section{Features}
+
+\ESTmapper\ offers the following features for high-throughput mapping
+of cDNA sequences to genomic sequences:
+\begin{itemize}
+%[Input]
+\item Simple input presentation, as multi-fasta files.
+\item Requires no pre-processing of sequences (typical procedures
+include vector and quality trimming, contaminant screening, assigning
+quality values, and repeat masking).
+%[Output]
+\item Output formatted as easy-to-parse flat files.
+\item When converted to XML-feature files, the results can be viewed
+using Celera's Genome Browser, or loaded into a database.
+\item Output filtered into three user-specified quality levels
+corresponding to {\it good, full-length}, {\it good, but short} and
+{\it low quality}.
+%\item Flexible parameters for the quality of reported matches.
+%[USER INTERFACE]
+\item Choice of pre-packaged or fully customizable mapping procedures.
+%[Implementation]
+\item Parallel operation to take advantage of multi-processor environments.
+%[Algorithmics]
+\item The search stage employs a proprietary ultra-fast near-identity
+search program, which uses an efficient k-mer index to quickly
+identify match seeds.
+\item The combined search and filtering stages offer high sensitivity at
+relatively low computational cost.
+\item The differential filtering for mRNA and EST sequences takes full
+advantage of their mapping characteristics to reduce the computational
+cost for polishing false positives.
+\item Efficient run-time screening for repetitive elements.
+\item Extensions and improvements to the industry-standard EST-to-genome
+alignment program Sim4:
+\begin{itemize}
+\item detection of multiple occurrences of the query in the genomic sequence
+\item improved input and output mechanisms for high-throughput processing
+of different sequences
+\item better memory management allows for processing of large sequences
+\end{itemize}
+\item Whole chromosomal sequences can be used --- no segmentation of
+the genomic sequences is necessary. Consequently, matches are not
+pruned to fit in fixed size intervals, which allows arbitrarily long
+introns.
+\end{itemize}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Installation}
+\label{chap:install}
+
+{\tt bzip2 -dc ESTmapper.tar.bz2 | tar -xf - }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Software Overview}
+\label{chap:overview}
+
+This chapter provides an overview of the \ESTmapper\ process.
+Understanding the process will assist in operation of \ESTmapper, and
+is necessary for using the advanced modes of operation.
+
+\ESTmapper\ is comprised of five stages.
+
+\begin{tabular}{|c|l|l|}
+\hline
+Stage & Name & Description \\
+\hline
+\hline
+1 & Preparation & Prepare the input files. \\
+2 & Searching & Locate cDNA signals in the genome. \\
+3 & Filtering & Remove weak signals. \\
+4 & Polishing & Resolve signals into spliced alignments. \\
+5 & Output & Assemble the output and collect statistics. \\
+\hline
+\end{tabular}
+
+The stages are implemented so that they will not recompute
+previously computed results. This makes it easy to chain the stages
+together in a pipeline, or to associate specific hardware with a
+stage. For example, the compute intensive stage 4 can be processed on
+a fast, but expensive computer, while the I/O intensive stage 5 can be
+processed on an inexpensive workstation.
+
+The stages are explained in the sections that follow.
+
+{\bf A CPU-hour is defined as one hour of processing on one
+processor of a 500MHz Compaq ES40. Timing is given only to give a
+feeling for the expense of a particular stage.}
+
+\subsection*{Preparation}
+
+{\bf Prepare the input files.} This stage prepares the input to
+improve the efficiency of later stages. First, an index is built for
+each sequence file to allow random access to the sequences. Second,
+the genomic sequences are examined and grouped into approximately
+equally-sized groups to allow the search stage to execute in a specific
+memory footprint.
+
+This stage typically takes only a few minutes, and requires no
+significant resources.
+
+\subsection*{Searching}
+
+{\bf Search the genome for cDNA signals.} This stage executes the
+search algorithm for each group of sequences determined in the
+previous stage. The search algorithm uses a fast, but memory
+intensive, data structure to find all common $20$-mers between each
+cDNA sequence and each genomic sequence. Genomic regions which
+contain words consistent with an exon model are reported as potential
+cDNA containing regions.
+
+Because the search algorithm needs to use large amounts of memory to
+execute efficiently, it is multi-threaded.
+
+By default, this phase requires 4GB of main memory. For {\tt dbEST}
+size EST input, it will generate 70GB of output, and requires 50 CPU
+hours. For {\tt RefSeq} size mRNA input, it will generate {\bf XXXGB}
+of output, and requires a few CPU hours.
+
+\subsection*{Filtering}
+
+{\bf Filter the signals.} The signal filtering stage examines the
+output generated by the search, and discards regions that
+are relatively weak among the candidate regions for the same cDNA sequence.
+
+The ESTMapper implements two different protocols for filtering EST and
+full-length mRNA matches, respectively, taking into account the
+different characteristics of the two types of sequences. ESTs are
+shorter and less accurate due to sequencing errors and contamination
+with vector sequences, and therefore will generate weaker
+imprints on the genome than full-length mRNAs. They are also more
+likely to contain repeat sequences. Consequently, a more sensitive
+filter should be used. In contrast, full-length mRNA sequences are
+longer and more accurate, and in general have fewer expected occurrences
+on the genome. This makes it easier to differentiate between the false
+positive and true signals, and therefore the filter can be more specific.
+
+All candidate regions produced by the search are scored based on the
+portion of the cDNA match they contain, and the highest scoring of
+these will be selected for polishing. For EST input, at most 100
+regions are selected for each query. If the number of regions exceeds
+this threshold, weak signals are discarded, and the procedure is
+repeated. ESTs with more than 100 candidate regions after the second
+filter are labeled as containing repetitive elements, and for these no
+regions are selected.
+
+For mRNAs, a fixed portion at the top of the scoring range is selected,
+and all regions with scores in this interval are selected in a first phase.
+In addition, all regions containing at least a fraction $p$ of the mRNA
+will be chosen.
+
+As mentioned, for EST input the \ESTmapper\ is able to detect
+repeat-containing cDNA sequences. {\bf Experiments
+indicate that sequences flagged as such are confirmed by RepeatMasker.
+-- do we want to include some evidence?}
+
+This stage requires three CPU hours, and four wall-clock hours to
+process {\tt dbEST} size EST input. It does not have any significant
+memory requirements, however, it makes heavy use of disk.
+
+It generates 6GB of output.
+
+After this phase completes, the full output of the search is no longer
+needed, and can be removed.
+
+\subsection*{Polishing}
+
+{\bf Polish the filtered signals.} This stage applies the {\tt Sim4}
+algorithm to each cDNA-genomic region to generate a spliced-alignment.
+The output presents in a condensed form information about the
+boundaries of exons and introns in the two sequences, predicted intron
+orientations, sequence similarity scores for the global and for the
+individual exons' alignments, and other sequence and alignment
+statistics. The complete list is reviewed in
+Section~\ref{subsec:matchformat}.
+
+{\bf NEEDS WORK ON MEMORY USAGE! How much? When?}
+
+{\tt dbEST} size EST input requires approximately 600 CPU hours, and
+generates 10GB of output. When processing large (more than 120Mb) genomic
+sequences, each {\tt Sim4} process can use over 1GB of memory.
+
+{\tt RefSeq} size mRNA inputs can be polished in a few CPU hours.
+
+\subsection*{Output}
+
+{\bf Process the output.} This stage collects the
+output from the polishing stage, and performs a final quality-based
+filtering of the matches and the cDNA.
+
+Matches are classified as ``good'', ``good, but short'' or
+``low-quality'' based on two statistics: {\em query-sequence
+identity}, and {\em alignment-sequence identity}.
+
+\begin{tabular}{|p{1.7in}|p{3.0in}|}
+\hline
+query-sequence identity & the percentage of nucleotides in the cDNA, excluding the polyA(T) tails, exactly matching the genomic sequence \\
+alignment-sequence identity & the percentage of nucleotide matches in the spliced alignment \\
+\hline
+\end{tabular}
+
+The ESTMapper will generate all spliced alignments of matches
+identified in the search stage which have at least $p$ percent
+alignment-sequence identity, and at least $c$ percent query-sequence
+identity.
+
+\begin{tabular}{|p{1.7in}|p{3.0in}|}
+\hline
+good & believed with high confidence \\
+good, but short & would be believed with high confidence, except that
+ only a small piece of the cDNA sequence matched \\
+low-quality & a match was reported from the polishing stage, but
+ the percent alignment-sequence identity is low \\
+\hline
+\end{tabular}
+
+Note that it is possible for a cDNA sequence to have matches in any
+number of categories, for example, the true match would be labeled as
+``good'', a partial match would be placed as ``good, but short'', and
+a paralogous match might be placed in ``low-quality''.
+
+cDNA sequences are classified as ``good'', ``good, but short'',
+``low-quality'', ``missing'', or ``zero'', based on the quality of the
+best match for the cDNA, the lack of a match, or the lack of a cDNA
+signal, respectively.
+
+\begin{tabular}{|p{1.7in}|p{3.0in}|}
+\hline
+good & the best match for these sequences is classified as ``good'' \\
+good, but short & the best match for these sequences is classified as ``good, but short'' \\
+low-quality & the best match for these sequences is classified as ``low-quality'' \\
+missing & signals were detected, but polishing did not generate
+ any matches. Had a match been produced, they would have been
+ ``very low-quality''. \\
+zero & no signals were detected. These cDNA sequences are probably not
+ present on the genome. \\
+\hline
+\end{tabular}
+
+Unlike the classification of matches, each cDNA is classified into exactly one category.
+
+Section~\ref{sec:quality} discusses quality.
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Getting Started}
+\label{chap:start}
+
+{\bf
+Example of running an est search, example of an mrna search (using the
+same genomic). Layout of the directories.
+}
+
+Is the label a useful concept? Why not just use a full directory for
+each mapping. Simpler. The only thing we gain with labels is that we
+skip configuration, which takes minutes anyway. Where to put the
+config files? genome partition can go into search, and just leave the
+genome sequence in the same place.
+
+%
+% Liliana suggested to put the example in the distribution, then to
+% refer to the files. I disagree; you can't read it offline. We
+% should still provide some sample data for playing around.
+%
+
+\section{Small-scale EST Mapping}
+
+In this section we demonstrate \ESTmapper\ by mapping a small set of ESTs to a 1Mb genomic region.
+
+%{\tt ESTmapper.pl -mapest /dev5/walenz/FY
+% /dev5/walenz/ESTs/dbEST\_human\_01.fasta
+% /dev5/walenz/SCF/FY.fasta}
+
+The \ESTmapper\ command line
+\small
+\begin{verbatim}
+ESTmapper.pl -mapest /dev5/walenz/FY \
+ /dev5/walenz/ESTs/dbEST_human_01.fasta \
+ /dev5/walenz/SCF/FY.fasta
+\end{verbatim}
+\normalsize
+says to map ESTs, creating the directory {\tt /dev5/walenz/FY} for
+work and output files, reading ESTs from {\tt
+/dev5/walenz/ESTs/dbEST\_human\_01.fasta} and genomic sequences from
+{\tt /dev5/walenz/SCF/FY.fasta}.
+
+This particular EST set contains about {\bf 30,000} ESTs. The
+genomic sequence is a {\bf 1Mb} scaffold.
+
+The screen output from \ESTmapper\ is explained next.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: Performing a configure.
+ESTmapper/configure-- Use about 3800MB -> 398458880 bases per chunk.
+ESTmapper/configure-- Generating the info for '/dev5/walenz/FY/0-input/genomic.fasta'
+ESTmapper/configure-- WARNING: This is done in the work directory!
+ESTmapper/configure-- Created group with 590724 bases.
+\end{verbatim}
+\normalsize
+
+\ESTmapper\ is performing its configure phase. It is grouping genomic sequences
+into groups with no more than 398,458,880 bases, and it estimates that the
+search process will require about 3800MB to compute.
+
+The warning refers to the fact that the index files for the genomic sequences
+do not already exist, and that they will be created and stored in the work
+directory, not with the original file.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: Performing a search.
+ESTmapper/search-- Local mode requested; 1 processes.
+ESTmapper/search-- search 000
+ESTmapper: searchGENOME required 1.792912 seconds system time.
+ESTmapper: searchGENOME required 40.95784 seconds user time.
+ESTmapper: Search script finished in 19 wall-clock seconds.
+\end{verbatim}
+\normalsize
+
+\ESTmapper\ is performing the search phase. This input requires only
+one search process. Time statistics are reported. The search
+algorithm used about 43 seconds of CPU time, and the entire search
+phase took 19 wall-clock seconds. The search algorithm is capable of
+using multiple processors, which explains why it used more CPU time
+than wall-clock time.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: Performing a filter.
+ESTmapper/search-- Merging counts.
+ESTmapper/search-- Writing counts.
+\end{verbatim}
+\normalsize
+
+As each search process outputs the number of signals detected for each
+cDNA sequence, after all search processes finish, these counts are
+merged together, for use in the filtering phase.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper/filter-- Filtering.
+ESTmapper/filterEST-- uniqThresh= 100 reptThresh= 100 qualityThresh=0.20
+ESTmapper/filterEST-- UNIQ: 3996( 9979) FILT: 0( 0/ 0) REPT: 0( 0/ 0)
+ESTmapper/filter-- Sorting.
+\end{verbatim}
+\normalsize
+
+\ESTmapper\ is now filtering the signals. Filtering is explained in
+detail in Chapter~\ref{chap:filtering}.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: Performing a polish.
+ESTmapper/polish-- Creating scripts with 500 lines in each.
+ESTmapper/polish-- Created 020 scripts.
+ESTmapper/polish-- Running locally, 4 at a time.
+ESTmapper: sim4db required 188.127737 seconds wall-clock time.
+ESTmapper: sim4db required 18.927568 seconds system time.
+ESTmapper: sim4db required 112.960288 seconds user time.
+ESTmapper: Polish script finished in 50 wall-clock seconds.
+\end{verbatim}
+\normalsize
+
+\ESTmapper\ is performing the {\tt Sim4} polishing of signals. It
+creates 20 batches, with each batch containing 500 signals to
+process\footnote{Yes, except probably for the last one.}. The
+polishing is run on the local hardware, using four processors.
+
+Like the search phase, the polishing phase reports statistics on
+the time used. In this example, the {\tt Sim4} processes needed a
+total of 188 wall clock seconds, and 130 CPU seconds. The polishing
+stage required 50 wall-clock seconds.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: Performing an assembleOutput.
+ESTmapper/assembleOutput-- WARNING: 'short' quality levels too low for existing polishing!
+ESTmapper/assembleOutput-- WARNING: Polished at percent query-sequence identity = 45, requested filtration at 0.
+ESTmapper/assembleOutput-- WARNING: Polished at percent align-sequence identity = 85, requested filtration at 95.
+ESTmapper/assembleOutput-- filtering polishes by quality.
+ESTmapper/assembleOutput-- GOOD: percent query-sequence identity: 50
+ESTmapper/assembleOutput-- GOOD: percent align-sequence identity: 95
+ESTmapper/assembleOutput-- SHORT: percent query-sequence identity: 0
+ESTmapper/assembleOutput-- SHORT: percent align-sequence identity: 95
+ESTmapper/assembleOutput-- finding 'good' cDNA.
+ESTmapper/assembleOutput-- finding 'good, but short' cDNA.
+ESTmapper/assembleOutput-- finding 'low quality' cDNA.
+ESTmapper/assembleOutput-- finding 'repeat' cDNA.
+ESTmapper/assembleOutput-- finding 'zero hit' cDNA.
+ESTmapper/assembleOutput-- finding 'missing' cDNA.
+ESTmapper/assembleOutput-- counting 'good' matches.
+ESTmapper/assembleOutput-- counting 'good, but short' matches.
+ESTmapper/assembleOutput-- counting 'all the good' matches.
+ESTmapper/assembleOutput-- counting 'low quality' matches.
+ESTmapper/assembleOutput-- counting cDNA.
+ESTmapper: assembleOutput script finished in 52 wall-clock seconds.
+\end{verbatim}
+\normalsize
+
+\ESTmapper\ is processing the output from the polishing, classifying
+matches and cDNA. We can safely ignore the warning; it is telling us
+that even though we requested filtration down to 0\% query-sequence
+identity, polishes only exist down to 45\%.
+
+\footnotesize
+\begin{verbatim}
+ESTmapper: script finished everything in 122 wall-clock seconds.
+\end{verbatim}
+\normalsize
+
+Finally, the script is completed. Our mapping finished in a little
+over two minutes.
+
+The contents of the output directory are explained in
+Chapter~\ref{chap:output}, but we'll quickly peek at the
+statistics contained in the {\tt summary} file:
+
+\footnotesize
+\begin{verbatim}
+GOOD:
+cDNA-genomic matches 51 matches (51 different cDNA and 1 genomic)
+Matches per cDNA 1 matches/cDNA
+Matches per genomic 51 matches/genomic
+
+GOOD but SHORT:
+cDNA-genomic matches None.
+
+ALL THE GOOD: (both 'GOOD' and 'GOOD but SHORT')
+cDNA-genomic matches 51 matches (51 different cDNA and 1 genomic)
+Matches per cDNA 1 matches/cDNA
+Matches per scaffold 51 matches/genomic
+
+LOW-QUALITY:
+cDNA-genomic matches 2712 matches (564 different cDNA and 1 genomic)
+Matches per cDNA 4.80851063829787 matches/cDNA
+Matches per scaffold 2712 matches/genomic
+
+cDNA COUNTS:
+cDNA: 39182
+cDNA-good: 51
+cDNA-goodshort: 0
+cDNA-lowquality: 563
+cDNA-missing: 3382
+cDNA-zero: 35186
+\end{verbatim}
+\normalsize
+
+This is fully described in Section~\ref{sec:summary}.
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Input Files}
+\label{chap:input}
+
+cDNA and genomic sequences are read from two multi-FastA format files.
+
+EST-type and full-length mRNA-type sequences are processed differently,
+therefore the input data should be a priori separated in files by these
+categories.
+
+It is {\em not} necessary to repeat mask the sequences. \ESTmapper's mechanism
+will be able to identify and separate those sequences that contain highly
+repetitive elements.
+
+It is {\em not} necessary to fragment the genomic sequences. \ESTmapper\
+can process full-length chromosomal sequences, which allows it to identify
+maximal cDNA matches even when long introns are present.
+
+It is {\em not} necessary to quality- and vector-trim the EST sequences, but
+doing so will increase the accuracy of the match statistics. In principle, the
+quality thresholds used for validating and classifying the matches include a
+margin of error that could account for the effects of such factors.
+
+\section{High Frequency $k$-Mer Masking}
+
+The search phase in \ESTmapper\ ignores $k$-mers that occur at least
+1000 times in the genomic sequence.
+
+Computing the list of $k$-mers to ignore requires large amounts of
+memory and CPU, and is {\em not} performed by \ESTmapper. Lists
+appropriate for human and mouse are provided as data files. See the
+{\tt -maskmers} entry in Advanced Usage.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Output Files}
+\label{chap:output}
+
+The work directory contains several files and directories:
+
+\setlongtables
+\begin{longtable}{|l|p{3.4in}|}
+\hline
+File Name & Description \\
+\hline
+\hline
+\endhead
+\hline
+\endfoot
+0-input & Symbolic links to input files, and any indices and other files needed by later stages. \\
+\hline
+1-search & The temporary work directory for the search step. The contents are undocumented. \\
+\hline
+2-filter & The temporary work directory for the filter step. The contents are undocumented.\\
+\hline
+3-polish & The temporary work directory for the polishing step. The contents are undocumented.\\
+\hline
+cDNA-good.fasta & These cDNA sequences were successfully mapped.\\
+\hline
+cDNA-goodshort.fasta & These cDNA sequences were mapped at the correct percent alignment-sequence identity, but below the desired percent query-sequence identity.\\
+\hline
+cDNA-lowquality.fasta & These cDNA sequences were mapped, but at low percent query- and alignment-sequence identities.\\
+\hline
+cDNA-missing.fasta & These cDNA sequences had at least one signal detected, but the signal(s) were spurious. The polishing step did not find a match.\\
+\hline
+cDNA-repeat.fasta & These cDNA sequences were classified as repeat-containing by the filter.\\
+\hline
+cDNA-zero.fasta & These cDNA sequences had no signals detected.\\
+\hline
+polishes-good & All the ``good'' polishes.\\
+\hline
+polishes-goodshort & All the ``good, but short'' polishes.\\
+\hline
+polishes-lowquality & All the remaining polishes.\\
+\hline
+summary & A summary of the mapping.\\
+\end{longtable}
+
+\subsection{Summary File}
+\label{sec:summary}
+
+The {\tt summary} file describes the results of the mapping.
+
+\footnotesize
+\begin{verbatim}
+GOOD: 50% composite, 95% identity
+cDNA-genomic matches 4028809 matches (3060666 different cDNA and 5377 genomic)
+Matches per cDNA 1.31631775567801 matches/cDNA
+Matches per genomic 749.267063418263 matches/genomic
+
+GOOD but SHORT: 0% composite, 95% identity
+cDNA-genomic matches 26825 matches (22017 different cDNA and 1614 genomic)
+Matches per cDNA 1.21837670890675 matches/cDNA
+Matches per genomic 16.6201982651797 matches/genomic
+
+ALL THE GOOD: (both 'GOOD' and 'GOOD but SHORT')
+cDNA-genomic matches 4055634 matches (3071297 different cDNA and 5461 genomic)
+Matches per cDNA 1.32049554308815 matches/cDNA
+Matches per scaffold 742.654092657022 matches/genomic
+
+LOW-QUALITY:
+cDNA-genomic matches 7664890 matches (1263273 different cDNA and 6054 genomic)
+Matches per cDNA 6.06748501709448 matches/cDNA
+Matches per scaffold 1266.08688470433 matches/genomic
+
+cDNA COUNTS:
+cDNA: 3992939
+cDNA-good: 3060666
+cDNA-goodshort: 10631
+cDNA-lowquality: 433295
+cDNA-missing: 440037
+cDNA-zero: 48310
+\end{verbatim}
+\normalsize
+
+cDNA sequences are classified into one of six categories: {\tt good},
+{\tt goodshort}, {\tt lowquality}, {\tt missing}, {\tt repeat}, or
+{\tt zero}. Each cDNA in the input is in exactly one of the
+categories.
+
+\begin{tabular}{|l|p{4.3in}|}
+\hline
+category & description \\
+\hline
+\hline
+good & A match exists that meets both query-sequence identity and alignment-sequence identity requirements. \\
+goodshort & A match exists that meets both query-sequence identity and alignment-sequence identity requirements, for ``short''. \\
+lowquality & A match exists, but it does not meet at least one quality requirement. \\
+missing & A signal was detected, but no match was produced. \\
+repeat & Many signals were detected, and the filter stage declared this cDNA to be repeat-containing. \\
+zero & No signals were detected for this cDNA. \\
+\hline
+\end{tabular}
+
+Likewise, matches generated by the polishing stage are placed into
+three categories: {\tt good}, {\tt goodshort} or {\tt lowquality}.
+This is done match by match, so it is possible to have matches for a
+specific cDNA sequence in all three categories.
+
+For each category, the {\tt summary} file counts the number of matches
+it contains, the number of distinct cDNA / genomic sequences used by
+those matches, and the matches per cDNA or genomic.
+
+\subsection{Polished Match Format}
+\label{subsec:matchformat}
+
+The files {\tt polishes-good}, {\tt polishes-goodshort}, and {\tt
+polishes-lowquality} contain the results of the polishing stage. All
+matches are placed in the same file. Each match starts with the line
+{\tt sim4begin}, and ends with the line {\tt sim4end}. Matches have
+the following format:
+
+\begin{tabular}{c|l}
+1 & {\tt sim4begin} \\
+2 & {\it cDNAidx}{\tt [}{\it cDNAlen}{\tt -}{\it pA}{\tt -}{\it pT}{\tt ]} {\it GENidx}{\tt [}{\it GENlo}{\tt -}{\it GENhi}{\tt ]} {\tt <}{\it M}{\tt -}{\it N}{\tt -}{\it P}{\tt -}{\it O}{\tt -}{\it S}{\tt >} \\
+3 & {\tt edef=}{\it cDNA defline} \\
+4 & {\tt ddef=}{\it Genomic defline} \\
+5 & {\it cDNAbgn}{\tt -}{\it cDNAend} {\tt (}{\it GENbgn}{\tt -}{\it GENend}{\tt )} {\tt <}{\it M}{\tt -}{\it N}{\tt -}{\it P}{\tt >} {\it intronOrientation} \\
+ & . \\
+ & . \\
+6 & {\it cDNAbgn}{\tt -}{\it cDNAend} {\tt (}{\it GENbgn}{\tt -}{\it GENend}{\tt )} {\tt <}{\it M}{\tt -}{\it N}{\tt -}{\it P}{\tt >} {\it intronOrientation} \\
+7 & {\it cDNAbgn}{\tt -}{\it cDNAend} {\tt (}{\it GENbgn}{\tt -}{\it GENend}{\tt )} {\tt <}{\it M}{\tt -}{\it N}{\tt -}{\it P}{\tt >} \\
+8 & {\it cDNA alignment sequence for exon \#1} \\
+9 & {\it genomic alignment sequence for exon \#1} \\
+ & . \\
+ & . \\
+ & . \\
+ & . \\
+10 & {\it cDNA alignment sequence for exon \#n} \\
+11 & {\it genomic alignment sequence for exon \#n} \\
+12 & {\tt sim4end}
+\end{tabular}
+
+Line 1 begins the match description.
+
+Line 2 contains the match description line. The fields have the following meanings:
+
+\begin{tabular}{|l|l|}
+\hline
+Field & Description \\
+\hline
+\hline
+cDNAidx & Internal index of the cDNA sequence used. \\
+cDNAlen & Length of the cDNA sequence. \\
+pA & Amount of poly-A masking performed. \\
+pT & Amount of poly-T masking performed. \\
+GENidx & Internal index of the genomic sequence used. \\
+GENlo & Beginning position of the genomic region that was polished. \\
+GENhi & Ending position of the genomic region that was polished. \\
+M & Number of matching bases in the match. \\
+N & Number of matching N's in the match. \\
+P & Percent sequence identity of the match. \\
+O & Orientation of the match. \\
+S & Strand this match is predicted to occur on. \\
+\hline
+\end{tabular}
+
+%{\tt M} and {\tt N} are the number of matches, and the number of
+%non-ACGT matches, respectively. {\tt P} is the percent sequence
+%similarity for this exon.
+
+The {\it match orientation} is {\tt forward} when the cDNA sequence
+aligns to the genomic sequence directly. It is {\tt complement} when
+the reverse-complement of the cDNA sequence matches the genomic
+sequence. These are the only two values possible.
+
+The {\it strand prediction} is either {\tt forward}, {\tt
+reverse} or {\tt unknown}. It is \ESTmapper's best guess which strand
+the cDNA is on, based on the quality of the match and the intron
+signals.
+
+Lines 3 and 4 contain the entire defline for the two sequences. These lines are
+optional.
+
+Lines 5, 6 and 7 are the {\tt Sim4} exon lines. There will be one line
+for each exon found. The fields have the following meanings:
+
+\begin{tabular}{|l|l|}
+\hline
+Field & Description \\
+\hline
+\hline
+cDNAbgn & Beginning of the exon, in the cDNA sequence. \\
+cDNAend & End of the exon, in the cDNA sequence. \\
+GENbgn & Beginning of the exon, in the genomic sequence. \\
+GENend & End of the exon, in the genomic sequence. \\
+M & Number of matching bases in the exon. \\
+N & Number of matching N's in the exon. \\
+P & Percent identity of the exon. \\
+intronOrientation & Predicted orientation of the intron. \\
+\hline
+\end{tabular}
+
+Coordinates in the exon are nucleotide-based. Coordinates in the
+genomic sequence are relative to the {\tt GENlo} value from the match
+description line. The true location of the exon in the genomic sequence is
+{\tt GENlo + GENbgn} and {\tt GENlo + GENend}.
+
+The {\it intron orientation} is one of {\tt ->}, {\tt <-}, {\tt --},
+or {\tt ==}, representing forward, reverse, ambiguous, and
+internal gap in cDNA, respectively. All exons, except the last,
+contain the intron orientation field.
+
+When requested, pairwise alignments between the cDNA sequence and the
+genomic sequence within each exon follow the exon level descriptions
+(see Section~\ref{sec:polish}). In the alignments, dashes ({\tt -})
+are used to represent insertion/deletion. Lower-case characters
+represent a match, while upper-case characters represent a non-match.
+
+Line 12 closes the match description.
+
+\subsection{Examples}
+
+A few examples of match output are shown. {\bf should explain the
+examples more; picture?}
+
+{\bf Example 1:} A minimal match description. The deflines and
+alignments are not present.
+
+\footnotesize
+\begin{verbatim}
+sim4begin
+54[484-0-0] 0[0-590724] <477-0-98-forward-forward>
+1-96 (454213-454308) <92-0-94> ->
+97-266 (455410-455579) <170-0-100> ->
+267-377 (458098-458208) <111-0-100> ->
+378-465 (458297-458384) <88-0-100> ->
+466-484 (514282-514297) <16-0-84>
+sim4end
+\end{verbatim}
+\normalsize
+
+{\bf Example 2}: This is the same match as above, but was generated by
+limiting the genomic sequence to the range 430000 through 520000
+
+\footnotesize
+\begin{verbatim}
+sim4begin
+54[484-0-0] 0[430000-520000] <477-0-98-forward-forward>
+1-96 (24213-24308) <92-0-94> ->
+97-266 (25410-25579) <170-0-100> ->
+267-377 (28098-28208) <111-0-100> ->
+378-465 (28297-28384) <88-0-100> ->
+466-484 (84282-84297) <16-0-84>
+sim4end
+\end{verbatim}
+\normalsize
+
+{\bf Example 3:} A full match description. Deflines have been trimmed
+to fit on the page, and alignment lines are wrapped.
+
+\footnotesize
+\begin{verbatim}
+sim4begin
+618[453-0-26] 482[450000-460000] <425-0-99-complement-forward>
+edef=>CRA|70647962 /altid=gi|6798356 /dataset=dbest /taxon=9606 ...
+ddef=>CRA|GA_x2HTBKM80FY:1..590724 /organism=Homo sapiens ...
+1-71 (4238-4308) <71-0-100> ->
+72-241 (5410-5579) <170-0-100> ->
+242-352 (8098-8208) <110-0-99> ->
+353-427 (8297-8371) <74-0-98>
+tcatgaaacctgggaaggtggtgcttgtcctggctggacgctactccggacgcaaagctgtcatcgtgaag
+tcatgaaacctgggaaggtggtgcttgtcctggctggacgctactccggacgcaaagctgtcatcgtgaag
+aacattgatgatggcacctcagatcgcccctacagccatgctctggtggctggaattgaccgctacccccgcaaa \
+ gtgacagctgccatgggcaagaagaagatcgccaagagatcaaagataaaatcttttgtgaaagtgtataact \
+ acaatcacctaatgcccacaag
+aacattgatgatggcacctcagatcgcccctacagccatgctctggtggctggaattgaccgctacccccgcaaa \
+ gtgacagctgccatgggcaagaagaagatcgccaagagatcaaagataaaatcttttgtgaaagtgtataact \
+ acaatcacctaatgcccacaag
+gtactctgtggatatccccttggacaaaactgtcgtcaataaggatgtcttcaNagatcctgctcttaaacgcaa \
+ ggcccgacgggaggccaaggtcaagtttgaagagag
+gtactctgtggatatccccttggacaaaactgtcgtcaataaggatgtcttcaGagatcctgctcttaaacgcaa \
+ ggcccgacgggaggccaaggtcaagtttgaagagag
+atacaagacaggcaagaacaagtggttcttccagaaactgcggttttagatgctttgttttgaNcattaaaaatt
+atacaagacaggcaagaacaagtggttcttccagaaactgcggttttagatgctttgttttgaTcattaaaaatt
+sim4end
+\end{verbatim}
+\normalsize
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Basic Usage}
+\label{chap:basic}
+
+Using the \ESTmapper\ in automatic mapping mode is the simplest mode of operation.
+
+To map ESTs to a genome:
+
+{\tt ESTmapper.pl -mapest} {\it work-directory} {\it ests.fasta} {\it genomic.fasta}
+
+Options implicit in {\tt -mapest}:
+
+\begin{tabular}{|ll|}
+\hline
+{\tt -directory} & {\it work-directory} \\
+{\tt -cdna} & {\it ests.fasta} \\
+{\tt -genomic} & {\it genomic.fasta} \\
+{\tt -mersize} & 20 \\
+{\tt -maskmers} & {\it install-directory}{\tt /data/frequentMers-C4-20.fasta} \\
+{\tt -mincoverage} & 45 \\
+{\tt -minidentity} & 85 \\
+{\tt -local} & 4 \\
+{\tt -good} & 50 95 \\
+{\tt -goodshort} & 0 95 \\
+\hline
+\end{tabular}
+
+To map mRNA to a genome:
+
+{\tt ESTmapper.pl -mapmrna} {\it work-directory} {\it mrna.fasta} {\it genomic.fasta}
+
+\begin{tabular}{|ll|}
+\hline
+{\tt -directory} & {\it work-directory} \\
+{\tt -cdna} & {\it mrna.fasta} \\
+{\tt -genomic} & {\it genomic.fasta} \\
+{\tt -mersize} & 20 \\
+{\tt -maskmers} & {\it install-directory}{\tt /data/frequentMers-C4-20.fasta} \\
+{\tt -mincoverage} & 45 \\
+{\tt -minidentity} & 85 \\
+{\tt -relink} & 1000 \\
+{\tt -abort} & \\
+{\tt -local} & 4 \\
+{\tt -good} & 50 95 \\
+{\tt -goodshort} & 0 95 \\
+\hline
+\end{tabular}
+
+Be sure that the multi-FastA files are stored on a disk local to the
+machine --- it will work if the sequences are accessed over NFS, but
+performance might suffer.
+
+The automatic mapping modes may be customized by using the options
+listed in Section~\ref{sec:adv}. For example, {\tt ESTmapper.pl
+-mapest} {\it work-directory} {\it ests.fasta} {\it genomic.fasta}
+{\tt -numcpus 2} {\tt -memory 2000} will {\bf probably -- should test}
+allow the \ESTmapper\ to run on a two processor machine with 2GB of
+RAM.
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapter{Advanced Usage}
+\label{sec:adv}
+
+\ESTmapper\ recognizes seven commands, and has one required argument. The general
+usage is:
+
+{\tt ESTmapper.pl {\it command} -directory} {\it work-directory} [{\it options}]
+
+The seven commands:
+
+\begin{tabular}{ll}
+-configure &-- prepare the genomic sequences for searching \\
+-searchest &-- perform signal finding on EST sequences \\
+-searchmrna &-- perform signal finding on mRNA sequences \\
+-filterest &-- filter EST signals \\
+-filtermrna &-- filter mRNA signals \\
+-polish &-- polish filtered signals \\
+-assembleoutput &-- prepare the output \\
+\end{tabular}
+
+The one required argument:
+
+\begin{tabular}{lp{3.0in}}
+-directory {\it /full/path/to/work/directory} &
+The \ESTmapper\ will use the supplied directory as its work directory.
+This option must be present for all steps.
+\end{tabular}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{configure}
+\label{sec:configure}
+
+This step creates the work directory, and prepares the genomic sequence
+for mapping.
+
+\begin{tabular}{lp{3.0in}}
+
+-configure &
+Instruct the \ESTmapper\ to perform the {\tt configure} command. \\
+
+-genomic {\it g.fasta} &
+The sequences in {\tt g.fasta} will be used as the genomic sequence.
+The file is a multi-FastA format, and all sequences are used. There
+are no special requirements for the format of the defline, nor are
+there limits on the length or number of sequences.\\
+
+-memory {\it n} &
+The sequences in {\tt g.fasta} will be partitioned into sets so that
+the search phase will use no more than {\it n} MB of memory per process.
+
+Any sequences in {\tt g.fasta} that are larger than the partition
+size, are placed into a set containing one sequence. A warning is
+printed for such sequences.
+
+{\bf memory usage computation is not rigorously tested; it works for 4000}
+
+The memory usage is approximately 10 bytes per base of genomic sequence.
+
+\end{tabular}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{searchest and searchmrna}
+\label{sec:search}
+
+This step performs the search.
+
+\begin{tabular}{lp{3.0in}}
+
+-searchest {\it label} &
+Instruct the \ESTmapper\ to perform the {\tt search} stage, using
+parameters appropriate for EST sequences. The {\tt label} marks this
+run; it is possible to search multiple sets of sequences against the
+same genomic database without multiple {\tt configure} steps. The
+benefit of this is marginal.
+\\
+-searchmrna {\it label} &
+Instruct the \ESTmapper\ to perform the {\tt search} stage, using
+parameters appropriate for mRNA sequences. See {\tt searchest} for
+discussion of {\tt label}.
+\\
+-cdna {\it c.fasta} &
+The sequences in {\tt c.fasta} will be searched. The file is a
+multi-FastA format, and all sequences are used. There are no
+special requirements for the format of a defline, nor are there limits
+on the length or number of sequences.
+
+Attempting to map sequences that are not of the specified type (mapping
+mRNA with EST parameters; mapping non-coding genomic sequence with
+{\em any} parameters) is not advised.
+
+The sequences {\em MUST NOT} be repeat masked.
+\\
+-mersize {\it m} &
+Instructs the search to use $m$ for the size of the exact-match
+blocks. A value of $m=20$ seems to be optimal; larger values use more
+memory, run faster, and are less sensitive. Smaller values use less
+memory, run slower and result in fewer signals due to spurious
+matches.
+
+Note that in automatic mapping mode, changing the mersize without
+explicitly specifying the maskmers file is an error.
+\\
+-maskmers {\it m.fasta} &
+The sequences in {\tt m.fasta} are used to build a list of mers that
+will be discarded from any matches. While this is generally
+considered to be a ``Poor Man's RepeatMasker'', the \ESTmapper\
+achieves better performance and sensitivity using this strategy than
+with full-blown repeat masking.
+
+The sequences are usually in the {\tt data} subdirectory of the
+\ESTmapper\ installation.
+
+\end{tabular}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{filterest and filtermrna}
+\label{sec:filter}
+
+This step filters the signals detected in the search phase, discarding
+weak signals and keeping strong signals.
+
+There are no user-tunable parameters at the present time.
+
+\begin{tabular}{lp{3.0in}}
+
+-filterest label &
+Instruct the \ESTmapper\ to perform the {\tt filter} stage, using
+parameters appropriate for EST sequences. The {\tt label} must
+be the same as used in the search stage.
+\\
+-filtermrna label &
+Instruct the \ESTmapper\ to perform the {\tt filter} stage, using
+parameters appropriate for mRNA sequences. The {\tt label} must
+be the same as used in the search stage.
+
+\end{tabular}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{polish}
+\label{sec:polish}
+
+This stage prepares, and optionally performs, the polishing of the
+signals detected.
+
+\begin{tabular}{lp{3.0in}}
+
+-polish {\it label} &
+Instruct the \ESTmapper\ to perform the {\tt polish} stage. The {\tt
+label} must be the same as used in the search stage.
+\\
+-mincoverage {\it mc} &
+Polishing is performed so that all results have a {\it percent query-sequence
+identity} of at least {\tt mc}. See the discussion about quality below.
+\\
+-minidentity {\it mi} &
+Polishing is performed so that all results have a {\it
+percent alignment-sequence identity} of at least {\tt mi}. See the discussion about quality below.
+\\
+-alwaysprint {\it ap} &
+The polisher will always print at least the top $ap$ matches, no
+matter what their quality. This can be used to find both full-length
+matches and the top scoring matches for signals without full-length
+matches. See the discussion about quality below.
+\\
+-relink {\it r} &
+Sets the {\it Sim4} relink weight. {\bf This is used only for mRNA, and Liliana should probably explain
+what it does.}
+\\
+-batchsize {\it w} &
+Signals are processed in batches of size $w$. See the discussion about execution below.
+\\
+-numbatches {\it b} &
+Signals are processed in $b$ batches. See the discussion about execution below.
+\\
+-farm {\it queue} {\it projectname} &
+Runs the polishes on the Celera internal compute farm. The {\tt projectname}
+should be the resource code to use, e.g. {\tt 00006:MRNA:L}. See the
+discussion about execution below.
+\\
+-local {\it numprocessors} &
+Runs the polishes on the local machine, using $numprocessors$ concurrent
+processes. See the discussion about execution below.
+\\
+-runlater &
+The \ESTmapper\ will generate all the script files needed to perform
+the polishing step, but will not perform any computation.
+\\
+-aligns &
+Instruct {\tt Sim4} to also include the alignment lines. This will
+make your output files VERY large.
+\\
+-abort &
+Instruct {\tt Sim4} to abort polishing any matches with an unusually large number
+of MSPs. The aborted matches are saved for later examination. {\bf Need to explain the output format for this!}
+
+\end{tabular}
+
+\subsection{Polishing Quality}
+\label{sec:quality}
+
+In the absence of {\tt -minidentity} and {\tt -mincoverage} and {\tt
+-alwaysprint} only the best match is found for each signal.
+
+The result of specifying exactly one of {\tt -minidentity} and {\tt
+-mincoverage} is undefined. Always specify both, even if one is $0$.
+
+Specifying {\tt -alwaysprint} without either {\tt -minidentity} or
+{\tt -mincoverage} will print the best $ap$ matches.
+
+The effect of specifying values {\tt -minidentity} and {\tt
+-mincoverage} and {\tt -alwaysprint} is almost the same as asking for
+``the good matches, unless none are found, then only the best''. This
+overcomes the problem {\bf XXX: what problem?} encountered when attempting to map mRNA
+sequences at very low percent query-sequence identities. mRNA that have
+excellent full-length matches can also have many inferior paralogous
+(partial-)matches. If {\tt -mincoverage 10} is specified, all
+paralogous matches would be output, in addition to the true
+match. On the other hand, for mRNA without full-length matches, we
+still want to find partial matches, even if they have a query-sequence
+identity of, say, 20\%.
+
+Polishing for a single signal terminates when any of three conditions is met:
+
+\begin{tabular}{l}
+$coverage < mc$ \\
+$identity < mi$ \\
+$printed > ap$
+\end{tabular}
+
+\subsection{Execution}
+
+The output from the filter step is a large list of signals (4
+million human ESTs produce about 60 million filtered signals).
+To run these efficiently, they are divided into batches. If {\tt
+-batchsize} is specified, then each batch will contain exactly $w$
+signals (except for the last batch), otherwise, {\tt -numbatches b}
+batches are formed.
+
+If neither of {\tt -batchsize} and {\tt -numbatches} is specified,
+then the signals are divided into 256 or fewer batches. A batch will
+always contain at least 500 signals.
+
+These batches can be processed on the local hardware ({\tt -local}
+option) or on the Celera internal compute farm ({\tt -farm} option).
+If they are processed locally, then $p$ {\tt Sim4} processes will run
+concurrently (regardless of the number of CPUs actually available ---
+yes, it is possible to do {\tt -local 256}). It is optimal to run
+exactly one {\tt Sim4} process per available processor. If they are
+processed on the farm, they are submitted to the specified queue /
+project name. The \ESTmapper\ will terminate immediately after the
+jobs are submitted; when they have finished, simply rerun the script
+to continue.
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{assembleoutput}
+\label{sec:assembleoutput}
+
+This stage collects the output from the polishing stage, and classifies the
+matches into quality groups.
+
+\begin{tabular}{lp{3.0in}}
+
+-assembleoutput {\it label} &
+Instruct the \ESTmapper\ to assemble the output. The {\tt
+label} must be the same as used in the search stage.
+\\
+-good {\it mi} &
+Labels matches with $mi$ or better percent alignment-sequence identity as ``good''.
+\\
+-short {\it mc} &
+Labels matches with less than $mc$ percent query-sequence identity as ``short''.
+
+\end{tabular}
+
+{\bf Note about the filter values being at least +5 the polish values here....}
+
+\end{document}
diff --git a/ESTmapper LaTeX/blurb b/ESTmapper LaTeX/blurb
new file mode 100644
index 0000000..455ef97
--- /dev/null
+++ b/ESTmapper LaTeX/blurb
@@ -0,0 +1,32 @@
+The ESTMapper is a software package designed to efficiently map large EST data
+sets to a target genome. For each cDNA (EST or full-length mRNA) sequence
+in the input set, it will determine a set of instances of the EST in the
+target genome in a three-stage process. Stage I, 'signal finding', is an
+efficient similarity search which identifies potential EST-containing regions
+in the reference genome. In Stage II, 'signal filtering', regions containing
+weak signals are removed based on the extent of the cDNA matched and the
+number of regions. Stage III, 'signal polishing' uses an
+enhanced version of Sim4 to produce spliced alignments between the
+query EST sequence and each of the remaining genomic regions.
+
+Features
+[Input]
+. Simple interface and input presentation, as multi-fasta files.
+. Requires no pre-processing of sequences (typically, vector and quality trimming, contaminant screening, assigning quality values, repeat masking).
+
+[Output]
+. Output formatted as flat files, and XML-feature files, which can be viewed using Celera's Genome Browser.
+. Output filtered by quality (the three? files; also, flexible parameters).
+
+[Implementation]
+. Memory and space efficient (e.g., ).
+. Search uses an efficient . Polishing stage improved for efficiency.
+. Parallel operation to take advantage of multi-processor environments and for better I/O management.
+
+[Algorithmics]
+. Search - uses a proprietary fast near-identity search program.
+. Search + filtering offer high sensitivity at relatively low computational cost.
+. Differential filtering for mRNA and EST sequences takes full advantage of their mapping characteristics to reduce the computational cost for polishing false positives.
+. Efficient screening for repetitive elements.
+. Sim4db - iterative procedure allows detection of multiple occurrences. Improvements for memory efficiency, I/O.
+. No segmentation of the sequences is necessary (e.g., use whole chromosomes), hence matches are not pruned to fit in fixed size intervals (allows arbitrarily long introns).
diff --git a/ESTmapper LaTeX/filter.eps b/ESTmapper LaTeX/filter.eps
new file mode 100644
index 0000000..5bc372b
--- /dev/null
+++ b/ESTmapper LaTeX/filter.eps
@@ -0,0 +1,242 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: filter.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 0-beta3
+%%CreationDate: Fri Oct 26 16:16:59 2001
+%%For: walenz at dsc154p.celera.com (Brian Walenz,3604)
+%%Orientation: Portrait
+%%BoundingBox: 0 0 299 250
+%%Pages: 0
+%%BeginSetup
+%%EndSetup
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+-71.0 307.0 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+%%EndProlog
+
+$F2psBegin
+10 setmiterlimit
+n -1000 6112 m -1000 -1000 l 7162 -1000 l 7162 6112 l cp clip
+ 0.06000 0.06000 sc
+% Arc
+7.500 slw
+gs n 2362.5 1612.5 265.2 -81.9 171.9 arcn
+gs col0 s gr
+ gr
+
+% Polyline
+n 1950 4200 m 2250 3900 l gs col0 s gr
+% Polyline
+n 2325 3825 m 2625 3525 l gs col0 s gr
+% Polyline
+n 2700 3375 m 2925 3150 l gs col0 s gr
+% Polyline
+n 3900 2250 m 4200 1950 l gs col0 s gr
+% Polyline
+n 3525 2550 m 3825 2250 l gs col0 s gr
+% Polyline
+n 3150 3000 m 3450 2700 l gs col0 s gr
+% Polyline
+gs clippath
+5292 1041 m 5380 994 l 5334 1083 l 5432 986 l 5389 943 l cp
+1383 5034 m 1294 5080 l 1341 4992 l 1243 5089 l 1286 5132 l cp
+clip
+n 1275 5100 m 5400 975 l gs col0 s gr gr
+
+% arrowhead
+n 1383 5034 m 1294 5080 l 1341 4992 l 1379 4996 l 1383 5034 l cp gs 0.00 setgray ef gr col0 s
+% arrowhead
+n 5292 1041 m 5380 994 l 5334 1083 l 5296 1079 l 5292 1041 l cp gs 0.00 setgray ef gr col0 s
+% Polyline
+n 1575 4800 m 1425 4800 l 5025 1200 l 5175 1200 l cp gs 0.75 setgray ef gr gs col0 s gr
+% Polyline
+n 1725 4575 m 2025 4275 l gs col0 s gr
+% Polyline
+n 4125 2250 m 5550 2250 l 5550 1200 l 5175 1200 l cp gs 0.90 setgray ef gr gs col0 s gr
+% Polyline
+n 1575 4800 m 2250 4125 l 2250 4800 l cp gs 0.90 setgray ef gr gs col0 s gr
+% Polyline
+n 2250 4800 m 2250 4125 l 4125 2250 l 5550 2250 l 5550 4800 l cp gs 0.50 setgray ef gr gs col7 s gr
+% Polyline
+n 3375 3600 m 3675 3300 l gs col0 s gr
+% Polyline
+ [60] 0 sd
+n 1950 1950 m 4200 1950 l 4200 4200 l 1950 4200 l cp gs col0 s gr [] 0 sd
+% Polyline
+n 1200 1200 m 1200 4800 l 5550 4800 l gs col0 s gr
+% Polyline
+ [60] 0 sd
+n 5850 4800 m 5550 4800 l gs col0 s gr [] 0 sd
+% Arc
+gs n 1612.5 2362.5 265.2 -81.9 171.9 arcn
+gs col0 s gr
+ gr
+
+% Polyline
+ [15 60] 60 sd
+n 6150 4800 m 5850 4800 l gs col0 s gr [] 0 sd
+/Times-Roman ff 180.00 scf sf
+3300 1500 m
+gs 1 -1 sc (diagonal difference) col0 sh gr
+% Polyline
+gs clippath
+6027 2220 m 6123 2250 l 6027 2280 l 6165 2280 l 6165 2220 l cp
+1623 2280 m 1527 2250 l 1623 2220 l 1485 2220 l 1485 2280 l cp
+clip
+n 1500 2250 m 6150 2250 l gs col0 s gr gr
+
+% arrowhead
+n 1623 2280 m 1527 2250 l 1623 2220 l 1647 2250 l 1623 2280 l cp gs 0.00 setgray ef gr col0 s
+% arrowhead
+n 6027 2220 m 6123 2250 l 6027 2280 l 6003 2250 l 6027 2220 l cp gs 0.00 setgray ef gr col0 s
+% Polyline
+n 5325 2100 m 5625 1800 l gs col0 s gr
+% Polyline
+gs clippath
+2280 4977 m 2250 5073 l 2220 4977 l 2220 5115 l 2280 5115 l cp
+2220 1623 m 2250 1527 l 2280 1623 l 2280 1485 l 2220 1485 l cp
+clip
+n 2250 1500 m 2250 5100 l gs col0 s gr gr
+
+% arrowhead
+n 2220 1623 m 2250 1527 l 2280 1623 l 2250 1647 l 2220 1623 l cp gs 0.00 setgray ef gr col0 s
+% arrowhead
+n 2280 4977 m 2250 5073 l 2220 4977 l 2250 4953 l 2280 4977 l cp gs 0.00 setgray ef gr col0 s
+% Polyline
+n 1950 1800 m 1950 1500 l gs col0 s gr
+% Polyline
+n 1800 1950 m 1500 1950 l gs col0 s gr
+% Polyline
+gs clippath
+2163 1635 m 2223 1650 l 2163 1665 l 2265 1665 l 2265 1635 l cp
+2037 1665 m 1977 1650 l 2037 1635 l 1935 1635 l 1935 1665 l cp
+clip
+n 1950 1650 m 2250 1650 l gs col0 s gr gr
+
+% arrowhead
+n 2037 1665 m 1977 1650 l 2037 1635 l col0 s
+% arrowhead
+n 2163 1635 m 2223 1650 l 2163 1665 l col0 s
+% Polyline
+gs clippath
+1665 2163 m 1650 2223 l 1635 2163 l 1635 2265 l 1665 2265 l cp
+1635 2037 m 1650 1977 l 1665 2037 l 1665 1935 l 1635 1935 l cp
+clip
+n 1650 1950 m 1650 2250 l gs col0 s gr gr
+
+% arrowhead
+n 1635 2037 m 1650 1977 l 1665 2037 l col0 s
+% arrowhead
+n 1665 2163 m 1650 2223 l 1635 2163 l col0 s
+% Polyline
+gs clippath
+4749 1353 m 4780 1405 l 4728 1374 l 4800 1446 l 4821 1425 l cp
+clip
+n 4800 1425 m 4650 1275 l gs col0 s gr gr
+
+% arrowhead
+n 4749 1353 m 4780 1405 l 4728 1374 l col0 s
+% Polyline
+gs clippath
+4926 1572 m 4894 1519 l 4947 1551 l 4875 1479 l 4854 1500 l cp
+clip
+n 4875 1500 m 5025 1650 l gs col0 s gr gr
+
+% arrowhead
+n 4926 1572 m 4894 1519 l 4947 1551 l col0 s
+/Times-Roman ff 180.00 scf sf
+1500 4575 m
+gs 1 -1 sc (A) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3225 3600 m
+gs 1 -1 sc (B) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5175 2100 m
+gs 1 -1 sc (C) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1275 2700 m
+gs 1 -1 sc (overlap) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1275 2550 m
+gs 1 -1 sc (genomic) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2400 1500 m
+gs 1 -1 sc (overlap) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2400 1350 m
+gs 1 -1 sc (query) col0 sh gr
+$F2psEnd
+rs
diff --git a/ESTmapper LaTeX/filter.fig b/ESTmapper LaTeX/filter.fig
new file mode 100644
index 0000000..2b32ae7
--- /dev/null
+++ b/ESTmapper LaTeX/filter.fig
@@ -0,0 +1,83 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter
+100.00
+Single
+-2
+1200 2
+5 1 0 1 0 7 100 0 -1 4.000 0 1 0 0 1612.500 2362.500 1650 2100 1425 2175 1350 2400
+5 1 0 1 0 7 100 0 -1 4.000 0 1 0 0 2362.500 1612.500 2400 1350 2175 1425 2100 1650
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 1950 4200 2250 3900
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 2325 3825 2625 3525
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 2700 3375 2925 3150
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 3900 2250 4200 1950
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 3525 2550 3825 2250
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 3150 3000 3450 2700
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 1 2
+ 3 1 1.00 60.00 120.00
+ 3 1 1.00 60.00 120.00
+ 1275 5100 5400 975
+2 3 0 1 0 0 100 0 5 0.000 0 0 -1 0 0 5
+ 1575 4800 1425 4800 5025 1200 5175 1200 1575 4800
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 1725 4575 2025 4275
+2 3 0 1 0 0 100 0 2 0.000 0 0 -1 0 0 5
+ 4125 2250 5550 2250 5550 1200 5175 1200 4125 2250
+2 3 0 1 0 0 100 0 2 0.000 0 0 -1 0 0 4
+ 1575 4800 2250 4125 2250 4800 1575 4800
+2 3 0 1 7 0 100 0 10 0.000 0 0 -1 0 0 6
+ 2250 4800 2250 4125 4125 2250 5550 2250 5550 4800 2250 4800
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 3375 3600 3675 3300
+2 2 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 5
+ 1950 1950 4200 1950 4200 4200 1950 4200 1950 1950
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 3
+ 1200 1200 1200 4800 5550 4800
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 5850 4800 5550 4800
+2 1 2 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 6150 4800 5850 4800
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 1 2
+ 3 1 1.00 60.00 120.00
+ 3 1 1.00 60.00 120.00
+ 1500 2250 6150 2250
+2 1 0 1 0 7 100 0 -1 3.000 0 0 -1 0 0 2
+ 5325 2100 5625 1800
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 1 2
+ 3 1 1.00 60.00 120.00
+ 3 1 1.00 60.00 120.00
+ 2250 1500 2250 5100
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 1950 1800 1950 1500
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 1800 1950 1500 1950
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 30.00 60.00
+ 0 0 1.00 30.00 60.00
+ 1950 1650 2250 1650
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 1 2
+ 0 0 1.00 30.00 60.00
+ 0 0 1.00 30.00 60.00
+ 1650 1950 1650 2250
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 0 1 2
+ 0 0 1.00 30.00 60.00
+ 4800 1425 4650 1275
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 0 1 2
+ 0 0 1.00 30.00 60.00
+ 4875 1500 5025 1650
+4 0 0 100 0 0 12 0.0000 0 105 120 1500 4575 A\001
+4 0 0 100 0 0 12 0.0000 0 105 90 3225 3600 B\001
+4 0 0 100 0 0 12 0.0000 0 105 105 5175 2100 C\001
+4 0 0 100 0 0 12 0.0000 0 150 465 1275 2700 overlap\001
+4 0 0 100 0 0 12 0.0000 0 135 510 1275 2550 genomic\001
+4 0 0 100 0 0 12 0.0000 0 150 465 2400 1500 overlap\001
+4 0 0 100 0 0 12 0.0000 0 120 345 2400 1350 query\001
+4 0 0 100 0 0 12 0.0000 0 135 1170 3300 1500 diagonal difference\001
diff --git a/ESTmapper LaTeX/hash-tables.tex b/ESTmapper LaTeX/hash-tables.tex
new file mode 100644
index 0000000..1dacea9
--- /dev/null
+++ b/ESTmapper LaTeX/hash-tables.tex
@@ -0,0 +1,200 @@
+\documentclass[twoside, twocolumn, 10pt]{article}
+\usepackage{amsmath,amssymb}
+\usepackage{moreverb}
+\usepackage{fancyheadings}
+\usepackage{ulem}
+\usepackage{parskip}
+\usepackage{calc,ifthen,epsfig}
+\sloppy
+
+% A few float parameters
+%
+\renewcommand{\dbltopfraction}{0.9}
+\renewcommand{\dblfloatpagefraction}{0.9}
+%\renewcommand{\textfraction}{0.05}
+
+
+\begin{document}
+
+% See page 63-64, LaTeX Companion
+%
+% leftmargin controls the left margin for EVERYTHING in the list!
+%
+\newcommand{\entrylabel}[1]{\mbox{\texttt{#1:}}\hfil}
+\newenvironment{entry}
+ {\begin{list}{}%
+ {\renewcommand{\makelabel}{\entrylabel}%
+ %\setlength{\leftmargin}{1.5in}%
+ }}
+{\end{list}}
+
+% The first parbox width controls the indent on the first text line
+% The makebox width seems to do nothing.
+\newcommand{\Lentrylabel}[1]{%
+ {\parbox[b]{0pt}{\makebox[0pt][l]{\texttt{#1:}}\\}}\hfil\relax}
+\newenvironment{Lentry}
+ {\renewcommand{\entrylabel}{\Lentrylabel}\begin{entry}}
+ {\end{entry}}
+
+\title{ESTmapper documentation}
+\author{
+Liliana Florea\thanks{liliana.florea at celera.com},
+Brian P. Walenz\thanks{brian.walenz at celera.com}}
+
+\maketitle
+
+\pagestyle{fancy}
+
+\rhead[]{}
+\chead[ESTmapper]{ESTmapper}
+\lhead[\today]{\today}
+
+\normalem
+
+\newcommand{\ESTmapper}{{\sc ESTmapper\ }}
+
+\begin{abstract}
+The gory details of the \ESTmapper process are described.
+\end{abstract}
+
+
+
+
+
+\subsection{Hash Function Definitions}
+
+In the discussion that follows, let $A$ be an encoded mer, $H$ be the
+hashed value of the mer, and $C$ be the check value. $A$ is $m$ bits
+wide, $H$ is $h$ bits wide and $C$ is $m-h$ bits wide.
+
+Our hash and check functions must satisfy the following properties:
+\begin{align*}
+f_H &: m \rightarrow h \\
+f_C &: m \rightarrow c \\
+f_R &: h \times c \rightarrow m
+\end{align*}
+such that $f_R(f_H(A), f_C(A)) = A$.
+
+Furthermore, $f_H$ should be a good hash function, {\bf whatever that means}.
+The functions are explained in Section~\ref{sec:hashfcn}.
+
+\subsection{existDB}
+
+The {\tt existDB} will tell us if a mer exists in a sequence.
+
+We can build the structure in Figure~\ref{fig:hashstruct} in five
+steps, using $\theta(2 \cdot 2^h + 2 \cdot n)$ time and no temporary
+space.
+
+\begin{enumerate}
+\item
+Allocate and zero $2^h$ integers for the hash table.
+\item
+Count the size of each bucket: hash each mer, increment the size of
+that bucket. This can be done using the space for the hash table.
+Also count the number of mers.
+\item
+Allocate $n$ bucket entries, one for each mer. There is no need to
+initialize these.
+\item
+Make the hash table entry $i$ point to the start of bucket $i$. Note
+that the hash table entry $i+1$ can be used to find the end of bucket
+$i$.
+\item
+Rehash each mer, inserting the check value into the next available
+bucket entry (use the hash table to keep track of the next available
+entry). The buckets contain all the mers after this step, and the
+hash table is off by one -- entry $i$ points to the start of bucket
+$i+1$. If we offset the start of the table, we can fix this in $O(1)$
+time.
+\end{enumerate}
+
+We assume that the input sequence does not contain duplicate mers. If
+it does, we should remove them from the table.
+
+\subsection{positionDB}
+
+We can extend the {\tt existDB} to store position information by
+storing, in the bucket entry, either the position of the mer (if there
+is exactly one copy) or a pointer to a list of positions.
+
+Unlike the {\tt existDB} we now need to remove duplicate mers from the
+table.
+
+\begin{enumerate}
+\item
+count bucket size - this overcounts the true size; it counts
+duplicates
+\item
+allocate counting buckets - build a list of hashed mers and the positions
+at which they occur.
+\item
+sort each bucket
+\item
+allocate the final hash table, buckets and position lists
+{\bf XXX} can we reuse the hash table and bucket space??
+\item
+copy the counting buckets into the final structure. mers that occur
+exactly once have their position stored in the bucket entry. mers
+that occur more than once have a pointer to the position list
+placed in the bucket entry.
+\end{enumerate}
+
+\subsection{A Good Hash Function}
+\label{sec:hashfcn}
+
+A simple hash function would be to use the highest $h$ bits of the
+encoded mer as the hash, and use the lowest $m-h$ bits of the mer
+as the check. Unfortunately, this is a very poor hash function ---
+the hash function is strongly correlated with the input mer. {\bf needs more blah blah}
+
+A better hash function would first ``scramble'' the bits in the mer to
+break the correlation between the input and the output.
+
+In the discussion that follows, let $A$ be an encoded mer, $H$ be the
+hashed value of the mer, and $C$ be the collision resolution value.
+$A$ is $m$ bits wide, $H$ is $h$ bits wide and $C$ is $m-h$ bits wide.
+
+We want to find functions
+$f_H : m \rightarrow h$,
+$f_C : m \rightarrow c$,
+$f_R : h \times c \rightarrow m$
+such that
+\begin{align*}
+f_H(A) =& H \\
+f_C(A) =& C \\
+f_R(H,C) =& A
+\end{align*}
+Furthermore, $f_H$ should be a good hash function.
+
+We specify $f_H$ and $f_C$ by specifying each bit in the output.
+\begin{align*}
+H_i &= A_{i} \oplus A_{i+\frac{m-h}{2}} \oplus A_{i+m-h}, \text{ for } 1 \le i \le h \\
+C_i &= A_{i}, \text{ for } 1 \le i \le m-h
+\end{align*}
+Likewise, $f_R$ can be expressed as
+\begin{align*}
+A_i &=
+\begin{cases}
+C_i & 1 \le i \le m-h \\
+A_{i-m+h} \oplus A_{i-\frac{m-h}{2}} \oplus H_{i-m+h} & m-h < i \le m
+\end{cases}
+\end{align*}
+In C code
+\begin{verbatim}
+u64bit fH(u64bit A) {
+ return(((A) ^
+ (A >> (m-h)/2) ^
+ (A >> (m-h))) & MASK(h));
+}
+
+u64bit fC(u64bit A) {
+ return(A & MASK(m-h));
+}
+\end{verbatim}
+where {\tt u64bit} is a 64-bit unsigned integer type. The code for
+$f_R$ is non-trivial, and is not needed.
+
+
+\end{document}
+
diff --git a/ESTmapper LaTeX/hit-filtering.tex b/ESTmapper LaTeX/hit-filtering.tex
new file mode 100644
index 0000000..47356ba
--- /dev/null
+++ b/ESTmapper LaTeX/hit-filtering.tex
@@ -0,0 +1,358 @@
+\documentclass[twoside, twocolumn, 10pt]{article}
+\usepackage{amsmath,amssymb}
+\usepackage{moreverb}
+\usepackage{fancyheadings}
+\usepackage{ulem}
+\usepackage{parskip}
+\usepackage{calc,ifthen,epsfig}
+\sloppy
+
+% A few float parameters
+%
+\renewcommand{\dbltopfraction}{0.9}
+\renewcommand{\dblfloatpagefraction}{0.9}
+%\renewcommand{\textfraction}{0.05}
+
+
+\begin{document}
+
+% See page 63-64, LaTeX Companion
+%
+% leftmargin controls the left margin for EVERYTHING in the list!
+%
+\newcommand{\entrylabel}[1]{\mbox{\texttt{#1:}}\hfil}
+\newenvironment{entry}
+ {\begin{list}{}%
+ {\renewcommand{\makelabel}{\entrylabel}%
+ %\setlength{\leftmargin}{1.5in}%
+ }}
+{\end{list}}
+
+% The first parbox width controls the indent on the first text line
+% The makebox width seems to do nothing.
+\newcommand{\Lentrylabel}[1]{%
+ {\parbox[b]{0pt}{\makebox[0pt][l]{\texttt{#1:}}\\}}\hfil\relax}
+\newenvironment{Lentry}
+ {\renewcommand{\entrylabel}{\Lentrylabel}\begin{entry}}
+ {\end{entry}}
+
+\title{ESTmapper documentation}
+\author{
+Liliana Florea\thanks{liliana.florea at celera.com},
+Brian P. Walenz\thanks{brian.walenz at celera.com}}
+
+\maketitle
+
+\pagestyle{fancy}
+
+\rhead[]{}
+\chead[ESTmapper]{ESTmapper}
+\lhead[\today]{\today}
+
+\normalem
+
+\newcommand{\ESTmapper}{{\sc ESTmapper\ }}
+
+\begin{abstract}
+The gory details of the \ESTmapper process are described.
+\end{abstract}
+
+
+
+
+
+
+
+
+
+
+
+
+\subsection{Hit Filtering}
+
+\begin{figure*}
+\begin{center}
+\epsfig{figure=filter.eps, silent=, width=4.5in}
+\end{center}
+\caption{Diagram of the match-building algorithm. The dotted-box represents
+the extent of the current match. Lines with arrows define regions of action.
+If the next mer falls in the dark region, the current match is evaluated and
+potentially saved; if the next mer falls into any of the lighter areas,
+the mer is added to the current match, and the current match is extended.
+Note that we have processed all mers in the white region.
+We process hit A next. As it is in a light region, it is added to the
+match, and the match is extended. Hit B will break the current match,
+so it is evaluated and saved. A new match region is formed,
+encompassing only hit B. Hit C would extend the new match region.}
+\label{fig:hitfiltering}
+\end{figure*}
+
+The goal of filtering is to take a set of mers, and isolate subsets
+that look like cDNA matches. That is, we want to find a subset of
+hits that form a nearly-identical alignment, but could have large gaps
+in the genomic sequence (introns).
+
+This is done in two passes. The first pass will detect all
+nearly-identical regions, some of these regions will be grouped into
+exon-intron structures. The second pass will examine the regions, and
+merge those that are in approximately the same genomic area.
+
+The hits in a region of near identity will all be on nearly the same
+diagonal. By sorting the hits by the diagonal they are on, we can
+quickly find a subset of hits that form a nearly identical match
+because they will be consecutive in the list.
+
+The first pass is shown in Figure~\ref{fig:hitfiltering}. In the
+figure, a large dashed-box represents the extent of the current
+matching region, the lines with arrows are various distance thresholds
+and divide the space into three regions (dark, light and white).
+
+The white region contains exactly those hits that we have processed
+thus far. If the next hit in the list falls into one of the lightly
+shaded regions, it is added to the current match. If the next hit
+falls into the darkly shaded region, it terminates the current match.
+
+When the current match is terminated, it is evaluated to decide if it
+is a significant match or not. Two classes of matches are possible:
+single exon or multiple exon (based on the size of the diagonal). If
+a single exon match contains more than $X$ exact base matches, the
+match region is saved. If a multiple exon match contains more than
+$Y$ exact base matches, the match region is saved. Otherwise, the
+match region is discarded, and a new match region is created which
+contains only the current hit.
+
+When a match is saved, we only need to save the coordinates in the
+genomic sequence. Essentially, we are saying ``There might be some
+piece of the cDNA on this genomic region''. We extend each side of
+the saved region by an amount proportional to the amount of cDNA that
+was not represented by the match. {\bf need to explain why}
+
+\subsection{Match Merging}
+
+Because of the extension of matches, some matches might be
+overlapping, or close enough to consider the same match. The final
+step is to scan the list of matches and merge those that are close.
+
+\subsection{output}
+
+Matches are scored by the number of exact base matches they have. We
+probably want to normalize this to [0,1] somehow, but should also use
+number of exons, etc., etc.
+
+\section{What is a signal}
+
+Signal has three values associated with it. The amount 'covered', the
+amount 'matched' and the total 'length'.
+%
+The amount covered is the number of
+bases in the mRNA that are contained in at least one mer.
+%
+The amount matched is the number of paired bases (for example, position
+$i$ in the cDNA paired with position $j$ in the genomic) covered by a mer.
+%
+The length is the number of mers in the mRNA (roughly equivalent to the
+number of bases in the mRNA that could be covered by a mer, but easier
+to compute).
+
+From these, we can derive two scores, the coverage and the multiplicity.
+The coverage, $\frac{covered}{length}$, represents the fraction of the mRNA
+that we found, while the multiplicity, $\frac{matched}{covered}$, represents
+the amount of the mRNA that we found too many times.
+
+A high multiplicity usually indicates a repeat-containing mRNA. High
+multiplicity and high coverage can indicate that the mRNA is not cDNA.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Filtering, first try}
+
+Written around 4 December 2001.
+
+\subsection{Output of the Search}
+
+The search outputs hits. Each hit is made up of one or more exon-like
+matches. Each match can represent one or more exons, as long as the
+mers in the match do not show inconsistent overlap.
+
+[Insert three pictures here -- single exon match, multiple exon match,
+inconsistent mers]
+
+Each match is extended by a value determined by the amount of un-matched
+query sequence.
+
+Each match is scored by the number of mers it contains.
+
+Once the list of matches is found, any matches within 5Kb are merged.
+The number of mers in a merged match is the sum of the number of mers
+in its pieces.
+
+The list of merged matches is then output as hits.
+
+Hits are scored by:
+
+(number of mers in the merged match) / (number of mers in the query)
+
+Number of mers in the query is the number of valid mers -- the
+high-frequency mers and mers containing 'N' are not counted.
+
+A score of 1.0 is perfect; a score less than 1.0 indicates a partial
+match, while a score more than 1.0 indicates a duplicate gene nearby,
+or a large spurious match.
+
+\subsection{Filtering of Hits}
+
+Given all hits for a specific query sequence, we filter them by throwing
+away the lowest scoring ones. (duh!)
+
+A low-score cutoff is determined with:
+
+ cutoff = cutoffScale * (highestScore - lowestScore) + lowestScore
+
+where cutoffScale is a parameter to decide how aggressive the
+filtering is (1.0 is perfectly aggressive, 0.3 is reasonable).
+
+Then all hits with score < cutoff are discarded.
+
+\subsection{Modifications}
+
+Occasionally, large spurious matches are found (e.g., ???). The score
+of the spurious match can be significantly better than the score of the
+real match, which will cause the real match to be filtered out. To
+compensate for this, the value of highestScore in the cutoff computation
+is modified to be
+
+max(lowestScore, min(1.0, highestScore))
+
+\subsection{Discussion}
+
+As the search is done over the whole genome, and all hits are used
+to determine the highestScore and lowestScore blah, blah, blah.
+
+
+if highestScore $<$ 1.0 -- unmatched mers are assumed to be in error,
+either in the query or the genome. We will never find these mers, so
+we should reduce the aggressiveness of the filter to account for this.
+
+if highestScore $>$ 1.0 -- the best hit is probably bogus, and we still
+want to polish hits down to (about) the same level as if the best hit
+were a perfect hit. Thus, threshold the highestScore to be a perfect
+hit.
+
+Finally, we need to make the highestScore at least the lowestScore, in
+the extreme case that the worst hit is greater than 1.0. Ha, ha. Why
+are you searching for repeats, anyway?
+
+\subsection{Implementation Detail}
+
+A CPU-time limit is imposed when polishing hits for queries that have
+a hit with score greater than 1.5. This solves the nasty case when we
+get a chunk of genomic as input, and it matches an entire chromosome
+with several hundred exon-like things, and it takes hours to polish.
+
+The more correct thing to do is to abort ANY polish that takes more
+than 60 seconds, not just suspicious looking ones. Software
+engineering issue.
+
+To do this correctly, we would need to register all memory allocated by
+sim4(), and free it when a timer goes off. How to actually return from
+the sim4()? Without longjmp()?
+
+If we use threads, is this easier? Have the master thread abort the
+slave ({\tt pthread\_cancel})? Still have the memory deallocation problem.
+({\tt pthread\_cleanup\_push} can do it, if we keep a list of allocations)
+
+\section{Filtering EST signals}
+
+\section{Filtering mRNA signals}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|p{0.3in}|p{1.25in}|}
+\hline
+Switch & Variable & Def. Value & Description \\
+\hline
+\hline
+-l & $L$ & 0.2 & Signal spread low range \\
+-h & $H$ & 0.6 & Signal spread high range \\
+-v & $V$ & 0.3 & Pass value \\
+-m & $M$ & 0.3 & Signal quality floor \\
+-mc & $M_c$ & 0.2 & Minimum signal quality \\
+-ml & $M_l$ & 150 & Minimum signal size \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters, default values and descriptions}
+\label{table:defvalues}
+\end{figure}
+
+In order to filter signals, we need to decide, for each mRNA, which
+signals are bad, and which are good (duh!), which means that we'll
+need to look at {\em all} signals for a single mRNA.
+
+For the filter presented below, we need to know the best and worst
+coverage values that occur for any signal associated with a specific
+mRNA. Once those are known, the signals can be filtered in any order.
+This is important in the case where the signals are detected
+chromosome by chromosome. Instead of sorting all signals, we can save
+the best and worst coverage for each mRNA.
+
+The filter has six parameters, summarized in Table~\ref{table:defvalues}.
+
+If the signals for a specific mRNA are all very similar, it is
+probable that the weaker signals are weak only because of a few
+mismatches that break 20-mers. In this case, we cannot reliably pick
+the signals that are true, and should consider all of them.
+
+On the other hand, if there is a large range in the quality of signals,
+we can safely discard low scoring signals, and still be confident that
+we will find the good stuff.
+
+Therefore, the filter will discard no signals if the range in quality
+values is small, and will gradually discard more, proportional to the
+range. So that we don't discard too much, we limit the increase in
+filtering to $V$ (0.3).
+\begin{align*}
+h &= bestCoverage - worstCoverage \\
+p &= \begin{cases}
+ 0.0 & \text{if $h \le L$} \\
+ V * \frac{h-L}{H-L} & \text{if $L < h < H$} \\
+ V & \text{if $H \le h$}
+ \end{cases} \\
+c &= min(worstCoverage + p \cdot h, M)
+\end{align*}
+
+\begin{figure*}
+\begin{center}
+\epsfig{figure=mRNAfilt.eps, silent=, width=4.5in}
+\end{center}
+\caption{The $p$ curve.}
+\label{fig:pcurve}
+\end{figure*}
+
+$p$ is the amount of filtering, ranging from minimum (0.0) to maximum
+($V$, a parameter).
+
+The $c$ value computed above is the filtering threshold. Signals with
+coverage below $c$ are considered weak, and are discarded.
+
+If the score range is small ($\le L$), then $c$ will be
+$worstCoverage$, and we do no filtering. If the score range is large
+($\ge H$), then $c$ will be $M$ of the best score. $c$ is the minimum
+coverage that will be accepted. It is derived from the range of
+scores, not the number of scores.
+
+Finally, it is possible that {\em all} signals are good. If we used the
+above filtering we would be discarding the low scoring (but still valid)
+signals. To overcome this, absolute limits $M_c$ and $M_l$ are enforced.
+
+A signal is saved if both of the following conditions are met:
+\begin{enumerate}
+\item ($c \le coverage$)
+\item ($M_c \le coverage$) or ($M_l \le coveredBases$)
+\end{enumerate}
+
+This filter is overly permissive, throwing out only signals that are
+obviously garbage.
+
+
+\end{document}
+
diff --git a/ESTmapper LaTeX/mRNAfilt.eps b/ESTmapper LaTeX/mRNAfilt.eps
new file mode 100644
index 0000000..91586d4
--- /dev/null
+++ b/ESTmapper LaTeX/mRNAfilt.eps
@@ -0,0 +1,239 @@
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: mRNAfilt.eps
+%%Creator: fig2dev Version 3.2.3 Patchlevel
+%%CreationDate: Thu Jan 17 19:14:55 2002
+%%For: walenz at fengshui.home (Brian Walenz)
+%%BoundingBox: 0 0 469 316
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+newpath 0 316 moveto 0 0 lineto 469 0 lineto 469 316 lineto closepath clip newpath
+-72.0 387.0 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+
+$F2psBegin
+%%Page: 1 1
+10 setmiterlimit
+ 0.06000 0.06000 sc
+/Times-Roman ff 180.00 scf sf
+1800 1575 m
+gs 1 -1 sc (1.0) col0 sh gr
+% Polyline
+7.500 slw
+n 9000 6000 m
+ 2100 6000 l gs col0 s gr
+% Polyline
+n 3000 6300 m
+ 3000 6000 l gs col0 s gr
+% Polyline
+n 3600 6300 m
+ 3600 6000 l gs col0 s gr
+% Polyline
+n 4800 6300 m
+ 4800 6000 l gs col0 s gr
+% Polyline
+n 5400 6300 m
+ 5400 6000 l gs col0 s gr
+% Polyline
+n 6000 6300 m
+ 6000 6000 l gs col0 s gr
+% Polyline
+n 4200 6300 m
+ 4200 6000 l gs col0 s gr
+% Polyline
+n 6600 6300 m
+ 6600 6000 l gs col0 s gr
+% Polyline
+n 7200 6300 m
+ 7200 6000 l gs col0 s gr
+% Polyline
+n 7800 6300 m
+ 7800 6000 l gs col0 s gr
+% Polyline
+n 8400 6300 m
+ 8400 6000 l gs col0 s gr
+% Polyline
+n 2100 5400 m
+ 2400 5400 l gs col0 s gr
+% Polyline
+n 2100 4800 m
+ 2400 4800 l gs col0 s gr
+% Polyline
+n 2100 4200 m
+ 2400 4200 l gs col0 s gr
+% Polyline
+n 2100 3600 m
+ 2400 3600 l gs col0 s gr
+% Polyline
+n 2100 3000 m
+ 2400 3000 l gs col0 s gr
+% Polyline
+30.000 slw
+n 2400 5400 m 3600 5400 l 6000 3600 l
+ 8400 3600 l gs col0 s gr
+% Polyline
+7.500 slw
+ [15 60] 60 sd
+n 2400 3600 m
+ 9000 3600 l gs col0 s gr [] 0 sd
+% Polyline
+ [15 60] 60 sd
+n 3600 2700 m
+ 3600 5700 l gs col0 s gr [] 0 sd
+% Polyline
+ [15 60] 60 sd
+n 6000 2700 m
+ 6000 5700 l gs col0 s gr [] 0 sd
+% Polyline
+n 2100 1500 m
+ 2400 1500 l gs col0 s gr
+/Times-Roman ff 180.00 scf sf
+1275 1800 m
+gs 1 -1 sc (bestCoverage) col0 sh gr
+% Polyline
+n 2400 2700 m
+ 2400 6300 l gs col0 s gr
+/Times-Roman ff 180.00 scf sf
+1800 5475 m
+gs 1 -1 sc (0.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1800 4875 m
+gs 1 -1 sc (0.1) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1800 4275 m
+gs 1 -1 sc (0.2) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1800 3675 m
+gs 1 -1 sc (0.3) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1800 3075 m
+gs 1 -1 sc (0.4) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2325 6450 m
+gs 1 -1 sc (0.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2925 6450 m
+gs 1 -1 sc (0.1) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3525 6450 m
+gs 1 -1 sc (0.2) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+4125 6450 m
+gs 1 -1 sc (0.3) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+4725 6450 m
+gs 1 -1 sc (0.4) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5325 6450 m
+gs 1 -1 sc (0.5) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5925 6450 m
+gs 1 -1 sc (0.6) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+6525 6450 m
+gs 1 -1 sc (0.7) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+7125 6450 m
+gs 1 -1 sc (0.8) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+7725 6450 m
+gs 1 -1 sc (0.9) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+8325 6450 m
+gs 1 -1 sc (1.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2625 3525 m
+gs 1 -1 sc (V) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3450 2850 m
+gs 1 -1 sc (L) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5850 2850 m
+gs 1 -1 sc (H) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+7200 3525 m
+gs 1 -1 sc (minL) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1200 5700 m
+gs 1 -1 sc (worstCoverage) col0 sh gr
+% Polyline
+n 2400 1200 m
+ 2400 1800 l gs col0 s gr
+% Polyline
+ [15 45] 45 sd
+n 2400 1800 m
+ 2400 2700 l gs col0 s gr [] 0 sd
+$F2psEnd
+rs
diff --git a/ESTmapper LaTeX/mRNAfilt.fig b/ESTmapper LaTeX/mRNAfilt.fig
new file mode 100644
index 0000000..7b10507
--- /dev/null
+++ b/ESTmapper LaTeX/mRNAfilt.fig
@@ -0,0 +1,80 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter
+100.00
+Single
+-2
+1200 2
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2400 2700 2400 6300
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 9000 6000 2100 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 3000 6300 3000 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 3600 6300 3600 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 4800 6300 4800 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 5400 6300 5400 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 6000 6300 6000 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 4200 6300 4200 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 6600 6300 6600 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7200 6300 7200 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 7800 6300 7800 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 8400 6300 8400 6000
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 5400 2400 5400
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 4800 2400 4800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 4200 2400 4200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 3600 2400 3600
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 3000 2400 3000
+2 1 0 3 0 7 100 0 -1 0.000 0 0 -1 0 0 4
+ 2400 5400 3600 5400 6000 3600 8400 3600
+2 1 2 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 2400 3600 9000 3600
+2 1 2 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 3600 2700 3600 5700
+2 1 2 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+ 6000 2700 6000 5700
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+ 2100 1500 2400 1500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2
+ 2400 1200 2400 1800
+2 1 2 1 0 7 50 0 -1 3.000 0 0 -1 0 0 2
+ 2400 1800 2400 2700
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 5475 0.0\001
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 4875 0.1\001
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 4275 0.2\001
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 3675 0.3\001
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 3075 0.4\001
+4 0 0 100 0 0 12 0.0000 0 135 225 2325 6450 0.0\001
+4 0 0 100 0 0 12 0.0000 0 135 225 2925 6450 0.1\001
+4 0 0 100 0 0 12 0.0000 0 135 225 3525 6450 0.2\001
+4 0 0 100 0 0 12 0.0000 0 135 225 4125 6450 0.3\001
+4 0 0 100 0 0 12 0.0000 0 135 225 4725 6450 0.4\001
+4 0 0 100 0 0 12 0.0000 0 135 225 5325 6450 0.5\001
+4 0 0 100 0 0 12 0.0000 0 135 225 5925 6450 0.6\001
+4 0 0 100 0 0 12 0.0000 0 135 225 6525 6450 0.7\001
+4 0 0 100 0 0 12 0.0000 0 135 225 7125 6450 0.8\001
+4 0 0 100 0 0 12 0.0000 0 135 225 7725 6450 0.9\001
+4 0 0 100 0 0 12 0.0000 0 135 225 8325 6450 1.0\001
+4 0 0 100 0 0 12 0.0000 0 135 135 2625 3525 V\001
+4 0 0 100 0 0 12 0.0000 0 135 105 3450 2850 L\001
+4 0 0 100 0 0 12 0.0000 0 135 135 5850 2850 H\001
+4 0 0 100 0 0 12 0.0000 0 135 375 7200 3525 minL\001
+4 0 0 100 0 0 12 0.0000 0 180 1155 1200 5700 worstCoverage\001
+4 0 0 100 0 0 12 0.0000 0 180 1050 1275 1800 bestCoverage\001
+4 0 0 100 0 0 12 0.0000 0 135 225 1800 1575 1.0\001
diff --git a/ESTmapper LaTeX/mrna-filter.ps b/ESTmapper LaTeX/mrna-filter.ps
new file mode 100644
index 0000000..d34c532
--- /dev/null
+++ b/ESTmapper LaTeX/mrna-filter.ps
@@ -0,0 +1,978 @@
+%!PS-Adobe-2.0
+%%Creator: dvips(k) 5.86 Copyright 1999 Radical Eye Software
+%%Title: mrna-filter.dvi
+%%Pages: 2
+%%PageOrder: Ascend
+%%BoundingBox: 0 0 596 842
+%%EndComments
+%DVIPSWebPage: (www.radicaleye.com)
+%DVIPSCommandLine: dvips -o mrna-filter.ps mrna-filter.dvi
+%DVIPSParameters: dpi=600, compressed
+%DVIPSSource: TeX output 2002.01.17:1521
+%%BeginProcSet: texc.pro
+%!
+/TeXDict 300 dict def TeXDict begin/N{def}def/B{bind def}N/S{exch}N/X{S
+N}B/A{dup}B/TR{translate}N/isls false N/vsize 11 72 mul N/hsize 8.5 72
+mul N/landplus90{false}def/@rigin{isls{[0 landplus90{1 -1}{-1 1}ifelse 0
+0 0]concat}if 72 Resolution div 72 VResolution div neg scale isls{
+landplus90{VResolution 72 div vsize mul 0 exch}{Resolution -72 div hsize
+mul 0}ifelse TR}if Resolution VResolution vsize -72 div 1 add mul TR[
+matrix currentmatrix{A A round sub abs 0.00001 lt{round}if}forall round
+exch round exch]setmatrix}N/@landscape{/isls true N}B/@manualfeed{
+statusdict/manualfeed true put}B/@copies{/#copies X}B/FMat[1 0 0 -1 0 0]
+N/FBB[0 0 0 0]N/nn 0 N/IEn 0 N/ctr 0 N/df-tail{/nn 8 dict N nn begin
+/FontType 3 N/FontMatrix fntrx N/FontBBox FBB N string/base X array
+/BitMaps X/BuildChar{CharBuilder}N/Encoding IEn N end A{/foo setfont}2
+array copy cvx N load 0 nn put/ctr 0 N[}B/sf 0 N/df{/sf 1 N/fntrx FMat N
+df-tail}B/dfs{div/sf X/fntrx[sf 0 0 sf neg 0 0]N df-tail}B/E{pop nn A
+definefont setfont}B/Cw{Cd A length 5 sub get}B/Ch{Cd A length 4 sub get
+}B/Cx{128 Cd A length 3 sub get sub}B/Cy{Cd A length 2 sub get 127 sub}
+B/Cdx{Cd A length 1 sub get}B/Ci{Cd A type/stringtype ne{ctr get/ctr ctr
+1 add N}if}B/id 0 N/rw 0 N/rc 0 N/gp 0 N/cp 0 N/G 0 N/CharBuilder{save 3
+1 roll S A/base get 2 index get S/BitMaps get S get/Cd X pop/ctr 0 N Cdx
+0 Cx Cy Ch sub Cx Cw add Cy setcachedevice Cw Ch true[1 0 0 -1 -.1 Cx
+sub Cy .1 sub]/id Ci N/rw Cw 7 add 8 idiv string N/rc 0 N/gp 0 N/cp 0 N{
+rc 0 ne{rc 1 sub/rc X rw}{G}ifelse}imagemask restore}B/G{{id gp get/gp
+gp 1 add N A 18 mod S 18 idiv pl S get exec}loop}B/adv{cp add/cp X}B
+/chg{rw cp id gp 4 index getinterval putinterval A gp add/gp X adv}B/nd{
+/cp 0 N rw exit}B/lsh{rw cp 2 copy get A 0 eq{pop 1}{A 255 eq{pop 254}{
+A A add 255 and S 1 and or}ifelse}ifelse put 1 adv}B/rsh{rw cp 2 copy
+get A 0 eq{pop 128}{A 255 eq{pop 127}{A 2 idiv S 128 and or}ifelse}
+ifelse put 1 adv}B/clr{rw cp 2 index string putinterval adv}B/set{rw cp
+fillstr 0 4 index getinterval putinterval adv}B/fillstr 18 string 0 1 17
+{2 copy 255 put pop}for N/pl[{adv 1 chg}{adv 1 chg nd}{1 add chg}{1 add
+chg nd}{adv lsh}{adv lsh nd}{adv rsh}{adv rsh nd}{1 add adv}{/rc X nd}{
+1 add set}{1 add clr}{adv 2 chg}{adv 2 chg nd}{pop nd}]A{bind pop}
+forall N/D{/cc X A type/stringtype ne{]}if nn/base get cc ctr put nn
+/BitMaps get S ctr S sf 1 ne{A A length 1 sub A 2 index S get sf div put
+}if put/ctr ctr 1 add N}B/I{cc 1 add D}B/bop{userdict/bop-hook known{
+bop-hook}if/SI save N @rigin 0 0 moveto/V matrix currentmatrix A 1 get A
+mul exch 0 get A mul add .99 lt{/QV}{/RV}ifelse load def pop pop}N/eop{
+SI restore userdict/eop-hook known{eop-hook}if showpage}N/@start{
+userdict/start-hook known{start-hook}if pop/VResolution X/Resolution X
+1000 div/DVImag X/IEn 256 array N 2 string 0 1 255{IEn S A 360 add 36 4
+index cvrs cvn put}for pop 65781.76 div/vsize X 65781.76 div/hsize X}N
+/p{show}N/RMat[1 0 0 -1 0 0]N/BDot 260 string N/Rx 0 N/Ry 0 N/V{}B/RV/v{
+/Ry X/Rx X V}B statusdict begin/product where{pop false[(Display)(NeXT)
+(LaserWriter 16/600)]{A length product length le{A length product exch 0
+exch getinterval eq{pop true exit}if}{pop}ifelse}forall}{false}ifelse
+end{{gsave TR -.1 .1 TR 1 1 scale Rx Ry false RMat{BDot}imagemask
+grestore}}{{gsave TR -.1 .1 TR Rx Ry scale 1 1 false RMat{BDot}
+imagemask grestore}}ifelse B/QV{gsave newpath transform round exch round
+exch itransform moveto Rx 0 rlineto 0 Ry neg rlineto Rx neg 0 rlineto
+fill grestore}B/a{moveto}B/delta 0 N/tail{A/delta X 0 rmoveto}B/M{S p
+delta add tail}B/b{S p tail}B/c{-4 M}B/d{-3 M}B/e{-2 M}B/f{-1 M}B/g{0 M}
+B/h{1 M}B/i{2 M}B/j{3 M}B/k{4 M}B/w{0 rmoveto}B/l{p -4 w}B/m{p -3 w}B/n{
+p -2 w}B/o{p -1 w}B/q{p 1 w}B/r{p 2 w}B/s{p 3 w}B/t{p 4 w}B/x{0 S
+rmoveto}B/y{3 2 roll p a}B/bos{/SS save N}B/eos{SS restore}B end
+
+%%EndProcSet
+%%BeginProcSet: special.pro
+%!
+TeXDict begin/SDict 200 dict N SDict begin/@SpecialDefaults{/hs 612 N
+/vs 792 N/ho 0 N/vo 0 N/hsc 1 N/vsc 1 N/ang 0 N/CLIP 0 N/rwiSeen false N
+/rhiSeen false N/letter{}N/note{}N/a4{}N/legal{}N}B/@scaleunit 100 N
+/@hscale{@scaleunit div/hsc X}B/@vscale{@scaleunit div/vsc X}B/@hsize{
+/hs X/CLIP 1 N}B/@vsize{/vs X/CLIP 1 N}B/@clip{/CLIP 2 N}B/@hoffset{/ho
+X}B/@voffset{/vo X}B/@angle{/ang X}B/@rwi{10 div/rwi X/rwiSeen true N}B
+/@rhi{10 div/rhi X/rhiSeen true N}B/@llx{/llx X}B/@lly{/lly X}B/@urx{
+/urx X}B/@ury{/ury X}B/magscale true def end/@MacSetUp{userdict/md known
+{userdict/md get type/dicttype eq{userdict begin md length 10 add md
+maxlength ge{/md md dup length 20 add dict copy def}if end md begin
+/letter{}N/note{}N/legal{}N/od{txpose 1 0 mtx defaultmatrix dtransform S
+atan/pa X newpath clippath mark{transform{itransform moveto}}{transform{
+itransform lineto}}{6 -2 roll transform 6 -2 roll transform 6 -2 roll
+transform{itransform 6 2 roll itransform 6 2 roll itransform 6 2 roll
+curveto}}{{closepath}}pathforall newpath counttomark array astore/gc xdf
+pop ct 39 0 put 10 fz 0 fs 2 F/|______Courier fnt invertflag{PaintBlack}
+if}N/txpose{pxs pys scale ppr aload pop por{noflips{pop S neg S TR pop 1
+-1 scale}if xflip yflip and{pop S neg S TR 180 rotate 1 -1 scale ppr 3
+get ppr 1 get neg sub neg ppr 2 get ppr 0 get neg sub neg TR}if xflip
+yflip not and{pop S neg S TR pop 180 rotate ppr 3 get ppr 1 get neg sub
+neg 0 TR}if yflip xflip not and{ppr 1 get neg ppr 0 get neg TR}if}{
+noflips{TR pop pop 270 rotate 1 -1 scale}if xflip yflip and{TR pop pop
+90 rotate 1 -1 scale ppr 3 get ppr 1 get neg sub neg ppr 2 get ppr 0 get
+neg sub neg TR}if xflip yflip not and{TR pop pop 90 rotate ppr 3 get ppr
+1 get neg sub neg 0 TR}if yflip xflip not and{TR pop pop 270 rotate ppr
+2 get ppr 0 get neg sub neg 0 S TR}if}ifelse scaleby96{ppr aload pop 4
+-1 roll add 2 div 3 1 roll add 2 div 2 copy TR .96 dup scale neg S neg S
+TR}if}N/cp{pop pop showpage pm restore}N end}if}if}N/normalscale{
+Resolution 72 div VResolution 72 div neg scale magscale{DVImag dup scale
+}if 0 setgray}N/psfts{S 65781.76 div N}N/startTexFig{/psf$SavedState
+save N userdict maxlength dict begin/magscale true def normalscale
+currentpoint TR/psf$ury psfts/psf$urx psfts/psf$lly psfts/psf$llx psfts
+/psf$y psfts/psf$x psfts currentpoint/psf$cy X/psf$cx X/psf$sx psf$x
+psf$urx psf$llx sub div N/psf$sy psf$y psf$ury psf$lly sub div N psf$sx
+psf$sy scale psf$cx psf$sx div psf$llx sub psf$cy psf$sy div psf$ury sub
+TR/showpage{}N/erasepage{}N/copypage{}N/p 3 def @MacSetUp}N/doclip{
+psf$llx psf$lly psf$urx psf$ury currentpoint 6 2 roll newpath 4 copy 4 2
+roll moveto 6 -1 roll S lineto S lineto S lineto closepath clip newpath
+moveto}N/endTexFig{end psf$SavedState restore}N/@beginspecial{SDict
+begin/SpecialSave save N gsave normalscale currentpoint TR
+@SpecialDefaults count/ocount X/dcount countdictstack N}N/@setspecial{
+CLIP 1 eq{newpath 0 0 moveto hs 0 rlineto 0 vs rlineto hs neg 0 rlineto
+closepath clip}if ho vo TR hsc vsc scale ang rotate rwiSeen{rwi urx llx
+sub div rhiSeen{rhi ury lly sub div}{dup}ifelse scale llx neg lly neg TR
+}{rhiSeen{rhi ury lly sub div dup scale llx neg lly neg TR}if}ifelse
+CLIP 2 eq{newpath llx lly moveto urx lly lineto urx ury lineto llx ury
+lineto closepath clip}if/showpage{}N/erasepage{}N/copypage{}N newpath}N
+/@endspecial{count ocount sub{pop}repeat countdictstack dcount sub{end}
+repeat grestore SpecialSave restore end}N/@defspecial{SDict begin}N
+/@fedspecial{end}B/li{lineto}B/rl{rlineto}B/rc{rcurveto}B/np{/SaveX
+currentpoint/SaveY X N 1 setlinecap newpath}N/st{stroke SaveX SaveY
+moveto}N/fil{fill SaveX SaveY moveto}N/ellipse{/endangle X/startangle X
+/yrad X/xrad X/savematrix matrix currentmatrix N TR xrad yrad scale 0 0
+1 startangle endangle arc savematrix setmatrix}N end
+
+%%EndProcSet
+TeXDict begin 39158280 55380996 1000 600 600 (mrna-filter.dvi)
+@start
+%DVIPSBitmapFont: Fa cmsy7 7 1
+/Fa 1 1 df<B712FEA327037A8F34>0 D E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fb cmex10 10 4
+/Fb 4 63 df<EC01F01407140F143F147F903801FFC0491380491300495A495A495A495A
+5C495A485B5A91C7FC485AA2485AA2485AA2123F5BA2127F5BA412FF5BB3B3A71C4B607E
+4A>56 D<EAFFC0B3B3A77F127FA47F123FA27F121FA26C7EA26C7EA26C7E807E6C7F6D7E
+806D7E6D7E6D7E6D7E6D13806D13C09038007FF0143F140F140714011C4B60804A>58
+D<EC1FF8B3B3A7143F15F0A4EC7FE0A315C014FFA2491380A215005B5C1307495A5C131F
+495A5C495A495A4890C7FC485A485A485A485AEA7FE0EAFF8090C8FC12FCB4FC7FEA7FE0
+EA1FF06C7E6C7E6C7E6C7E6C7F6D7E6D7E806D7E130F806D7E1303807F1580A26D13C0A2
+147F15E0A3EC3FF0A415F8141FB3B3A71D9773804A>60 D<EAFFC0B3A90A1B60804A>62
+D E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fc cmsy10 10 4
+/Fc 4 21 df<007FB81280B912C0A26C17803204799641>0 D<121C127FEAFF80A5EA7F
+00121C0909799917>I<EB0380497EA7397803803C00FC147E00FE14FE397F8383FC393F
+C387F8390FE38FE03903FBBF803900FFFE00EB3FF8EB0FE0A2EB3FF8EBFFFE3903FBBF80
+390FE38FE0393FC387F8397F8383FC39FE0380FE00FC147E0078143C390007C000A76D5A
+1F247BA62A>3 D<EF0180EF07C0171FEF7F80933801FF00EE07FCEE1FF0EE7FC04B48C7
+FCED07FCED1FF0ED7FC04A48C8FCEC07FCEC1FF0EC7FC04948C9FCEB07FCEB1FF0EB7FC0
+4848CAFCEA07FCEA3FF0EA7FC048CBFC5AEA7F80EA3FE0EA0FF8EA03FEC66C7EEB3FE0EB
+0FF8EB03FE903800FF80EC3FE0EC0FF8EC03FE913800FF80ED3FE0ED0FF8ED03FE923800
+FF80EE3FE0EE0FF8EE03FE933800FF80EF3FC0170FEF038094C7FCAE007FB81280B912C0
+A26C1780324479B441>20 D E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fd cmmi7 7 5
+/Fd 5 109 df<903B3FFFF01FFFF8A2D901FCC7EAFE004A5CA2010314015F5CA2010714
+035F5CA2010F14075F5CA2011F140F91B65AA2913880000F013F141F5F91C7FCA249143F
+94C7FC137EA201FE5C167E5BA2000115FE5E5BA200031401B539C07FFFE0A235287DA736
+>72 D<90383FFFF8A2D901FCC7FC5CA21303A25CA21307A25CA2130FA25CA2131FA25CA2
+133FA291C8FCA249141C1618137E163801FE1430167049146016E000011401ED03C04913
+07ED0F800003147FB7FC160026287DA72E>76 D<EB0FE0EB3FF8EBF81C3801E00E380380
+3E0007137EEA0F00001E137C123E003C1300127CA25AA45AA31404140C0078131814306C
+13E0381E07C0380FFF00EA03F8171B7C991E>99 D<133EEA07FEA2EA007CA213FCA25BA2
+1201A25BA2120314FCEBE3FF9038EF0780D807FC13C0EBF00313E0A2EA0FC014071380A2
+121FEC0F801300A248EB1F00A2003E1406143E127EEC7C0C127C151800FCEB3C30157048
+EB1FE00070EB0F801F297CA727>104 D<137CEA0FFCA2EA00F8A21201A213F0A21203A2
+13E0A21207A213C0A2120FA21380A2121FA21300A25AA2123EA2127EA2EA7C18A3EAF830
+A21320EA786013C0EA3F80EA0F000E297EA715>108 D E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fe cmmi10 10 27
+/Fe 27 120 df<121C127FEAFF80A5EA7F00121C0909798817>58
+D<121C127FEAFF80A213C0A3127F121C1200A412011380A2120313005A1206120E5A5A5A
+12600A19798817>I<EF0380EF0FC0173FEFFF80933803FE00EE0FF8EE3FE0EEFF80DB03
+FEC7FCED0FF8ED3FE0EDFF80DA03FEC8FCEC0FF8EC3FE0ECFF80D903FEC9FCEB0FF8EB3F
+E0EBFF80D803FECAFCEA0FF8EA3FE0EA7F8000FECBFCA2EA7F80EA3FE0EA0FF8EA03FEC6
+6C7EEB3FE0EB0FF8EB03FE903800FF80EC3FE0EC0FF8EC03FE913800FF80ED3FE0ED0FF8
+ED03FE923800FF80EE3FE0EE0FF8EE03FE933800FF80EF3FC0170FEF0380323279AD41>
+I<0103B77E4916F018FC903B0007F80003FE4BEB00FFF07F80020FED3FC0181F4B15E0A2
+141FA25DA2143F19C04B143F1980027F157F190092C812FE4D5A4A4A5AEF0FF04AEC1FC0
+05FFC7FC49B612FC5F02FCC7B4FCEF3FC00103ED0FE0717E5C717E1307844A1401A2130F
+17035CA2131F4D5A5C4D5A133F4D5A4A4A5A4D5A017F4BC7FC4C5A91C7EA07FC49EC3FF0
+B812C094C8FC16F83B397DB83F>66 D<9339FF8001C0030F13E0037F9038F80380913A01
+FF807E07913A07F8000F0FDA1FE0EB079FDA3F80903803BF0002FFC76CB4FCD901FC8049
+5A4948157E495A495A4948153E017F163C49C9FC5B1201484816385B1207485A1830121F
+4993C7FCA2485AA3127F5BA312FF90CCFCA41703A25F1706A26C160E170C171C5F6C7E5F
+001F5E6D4A5A6C6C4A5A16076C6C020EC8FC6C6C143C6C6C5C6CB4495A90393FE00FC001
+0FB5C9FC010313FC9038007FC03A3D7CBA3B>I<0103B5D8F803B512F8495DA290260007
+F8C73807F8004B5DA2020F150F615DA2021F151F615DA2023F153F615DA2027F157F96C7
+FC92C8FCA24A5D605CA249B7FC60A202FCC7120101031503605CA201071507605CA2010F
+150F605CA2011F151F605CA2013F153F605CA2017F157F95C8FC91C8FC496C4A7EB690B6
+FCA345397DB845>72 D<0103B6FC5B5E90260007FCC8FC5D5D140FA25DA2141FA25DA214
+3FA25DA2147FA292C9FCA25CA25CA21301A25CA21303A25CA2130718404A15C0A2010F15
+0118804A1403A2011F16005F4A1406170E013F151E171C4A143C177C017F5D160391C712
+0F49EC7FF0B8FCA25F32397DB839>76 D<902603FFF893383FFF80496081D900079438FF
+80000206DC01BFC7FCA2020E4C5A1A7E020C1606190CDA1C7E16FE4F5A02181630A20238
+166162023016C1F00181DA703F158395380303F002601506A202E0ED0C076202C0151818
+3001016D6C140F06605B028015C0A20103923801801FDD03005B140092380FC00649173F
+4D91C8FC01065DA2010E4B5B4D137E130C6F6C5A011C17FEDCE1805B011802E3C7FCA201
+3802E6130104EC5C1330ED03F8017016034C5C01F05CD807FC4C7EB500E0D9C007B512F0
+1680150151397CB851>I<267FFFFC91383FFFC0B55DA2000390C83807FC006C48ED03E0
+6060000094C7FC5F17065FA25F6D5DA26D5D17E05F4C5AA24CC8FC6E1306A2013F5C161C
+16185EA25E6E5BA2011F495A150393C9FC1506A25D6E5AA2010F5B157015605DA2ECE180
+02E3CAFC14F3EB07F614FE5C5CA25C5CA26D5AA25C91CBFC3A3B7CB830>86
+D<147E903803FF8090390FC1C38090391F00EFC0017E137F49133F485A4848EB1F801207
+5B000F143F48481400A2485A5D007F147E90C7FCA215FE485C5AA214015D48150CA21403
+EDF01C16181407007C1538007E010F1330003E131F027B13706C01E113E03A0F83C0F9C0
+3A03FF007F80D800FCEB1F0026267DA42C>97 D<133FEA1FFFA3C67E137EA313FE5BA312
+015BA312035BA31207EBE0FCEBE3FF9038E707C0390FFE03E09038F801F001F013F8EBE0
+00485A15FC5BA2123F90C7FCA214015A127EA2140312FE4814F8A2140715F05AEC0FE0A2
+15C0EC1F80143F00781400007C137E5C383C01F86C485A380F07C06CB4C7FCEA01FC1E3B
+7CB924>I<EC3FC0903801FFF0903807E03C90380F800E90383F0007017E131F49137F48
+4813FF485A485A120F4913FE001F143848481300A2127F90C8FCA35A5AA45AA315031507
+007E1406150E003E143C003F14706C14E0390F8007C03907C03F003801FFF838003FC020
+267DA424>I<163FED1FFFA3ED007F167EA216FEA216FCA21501A216F8A21503A216F0A2
+1507A2027E13E0903803FF8790380FC1CF90381F00EF017EEB7FC049133F485A4848131F
+000715805B000F143F485A1600485A5D127F90C7127EA215FE5A485CA21401A248ECF80C
+A21403161CEDF0181407007C1538007E010F1330003E131F027B13706C01E113E03A0F83
+C0F9C03A03FF007F80D800FCEB1F00283B7DB92B>I<EC3FC0903801FFF0903807E07890
+381F801C90387E001E49130E485A485A1207485A49131E001F141C153C484813F8EC03E0
+007FEB3FC09038FFFE0014E090C8FC5A5AA7007E140315071506003E140E153C6C14706C
+6C13E0EC07C03903E03F003801FFF838003FC020267DA427>I<EC07E0EC1FF891387C1C
+38903901F80EFC903803F007903807E003EB0FC090381F8001D93F0013F85B017E130313
+FE16F0485A150712034914E0A2150F12074914C0A2151FA2491480A2153FA2160000035C
+6D5B00015B4A5A3900F8077E90387C1EFEEB1FF8903807E0FC90C7FC1401A25DA2140300
+1E5C123F387F80075D00FF495A49485A4849C7FC007C137E383C01F8381FFFE0000390C8
+FC26367FA428>103 D<EB03F0EA01FFA3EA00075CA3130F5CA3131F5CA3133F91C9FCA3
+5B90387E03F8EC0FFF91383C0F809039FEF007C0D9FDC07FEBFF80EC0003485A5BA24913
+0712035BA2150F00075D5BA2151F000F5D5B153F93C7FC121F4990387F0180157EEDFE03
+003F02FC130090C7FC5EEDF80648150E007E150C161C5E00FEEC787048EC3FE00038EC0F
+80293B7CB930>I<14E0EB03F8A21307A314F0EB01C090C7FCAB13F8EA03FEEA070F000E
+1380121C121812381230EA701F1260133F00E0130012C05BEA007EA213FE5B1201A25B12
+035BA20007131813E01438000F133013C01470EB806014E014C01381EB838038078700EA
+03FEEA00F815397EB71D>I<D803E0017F14FE3D07F801FFE003FFC03D0E3C0781F00F03
+E03D1C3E1E00F83C01F026383F38D9FC707F00304914E04A90387DC000007049EB7F8000
+604991C7FCA200E090C700FE1301485A017E5CA200000201140301FE5F495CA203031407
+000160495C180F03075D1203494A011F13601980030F023F13E00007F000C0495C190103
+1F023E1380000F1803494A150061033F150E001FEF1E1C4991C7EA0FF80007C7000EEC03
+E043267EA449>109 D<D803E0137F3A07F801FFE03A0E3C0781F03A1C3E1E00F826383F
+387F00305B4A137C00705B00605BA200E090C712FC485A137EA20000140101FE5C5BA215
+0300015D5B15075E120349010F133016C0031F13700007ED80605B17E0EE00C0000F1501
+4915801603EE0700001FEC0F0E49EB07FC0007C7EA01F02C267EA432>I<EC1FC0ECFFF8
+903807E07E90380F801F90393F000F80017E14C0491307484814E0485A4848EB03F0120F
+5B121F48481307A2127F90C7FCA2150F5A4815E0A2151F16C0A248EC3F8016005D157E00
+7E5C4A5A003E495A003F495A6C495A6C6C48C7FC3807E07E3801FFF038003F8024267DA4
+28>I<90390F8003F090391FE00FFC903939F03C1F903A70F8700F80903AE0FDE007C090
+38C0FF80030013E00001491303018015F05CEA038113015CA2D800031407A25CA2010714
+0FA24A14E0A2010F141F17C05CEE3F80131FEE7F004A137E16FE013F5C6E485A4B5A6E48
+5A90397F700F80DA383FC7FC90387E1FFCEC07E001FEC9FCA25BA21201A25BA21203A25B
+1207B512C0A32C3583A42A>I<3903E001F83907F807FE390E3C1E07391C3E381F3A183F
+703F800038EBE07F0030EBC0FF00705B00601500EC007E153CD8E07F90C7FCEAC07EA212
+0013FE5BA312015BA312035BA312075BA3120F5BA3121F5B0007C9FC21267EA425>114
+D<14FF010313C090380F80F090383E00380178131C153C4913FC0001130113E0A33903F0
+00F06D13007F3801FFE014FC14FF6C14806D13C0011F13E013039038003FF01407140300
+1E1301127FA24814E0A348EB03C012F800E0EB07800070EB0F006C133E001E13F83807FF
+E0000190C7FC1E267CA427>I<EB01C0497E1307A4130F5CA3131F5CA3133F91C7FC007F
+B51280A2B6FCD8007EC7FCA313FE5BA312015BA312035BA312075BA3120FEBC006A2140E
+001F130CEB801C141814385C146014E0380F81C038078780D803FEC7FCEA00F819357EB3
+1E>I<13F8D803FE1438D8070F147C000E6D13FC121C1218003814011230D8701F5C1260
+1503EAE03F00C001005B5BD8007E1307A201FE5C5B150F1201495CA2151F120349EC80C0
+A2153F1681EE0180A2ED7F0303FF130012014A5B3A00F8079F0E90397C0E0F1C90393FFC
+07F8903907F001F02A267EA430>I<01F8EB03C0D803FEEB07E0D8070F130F000E018013
+F0121C12180038140700301403D8701F130112601500D8E03F14E000C090C7FC5BEA007E
+16C013FE5B1501000115805B150316001203495B1506150E150C151C151815385D00015C
+6D485A6C6C485AD97E0FC7FCEB1FFEEB07F024267EA428>I<01F816F0D803FE9138E001
+F8D8070F903801F003000ED9800314FC121C12180038020713010030EDE000D8701F167C
+1260030F143CD8E03F163800C001005B5BD8007E131F183001FE5C5B033F147000011760
+4991C7FCA218E000034A14C049137E17011880170318005F03FE1306170E000101015C01
+F801BF5B3B00FC039F8070903A7E0F0FC0E0903A1FFC03FFC0902703F0007FC7FC36267E
+A43B>I E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Ff cmr10 10 59
+/Ff 59 123 df<EC0FF8EC7FFE903901F80780903907E001C090391F8000E090383F0007
+017E497EA25BA2485A6F5AED018092C8FCA9ED03F0B7FCA33901F8000F1503B3AA486C49
+7E267FFFE0B512C0A32A3B7FBA2E>12 D<121C127FEAFF80A213C0A3127F121C1200A412
+011380A2120313005A1206120E5A5A5A12600A1979B917>39 D<146014E0EB01C0EB0380
+EB0700130E131E5B5BA25B485AA2485AA212075B120F90C7FCA25A121EA2123EA35AA65A
+B2127CA67EA3121EA2121F7EA27F12077F1203A26C7EA26C7E1378A27F7F130E7FEB0380
+EB01C0EB00E01460135278BD20>I<12C07E12707E7E7E120F6C7E6C7EA26C7E6C7EA213
+78A2137C133C133E131EA2131F7FA21480A3EB07C0A6EB03E0B2EB07C0A6EB0F80A31400
+A25B131EA2133E133C137C1378A25BA2485A485AA2485A48C7FC120E5A5A5A5A5A13527C
+BD20>I<EB0380497EA7397803803C00FC147E00FE14FE397F8383FC393FC387F8390FE3
+8FE03903FBBF803900FFFE00EB3FF8EB0FE0A2EB3FF8EBFFFE3903FBBF80390FE38FE039
+3FC387F8397F8383FC39FE0380FE00FC147E0078143C390007C000A76D5A1F247BBD2A>
+I<121C127FEAFF80A213C0A3127F121C1200A412011380A2120313005A1206120E5A5A5A
+12600A19798817>44 D<B512FCA516057F941C>I<121C127FEAFF80A5EA7F00121C0909
+798817>I<150C151E153EA2153C157CA2157815F8A215F01401A215E01403A215C01407
+A21580140FA215005CA2141E143EA2143C147CA2147814F8A25C1301A25C1303A2495AA2
+5C130FA291C7FC5BA2131E133EA2133C137CA2137813F8A25B1201A25B1203A25B1207A2
+5B120FA290C8FC5AA2121E123EA2123C127CA2127812F8A25A12601F537BBD2A>I<EB03
+F8EB1FFF90387E0FC09038F803E03901E000F0484813780007147C48487FA248C77EA248
+1580A3007EEC0FC0A600FE15E0B3007E15C0A4007F141F6C1580A36C15006D5B000F143E
+A26C6C5B6C6C5B6C6C485A6C6C485A90387E0FC0D91FFFC7FCEB03F8233A7DB72A>I<EB
+01C013031307131F13FFB5FCA2131F1200B3B3A8497E007FB512F0A31C3879B72A>I<EB
+0FF0EB7FFE48B57E3903E03FE0390F000FF0000E6D7E486D7E486D7E123000706D7E1260
+12FCB4EC7F807FA56CC7FC121CC8FCEDFF00A34A5A5D14035D4A5A5D140F4A5A4A5A92C7
+FC147C5C495A495A495A495A91C8FC011EEB01805B5B49130348481400485A485A000EC7
+5A000FB6FC5A5A485CB6FCA321387CB72A>I<EB07F8EB3FFF4913C03901F80FF03903C0
+07F848486C7E380E0001000F80381FE0006D7FA56C5A6C5AC85A1401A25D4A5AA24A5A5D
+EC0F80027EC7FCEB1FFCECFF809038000FE06E7EEC01FC816E7EED7F80A216C0A2153F16
+E0A2121EEA7F80487EA416C049137F007F1580007EC7FC0070ECFF006C495A121E390F80
+03F83907F00FF00001B512C06C6C90C7FCEB0FF8233A7DB72A>I<0006140CD80780133C
+9038F003F890B5FC5D5D158092C7FC14FC38067FE090C9FCABEB07F8EB3FFE9038780F80
+3907E007E090388003F0496C7E12066E7EC87EA28181A21680A4123E127F487EA490C713
+00485C12E000605C12700030495A00385C6C1303001E495A6C6C485A3907E03F800001B5
+C7FC38007FFCEB1FE0213A7CB72A>53 D<EC3FC0903801FFF0010713FC90380FE03E9038
+3F800790387E001F49EB3F804848137F485AA2485A000FEC3F0049131E001F91C7FCA248
+5AA3127F90C9FCEB01FC903807FF8039FF1E07E090383801F0496C7E01607F01E0137E49
+7FA249148016C0151FA290C713E0A57EA56C7E16C0A2121FED3F807F000F15006C6C5B15
+FE6C6C5B6C6C485A3900FE07F090383FFFC06D90C7FCEB03FC233A7DB72A>I<12301238
+123E003FB612E0A316C05A168016000070C712060060140E5D151800E01438485C5D5DC7
+12014A5A92C7FC5C140E140C141C5CA25CA214F0495AA21303A25C1307A2130FA3495AA3
+133FA5137FA96DC8FC131E233B7BB82A>I<121C127FEAFF80A5EA7F00121CC7FCB2121C
+127FEAFF80A5EA7F00121C092479A317>58 D<121C127FEAFF80A5EA7F00121CC7FCA812
+1CAB123EAB127FABEAFF80A8EA7F00121C093C79A917>60 D<007FB812F8B912FCA26C17
+F8CCFCAE007FB812F8B912FCA26C17F836167B9F41>I<130EEB3F80497EA56D5A010EC7
+FC90C8FCA81306A4130E130CA6131CA35BA213785BA21201485A1207485A485A123F48C8
+FCA200FE14F0EC01F8EC03FCA41401EC00F8007E1438007F14706C14E0391F8003C0390F
+C01F003803FFFC38007FE01E3B7CA927>I<1538A3157CA315FEA34A7EA34A6C7EA20207
+7FEC063FA2020E7FEC0C1FA2021C7FEC180FA202387FEC3007A202707FEC6003A202C07F
+1501A2D901807F81A249C77F167FA20106810107B6FCA24981010CC7121FA2496E7EA349
+6E7EA3496E7EA213E0707E1201486C81D80FFC02071380B56C90B512FEA3373C7DBB3E>
+65 D<B712E016FC16FF0001903980007FC06C90C7EA1FE0707E707E707EA2707EA283A7
+5F16035F4C5A4C5A4C5A4C5AEEFF8091B500FCC7FCA291C7EA7F80EE1FE0EE07F0707E70
+7E83707EA21880177F18C0A7188017FFA24C13005F16034C5AEE1FF8486DEB7FF0B812C0
+94C7FC16F832397DB83B>I<913A01FF800180020FEBE003027F13F8903A01FF807E0790
+3A03FC000F0FD90FF0EB039F4948EB01DFD93F80EB00FF49C8127F01FE153F1201484815
+1F4848150FA248481507A2485A1703123F5B007F1601A35B00FF93C7FCAD127F6DED0180
+A3123F7F001F160318006C7E5F6C7E17066C6C150E6C6C5D00001618017F15386D6C5CD9
+1FE05C6D6CEB03C0D903FCEB0F80902701FF803FC7FC9039007FFFFC020F13F002011380
+313D7BBA3C>I<B712C016F816FE000190398001FF806C90C7EA3FE0EE0FF0EE03F8707E
+707E177FA2EF3F8018C0171F18E0170F18F0A3EF07F8A418FCAC18F8A4EF0FF0A218E0A2
+171F18C0EF3F80A2EF7F0017FE4C5A4C5AEE0FF0EE3FE0486DEBFF80B8C7FC16F816C036
+397DB83F>I<B812F8A30001903880001F6C90C71201EE00FC177C173C171CA2170CA417
+0E1706A2ED0180A21700A41503A21507151F91B5FCA3EC001F15071503A21501A692C8FC
+AD4813C0B612C0A32F397DB836>70 D<B648B512FEA30001902680000313006C90C76C5A
+B3A491B6FCA391C71201B3A6486D497EB648B512FEA337397DB83E>72
+D<B612C0A3C6EBC0006D5AB3B3AD497EB612C0A31A397EB81E>I<013FB512E0A3903900
+1FFC00EC07F8B3B3A3123FEA7F80EAFFC0A44A5A1380D87F005B0070131F6C5C6C495A6C
+49C7FC380781FC3801FFF038007F80233B7DB82B>I<B612E0A3000101C0C8FC6C90C9FC
+B3AD1718A517381730A31770A317F0A216011603160FEE1FE0486D13FFB8FCA32D397DB8
+34>76 D<B5933807FFF86E5DA20001F0FC002600DFC0ED1BF8A2D9CFE01533A3D9C7F015
+63A3D9C3F815C3A2D9C1FCEC0183A3D9C0FEEC0303A2027F1406A36E6C130CA36E6C1318
+A26E6C1330A36E6C1360A26E6C13C0A3913901FC0180A3913900FE0300A2ED7F06A3ED3F
+8CA2ED1FD8A3ED0FF0A3486C6D5A487ED80FFC6D48497EB500C00203B512F8A2ED018045
+397DB84C>I<B5913807FFFE8080C69238007FE06EEC1F80D9DFF0EC0F001706EBCFF8EB
+C7FCA2EBC3FEEBC1FFA201C07F6E7EA26E7E6E7E81140F6E7E8114036E7E168080ED7FC0
+16E0153FED1FF0ED0FF8A2ED07FCED03FEA2ED01FF6F1386A2EE7FC6EE3FE6A2EE1FF6EE
+0FFEA216071603A216011600A2177E486C153E487ED80FFC151EB500C0140EA217063739
+7DB83E>I<B612FEEDFFE016F8000190388007FE6C90C76C7EEE3FC0707E707E707EA270
+7EA283A65FA24C5AA24C5A4C5AEE3F8004FFC8FCED07FC91B512E05E9138000FF0ED03F8
+ED00FE82707E707EA2161F83A583A6F00180A217F8160F1803486D01071400B66D6C5A04
+011306933800FE0ECAEA3FFCEF07F0393B7DB83D>82 D<D90FF813C090383FFE0190B512
+813903F807E33907E000F74848137F4848133F48C7121F003E140F007E1407A2007C1403
+12FC1501A36C1400A37E6D14006C7E7F13F86CB47E6C13F8ECFF806C14E06C14F86C14FE
+C680013F1480010714C0EB007F020713E0EC007FED3FF0151F150FED07F8A200C01403A2
+1501A37EA216F07E15036C15E06C14076C15C06C140F6DEB1F80D8FBF0EB3F00D8F0FE13
+FE39E03FFFF8010F13E0D8C00190C7FC253D7CBA2E>I<003FB812E0A3D9C003EB001F27
+3E0001FE130348EE01F00078160000701770A300601730A400E01738481718A4C71600B3
+B0913807FF80011FB612E0A335397DB83C>I<B5D8FC07B5D8F001B5FCA3000790278000
+1FFEC7EA1FF86C48C7D80FF8EC07E000010307ED03C01B807F6C6F6C1500A26E5F017F6E
+6C1406A280013F4A6C5CA280011F4A6D5BEE067FA26D6C010E6D5BEE0C3FA26D6C011C6D
+5BEE181FA26D6C6F5BEE300FA26D6C6F485AEE6007A26D6C4CC7FC9338C003FCA203805D
+913B7F818001FE06A203C1150EDA3FC3C7EAFF0CA203E3151CDA1FE6EC7F98A215F6DA0F
+FCEC3FF0A302075E4B141FA202035E4B140FA202015E4B1407A2020093C8FC4B80503B7E
+B855>87 D<EB1FE0EBFFFC3803E03F3907000F80390F8007E0486C6C7E13E06E7EA26E7E
+6C5A6C5AC8FCA4147FEB07FFEB3FE0EBFE00EA03F8EA0FF0EA1FC0123F485A90C7FC160C
+12FEA31401A26C13036CEB077C903980063E18383FC01E3A0FE0781FF03A03FFF00FE03A
+007F8007C026277DA52A>97 D<EA03F012FFA3120F1203B0EC1FE0EC7FF89038F1E03E90
+39F3801F809039F7000FC001FEEB07E049EB03F049EB01F85BED00FCA216FEA2167E167F
+AA167E16FEA216FC15016D14F8ED03F07F01EEEB07E001C6EB0FC09039C7801F00903881
+E07E903800FFF8C7EA1FC0283B7EB92E>I<EB03FC90381FFF8090387E03E03901F80070
+484813F83907E001FC380FC003A2EA1F80123F90380001F848EB00F01500A2127E12FEAA
+127E127FA26C14067F001F140E6D130C000F141C6C6C13386C6C13706C6C13E039007C07
+C090381FFF00EB07F81F277DA525>I<ED0FC0EC03FFA3EC003F150FB0EB03F8EB1FFF90
+387E078F9038F801EF3903F0007F4848133F4848131FA24848130F123F90C7FC5AA2127E
+12FEAA127E127FA27EA26C6C131FA26C6C133F6C6C137F6C6CEBEFF03A01F801CFFF3900
+7C078F90381FFE0FD907F813C0283B7DB92E>I<EB07F8EB1FFF90387C0FC03901F803E0
+3903F001F0D807E013F8380FC0004848137CA248C7127E153E5A153F127E12FEA3B7FCA2
+48C8FCA5127EA2127FA26C14037F001F14076C6C13060007140E6D131CD801F013386C6C
+137090387E03E090381FFF80903803FC0020277EA525>I<147E903803FF8090380FC1E0
+EB1F8790383F0FF0137EA213FCA23901F803C091C7FCADB512FCA3D801F8C7FCB3AB487E
+387FFFF8A31C3B7FBA19>I<ED03F090390FF00FF890393FFC3C3C9039F81F707C3901F0
+0FE03903E007C03A07C003E010000FECF000A248486C7EA86C6C485AA200075C6C6C485A
+6D485A6D48C7FC38073FFC38060FF0000EC9FCA4120FA213C06CB512C015F86C14FE6CEC
+FF804815C03A0F80007FE048C7EA0FF0003E140348140116F8481400A56C1401007C15F0
+6CEC03E0003F1407D80F80EB0F80D807E0EB3F003901FC01FC39007FFFF0010790C7FC26
+387EA52A>I<EA03F012FFA3120F1203B0EC0FF0EC3FFCECF03F9039F1C01F809039F380
+0FC0EBF70013FE496D7EA25BA35BB3A3486C497EB500C1B51280A3293A7EB92E>I<EA03
+80EA0FE0487EA56C5AEA0380C8FCAAEA03F012FFA312071203B3AA487EB512C0A312387E
+B717>I<EA03F012FFA3120F1203B3B3AD487EB512C0A3123A7EB917>108
+D<2703F00FF0EB1FE000FFD93FFCEB7FF8913AF03F01E07E903BF1C01F83803F3D0FF380
+0FC7001F802603F70013CE01FE14DC49D907F8EB0FC0A2495CA3495CB3A3486C496CEB1F
+E0B500C1B50083B5FCA340257EA445>I<3903F00FF000FFEB3FFCECF03F9039F1C01F80
+3A0FF3800FC03803F70013FE496D7EA25BA35BB3A3486C497EB500C1B51280A329257EA4
+2E>I<EB03FE90380FFF8090383E03E09038F800F84848137C48487F48487F4848EB0F80
+001F15C090C712074815E0A2007EEC03F0A400FE15F8A9007E15F0A2007F14076C15E0A2
+6C6CEB0FC0000F15806D131F6C6CEB3F006C6C137EC66C13F890387E03F090381FFFC0D9
+03FEC7FC25277EA52A>I<3903F01FE000FFEB7FF89038F1E07E9039F3801F803A0FF700
+0FC0D803FEEB07E049EB03F04914F849130116FC150016FEA3167FAA16FEA3ED01FCA26D
+EB03F816F06D13076DEB0FE001F614C09039F7803F009038F1E07E9038F0FFF8EC1FC091
+C8FCAB487EB512C0A328357EA42E>I<D903F813C090381FFE0190387E07819038FC01C3
+3903F000E3000714774848133749133F001F141F485A150F48C7FCA312FEAA127FA37E6D
+131F121F6D133F120F6C6C137F6C6C13EF3901F801CF39007E078F90381FFE0FEB07F890
+C7FCABED1FE00203B5FCA328357DA42C>I<3807E01F00FFEB7FC09038E1E3E09038E387
+F0380FE707EA03E613EE9038EC03E09038FC0080491300A45BB3A2487EB512F0A31C257E
+A421>I<EBFF03000313E7380F80FF381E003F487F487F00707F12F0A2807EA27EB490C7
+FCEA7FE013FF6C13E06C13F86C7F00037FC67F01071380EB007F141F00C0EB0FC01407A2
+6C1303A37E15806C13077EEC0F00B4131E38F3C07C38E1FFF038C03F801A277DA521>I<
+1318A51338A31378A313F8120112031207001FB5FCB6FCA2D801F8C7FCB215C0A93800FC
+011580EB7C03017E13006D5AEB0FFEEB01F81A347FB220>I<D803F0EB07E000FFEB01FF
+A3000FEB001F00031407B3A4150FA3151F12016D133F0000EC77F86D9038E7FF8090383F
+03C790381FFF87903A03FC07E00029267EA42E>I<B538803FFEA33A0FF8000FF06C48EB
+07E00003EC03C06D148000011500A26C6C1306A26D130E017E130CA26D5BA2EC8038011F
+1330A26D6C5AA214E001075BA2903803F180A3D901FBC7FCA214FF6D5AA2147CA31438A2
+27257EA32C>I<B53A1FFFE03FFEA3260FF8009038000FF86C48017EEB03E018C0000302
+3EEB0180A26C6C013FEB0300A36C6CEC8006156FA2017E9038EFC00C15C7A2D93F016D5A
+15830281EBF038D91F831430150102C3EBF87090260FC6001360A2D907E66D5A02EC137C
+A2D903FCEB7F804A133FA2010192C7FC4A7FA20100141E4A130E0260130C37257EA33C>
+I<B538807FFFA33A03FE003FF00001EC1F80000092C7FC017E131C6D13186D6C5AECC070
+010F5B6D6C5AECF180EB03FB6DB4C8FC6D5AA2147F804A7E8114CF903801C7E090380383
+F090380703F8EB0601496C7E011C137E49137F01787F496D7E486C80000FEC3FF0D8FFFE
+90B51280A329247FA32C>I<B538803FFEA33A0FF8000FF06C48EB07C00003EC03806C7E
+16007F00001406A2017E5BA2137F6D5BA26D6C5AA2ECC070010F1360A26D6C5AA214F101
+035BA2D901FBC7FCA214FF6D5AA2147CA31438A21430A214701460A25CA2EA7C0100FE5B
+130391C8FC1306EAFC0EEA701C6C5AEA1FF0EA0FC027357EA32C>I<003FB512FCA2EB80
+03D83E0013F8003CEB07F00038EB0FE012300070EB1FC0EC3F800060137F150014FE495A
+A2C6485A495AA2495A495A495AA290387F000613FEA2485A485A0007140E5B4848130C48
+48131CA24848133C48C7127C48EB03FC90B5FCA21F247EA325>I
+E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fg cmbx12 14.4 14
+/Fg 14 117 df<157815FC14031407141F14FF130F0007B5FCB6FCA2147F13F0EAF800C7
+FCB3B3B3A6007FB712FEA52F4E76CD43>49 D<EC3FFE0103B512E0010F14FC013F14FF90
+B712C048D9C07F7F2703FE000F13F8D807F801037FD80FE06D7F48486D7F48488001F016
+80486C6E13C07F486C6E13E07FA27013F0A56C5AA26C5AEA0FF0EA03C0C914E05EA218C0
+5E1880A24C13005F4C5A4B5B5F4B5B5F4B5B4B90C7FC4B5A5E4B5AED7FE04B5A4A5B4A48
+C8FC4A5A5D4A48EB01F04A5AEC3F804AC7FC02FEEC03E0495A495A495A495AD91F801407
+49C8FC013E150F017FB7FC90B812C05A5A5A5A5A5A5AB9FC1880A4344E79CD43>I<BB12
+FEA5D8000701F8C700077FF0007F191F190785858586861B80A21A1FA31A0FA41BC006F8
+1307A497C7FCA31701A317031707170F177F92B6FCA59238F8007F170F170717031701A3
+1700A795C9FCB3B812F8A54A517CD055>70 D<B700FC017FB600FE91B612F0A5D8003F01
+C0C8001F01E0C9EBF8006F71EE0FC06D7161876F1C1F6D7196C7FC6F8373606D1E3E6F83
+6D7160876F1CFC6D666F4B801F016D66704A806E525A88704A17076E059F5F70021F8008
+0F160F6E6570023F806EDC3E074CC8FC8870027E5F6EDC7C03163E7002FC804F6C167E6E
+1C7C700101814F6C16FC6E745B70010317016E4C6D5D060716C00580496D14036F63DDC0
+0F16E04F6D14076F07F05BDDE01F170F6F92C76C5D1DF8DDF03E6E141F6F98C9FCDDF87E
+16FC067C6E5C6FF1FE3EDDFCFC177E6F4A6E147C1DFFDDFFF06E14FC6F62A24E816F62A2
+70496F5BA24E817061A295C97E7061A270487090CAFCA37048705AA24D1601040360A270
+48705A84537DD18B>87 D<EC7FFF0107B512F0013F14FE90B77E48D9E00F7F2703FE0001
+13F0486C6D7F6EEB3FFC48826E131F83707FA36C496D7FA26C90C7FC6C5AC9FCA6037FB5
+FC020FB6FC91B7FC01071487013FEBF0074913803901FFFC004813F0485B485B485B4890
+C7FC5A5BA2485AA45EA26D5C007F151D163D6C6C02797F6C6D01F113F86C9026C003E1EB
+FFE06C9026F81FC014F06C90B5487EC6ED001F011F01FC010713E0010101E090C8FC3C38
+7CB641>97 D<913803FFC0023F13FC49B6FC010715C04901817F903A3FFC007FF849486D
+7E49486D7E4849130F48496D7E48178048497F18C0488191C7FC4817E0A248815B18F0A2
+12FFA490B8FCA318E049CAFCA6127FA27F7EA218E06CEE01F06E14037E6C6DEC07E0A26C
+6DEC0FC06C6D141F6C6DEC3F806D6CECFF00D91FFEEB03FE903A0FFFC03FF8010390B55A
+010015C0021F49C7FC020113F034387CB63D>101 D<DA3FFF14FF0103B5D8F00713C001
+0FDAFC1F13E0013FECFF7F90267FFC0F9038FF9FF09026FFE001EBF83F48496C13E04849
+90387FF01F4890C7D83FF813E0489338FC0FC0F0078048486E6CC7FCA2003F82A9001F5E
+A26C6C4A5AA26C5E6C6D495A6C6D495A6C6D485BDAFC0F5B4890B6C8FCD803EF14FC01C3
+14F02607C03F90C9FC91CBFCA2120FA37FA213F813FE90B7FC6C16F817FF18C06C836C83
+6C836D828448B9FC12074848C700031480D81FF8EC003F4848150748486F13C083485A83
+A56D5D007F18806D5D003F18006C6C4B5AD80FFEED1FFC6C6C6CEC7FF86C01E049485A6C
+01FE011F5B6C6CB71280010F03FCC7FC010115E0D9000F01FCC8FC3C4F7CB543>103
+D<EB3FF0B5FCA51203C6FCB3A4EE1FFC93B512C0030314F0030F8092391FE07FFC92393F
+001FFE037C8003F07FDAF1E081ECF3C0DAF7807F8502FFC7FC5CA25CA45CB3ACB6D8F807
+B612C0A542537BD24B>I<137F497E000313E0487FA2487FA76C5BA26C5BC613806DC7FC
+90C8FCADEB3FF0B5FCA512017EB3B3A6B612E0A51B547BD325>I<EB3FF0B5FCA512017E
+B3B3B3B1B612F0A51C537BD225>108 D<D93FF0EB1FFCB591B512C0030314F0030F8092
+391FE07FFC92393F001FFE0003027C80C602F07FDAF1E081ECF3C0DAF7807F8502FFC7FC
+5CA25CA45CB3ACB6D8F807B612C0A542367BB54B>110 D<90397FE003FEB590380FFF80
+033F13E04B13F09238FE1FF89139E1F83FFC0003D9E3E013FEC6ECC07FECE78014EF1500
+14EE02FEEB3FFC5CEE1FF8EE0FF04A90C7FCA55CB3AAB612FCA52F367CB537>114
+D<903903FFF00F013FEBFE1F90B7FC120348EB003FD80FF81307D81FE0130148487F4980
+127F90C87EA24881A27FA27F01F091C7FC13FCEBFFC06C13FF15F86C14FF16C06C15F06C
+816C816C81C681013F1580010F15C01300020714E0EC003F030713F015010078EC007F00
+F8153F161F7E160FA27E17E07E6D141F17C07F6DEC3F8001F8EC7F0001FEEB01FE9039FF
+C00FFC6DB55AD8FC1F14E0D8F807148048C601F8C7FC2C387CB635>I<143EA6147EA414
+FEA21301A313031307A2130F131F133F13FF5A000F90B6FCB8FCA426003FFEC8FCB3A9EE
+07C0AB011FEC0F8080A26DEC1F0015806DEBC03E6DEBF0FC6DEBFFF86D6C5B021F5B0203
+13802A4D7ECB34>I E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fh cmr9 9 26
+/Fh 26 122 df<123C127EB4FCA21380A2127F123D1201A412031300A25A1206120E120C
+121C5A5A126009177A8715>44 D<B512F0A514057F921A>I<123C127E12FFA4127E123C
+08087A8715>I<EB0FE0EB7FFCEBF83E3903E00F803907C007C0EB8003000F14E0391F00
+01F0A24814F8A2003E1300007E14FCA500FE14FEB2007E14FCA56CEB01F8A36C14F0A239
+0F8003E03907C007C0A23903E00F803900F83E00EB7FFCEB0FE01F347DB126>48
+D<EB3FC0EBFFF0000313FC380F80FF391E007F80001CEB3FC048EB1FE048130F15F00060
+130712FC6C14F87E1403A3007E1307123CC7FC15F0A2140F15E0EC1FC0A2EC3F80150014
+7E5C495A5C495A495A495A49C7FC133E133C4913185B485A48481330485A48C7FC001C14
+70001FB512F05A5AB612E0A31D327CB126>50 D<DA03FE130C91393FFF801C91B512E090
+3A03FE01F83C903A0FF0003C7CD91FC0EB0EFCD97F80130701FEC7120348481401000315
+005B4848157C485A173C485A171C123F5B007F160CA390C9FC4893C7FCAA0303B512E07E
+7F92390003FE00705A123F7F121FA26C7E7F12076C7E7F6C6C14036C7E6D6C1307D91FC0
+EB0E7CD90FF0EB1C3CD903FEEBF81C0100B5EAF00C023F01C0C7FCDA03FEC8FC33377CB4
+3C>71 D<EB7F803803FFF0380F80FC381C003E003F133F6D6C7E6E7EA26E7EEA1F00C7FC
+A4EB01FF131FEBFF873803FC07EA0FF0EA1FC0EA3F80127F13004815C05AA3140FA26C13
+1F6C133B3A3F8071F180391FC1E1FF2607FFC013003900FE003C22237DA126>97
+D<EA03F012FFA312071203AEEC3F80ECFFE09038F3C0F89038F7007E01FE7F49EB1F8049
+EB0FC05BED07E016F0A2150316F8AA16F0150716E0A2ED0FC07F6DEB1F8001ECEB3F0001
+CF137C90388381F8903801FFE0C76CC7FC25357EB32B>I<EB07F8EB3FFF9038FC07C039
+01F000E03903E003F03807C007120FEA1F80123F90380003E04890C7FCA2127E12FEAA12
+7FA26C14187F001F14386D1330000F14706C6C13E03903F001C03900FC0F8090383FFE00
+EB07F01D237EA122>I<153FEC0FFFA3EC007F81AEEB07F0EB3FFCEBFC0F3901F003BF39
+07E001FF48487E48487F8148C7FCA25A127E12FEAA127E127FA27E6C6C5BA26C6C5B6C6C
+4813803A03F007BFFC3900F81E3FEB3FFCD90FE0130026357DB32B>I<EB0FE0EB7FFCEB
+F83F3903F00F80D807E013C0390FC007E0381F800315F0EA3F0014014814F8127EA212FE
+A2B6FCA248C8FCA5127E127FA26C1418A26C6C1338000F14306D13706C6C13E03901F003
+C03900FC0F00EB3FFEEB07F01D237EA122>I<EB01FCEB07FF90381F078090383E0FC0EB
+7C1F13FCEA01F8A20003EB070049C7FCACB512F0A3D803F0C7FCB3A7487E387FFFE0A31A
+357FB417>I<151F90391FC07F809039FFF8E3C03901F07FC73907E03F033A0FC01F8380
+9039800F8000001F80EB00074880A66C5CEB800F000F5CEBC01F6C6C48C7FCEBF07C380E
+FFF8380C1FC0001CC9FCA3121EA2121F380FFFFEECFFC06C14F06C14FC4880381F000100
+3EEB007F4880ED1F8048140FA56C141F007C15006C143E6C5C390FC001F83903F007E0C6
+B51280D91FFCC7FC22337EA126>I<EA03F012FFA312071203AEEC1FC0EC7FF09038F1E0
+FC9038F3807C9038F7007E13FE497FA25BA25BB3486CEB7F80B538C7FFFCA326347EB32B
+>I<EA0780EA0FC0EA1FE0A4EA0FC0EA0780C7FCAAEA07E012FFA3120F1207B3A6EA0FF0
+B5FCA310337EB215>I<EA07E012FFA3120F1207B3B3A7EA0FF0B5FCA310347EB315>108
+D<2703F01FE013FF00FF90267FF80313C0903BF1E07C0F03E0903BF3803E1C01F02807F7
+003F387FD803FE1470496D486C7EA2495CA2495CB3486C496C487EB53BC7FFFE3FFFF0A3
+3C217EA041>I<3903F01FC000FFEB7FF09038F1E0FC9038F3807C3907F7007EEA03FE49
+7FA25BA25BB3486CEB7F80B538C7FFFCA326217EA02B>I<EB07F0EB3FFE9038FC1F8039
+01F007C03903C001E000078048486C7E48C7127CA248147E003E143E007E143FA300FE15
+80A8007E1500A36C147EA26C147C6D13FC6C6C485A00075C3903F007E03900FC1F80D93F
+FEC7FCEB07F021237EA126>I<3903F03F8000FFEBFFE09038F3C0F89038F7007ED807FE
+7F6C48EB1F804914C049130F16E0ED07F0A3ED03F8A9150716F0A216E0150F16C06D131F
+6DEB3F80160001FF13FC9038F381F89038F1FFE0D9F07FC7FC91C8FCAA487EB512C0A325
+307EA02B>I<3803E07C38FFE1FF9038E38F809038E71FC0EA07EEEA03ECA29038FC0F80
+49C7FCA35BB2487EB512E0A31A217FA01E>114 D<EBFF06000713CE381F00FE003C133E
+48131E140E5A1406A27EA200FE90C7FC6C7EEA7FFC383FFFC014F0000F7F6C7FC67FEB0F
+FF1300EC3F8000C0131F140F6C1307A37E15006C5B6C130E6C5B38F7807838E1FFE038C0
+7F8019237EA11E>I<1330A51370A313F0A21201A212031207381FFFFEB5FCA23803F000
+AF1403A814073801F806A23800FC0EEB7E1CEB1FF8EB07E0182F7FAD1E>I<B5EBFFF0A3
+D80FF0EB3F800007EC1F000003140E150C6D131C00011418A26C6C5BA26D1370017E1360
+137F6D5BA290381F8180A214C3010F90C7FCA2EB07E6A214FE6D5AA26D5AA36D5AA21460
+24217E9F29>118 D<B53A1FFF81FFF0A33C07F801FC003F8001F049EB1E000003010014
+1C816C6C017C1318A26D017E1338000002FE1330A290267E01FF5B159F168090263F030F
+5BA216C0903A1F8607C180A202C613E390260FCC0390C7FCA2D907FC13F6ECF80116FE6D
+486C5AA36D481378A36D48133034217F9F37>I<3A7FFF807FF8A33A07F8001FC00003EC
+0F800001EC070015066C6C5BA26D131C017E1318A26D5BA2EC8070011F1360ECC0E0010F
+5BA2903807E180A214F3010390C7FC14FBEB01FEA26D5AA31478A21430A25CA214E05CA2
+495A1278D8FC03C8FCA21306130EEA701CEA7838EA1FF0EA0FC025307F9F29>121
+D E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fi cmbx9 9 7
+/Fi 7 117 df<ED1F80A24B7EA24B7EA34B7EA24A7FA34A7FA24A7F15CFA2020F7F1587
+021F801503023F80EC3E01A2027E80EC7C0002FC804A137FA20101814A133F0103814A13
+1FA249B67EA24981A290271F8000077F91C77EA24982013E80017E82017C80A201FC8249
+157FB500F0013FB512F0A43C347DB343>65 D<EB7FFE0003B512E04814F8390FF00FFC39
+1FF803FF806E138016C0157F6C5A6C5AEA0180C8FCEC7FFF010FB5FC90B6FC0003EBF07F
+000F1300EA1FF8485A485A485A5BA315FF7F007F5B6D4813E03A3FF80FBFFF000FB5121F
+0003EBFC0F39007FE00728217EA02B>97 D<EA01FC12FFA4120F1207ADEC07FC91387FFF
+8001FDB512E09039FFF00FF89138C007FC91380003FE496D7E496D1380A217C0167FA217
+E0A917C0A216FF1780A26D4913006D495A9138C007FC9039F3F01FF801E1B512E0D9C07F
+13809026800FF8C7FC2B347EB331>I<903807FF80013F13F090B512FC3903FE01FE4848
+487EEA0FF8EA1FF0EA3FE0A2007F6D5A496C5A153000FF91C7FCA9127F7FA2003FEC0780
+7F6C6C130F000FEC1F00D807FE133E3903FF80FCC6EBFFF8013F13E0010790C7FC21217D
+A027>I<3901F81F8000FFEB7FF0ECFFF89038F9E3FC9038FBC7FE380FFF876C1307A213
+FEEC03FCEC01F8EC0060491300B1B512F0A41F217EA024>114 D<9038FFE1C0000713FF
+5A383F803F387E000F14075A14037EA26C6CC7FC13FCEBFFE06C13FC806CEBFF80000F14
+C06C14E0C6FC010F13F0EB007F140F00F0130714037EA26C14E06C13076CEB0FC09038C0
+1F8090B5120000F913FC38E03FE01C217DA023>I<133CA5137CA313FCA21201A2120312
+07001FB51280B6FCA3D807FCC7FCB0EC03C0A79038FE078012033901FF0F006C13FEEB3F
+FCEB0FF01A2F7EAE22>I E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fj cmr12 12 19
+/Fj 19 123 df<121EEA7F8012FF13C0A213E0A3127FEA1E601200A413E013C0A3120113
+80120313005A1206120E5A5A5A12600B1D78891B>44 D<121EEA7F80A2EAFFC0A4EA7F80
+A2EA1E000A0A78891B>46 D<14FF010713E090381F81F890383E007C01FC133F4848EB1F
+8049130F4848EB07C04848EB03E0A2000F15F0491301001F15F8A2003F15FCA390C8FC48
+15FEA54815FFB3A46C15FEA56D1301003F15FCA3001F15F8A26C6CEB03F0A36C6CEB07E0
+000315C06D130F6C6CEB1F806C6CEB3F00013E137C90381F81F8903807FFE0010090C7FC
+28447CC131>48 D<143014F013011303131F13FFB5FC13E713071200B3B3B0497E497E00
+7FB6FCA3204278C131>I<EB03FE90381FFFC0017F13F03901F80FFC3903C001FE48486C
+7E000EC7EA7F8048EC3FC0ED1FE04815F00030140F007015F800601407126CB415FC7F7F
+1503A46C4813076CC7FCC8FC16F8A2150F16F0151F16E0A2ED3FC0ED7F8016005D5D4A5A
+4A5A4A5A5D4A5A4A5A4AC7FC147C5C5C495A495A495A49C7120C131E5B013814185B5B48
+5A4848143848C81230000E1570001FB612F0A25A5AB712E0A326427BC131>I<121CA2EA
+1F8090B712C0A3481680A217005E0038C8120C0030151C00705D0060153016705E5E4814
+014B5A4BC7FCC81206150E5D151815385D156015E04A5AA24A5A140792C8FC5CA25C141E
+143EA2147E147CA214FCA21301A3495AA41307A6130FAA6D5AEB01C02A457BC231>55
+D<B8FC17E017FC00019039C00003FF6C6C4801007FEF3FC0717E717E717E841703841701
+84A760A21703601707604D5A4D5AEF7FC04DC7FCEE03FEEE3FF091B65A17FC0280C7B47E
+EF1FC0EF0FF0717E717E717E717E1980187F19C0A2183F19E0A8F07FC0A2198018FF4D13
+00A24D5AEF0FFC4D5AEF7FE048486C903803FFC0B9C7FC17FC17C03B447CC345>66
+D<010FB512FEA3D9000313806E130080B3B3AB123F487E487EA44A5A13801300006C495A
+00705C6C13076C5C6C495A6CEB1F802603E07FC7FC3800FFFCEB1FE027467BC332>74
+D<B712FCEEFFC017F800019039C0000FFC6C6C48EB01FF9338007F80EF1FE0170FEF07F0
+18F8EF03FCA218FE1701A218FFA718FEA2170318FCA2EF07F818F0EF0FE0EF1FC0EF7F80
+933801FE00EE0FFC91B612F017800280C9FCB3AA3801FFE0B612C0A338447CC342>80
+D<B60107B500F890380FFFFEA3000301E0D9001F90C813F06C0180DA0FFCED3FC091C86C
+48ED1F006C871C0E6D6C6E7E1C0CA26D6C6F5DA36EDA06FF1538011F1A30A26E020E6D14
+70010FDB0C7F1560A26E021C7F0107DB183F5DA2856D6CDA301F4A5AA36D6C4A6C6C49C7
+FCA36D6C4A6C6C1306A3DB80016E130E027FDA8003140CA2DBC00380023FDA00015CA203
+E081021F01066D5CA36E6C486E6C5AA36E6C486E6C5AA36F48EC1FE1020360A2DBFE7015
+F302010160020F90C8FCA2DBFFE015FB6E49EC07FEA36F486E5AA36FC86C5AA3031E6F5A
+A4030C16605F467EC364>87 D<EB07FC90383FFF809038F80FE03903C003F048C66C7E00
+0E6D7ED80FC0137E486C137F6D6D7EA36F7EA26C5AEA0380C8FCA4EC0FFF49B5FC90380F
+FE1FEB3FC0EBFF00EA03FC485A485A485A485A127F5B176048C7FCA3153FA36D137F007F
+14EF6D9038C7E0C0003F13013A1FE00783F13B07F81E03FF802701FFFC0113003A001FE0
+007C2B2E7CAC31>97 D<EB01FE903807FFC090381F03F090387E00FC49137E48487F485A
+4848EB1F80000F15C049130F121F484814E01507A2007F15F090C7FCA25AA390B6FCA290
+C9FCA67EA27FA2123F16306C7E1670000F15606D14E06C6C14C0000314016C6CEB03806C
+6CEB0700013E131E90381F80F8903803FFE0010090C7FC242E7DAC2B>101
+D<EA01E0EA07F8A2487EA46C5AA2EA01E0C8FCADEA01FC12FFA3120712031201B3B0487E
+B512F8A315437DC21C>105 D<EA01FC12FFA3120712031201B3B3B3A5487EB512F8A315
+457DC41C>108 D<3901FC01FE00FF903807FFC091381E07F091383801F8000701707F00
+03EBE0002601FDC07F5C01FF147F91C7FCA25BA35BB3A8486CECFF80B5D8F83F13FEA32F
+2C7DAB36>110 D<3903F803F000FFEB1FFCEC3C3EEC707F0007EBE0FF3803F9C000015B
+13FBEC007E153C01FF13005BA45BB3A748B4FCB512FEA3202C7DAB26>114
+D<D801FC147F00FFEC3FFFA300071401000380000181B3A85EA35DA212006D5B017E9038
+077F80017F010E13C06D011C13FE90380FC078903803FFF09026007F8013002F2D7DAB36
+>117 D<B539F001FFFCA3000790C7EA7FE06C48EC1F8000011600160E0000150C6D141C
+6D1418A26E1338013F1430A26D6C5BA26E13E0010F5CA26D6C485AA2ECF803010391C7FC
+A2903801FC06A2ECFE0E0100130CA2EC7F18A215B8EC3FB0A2EC1FE0A36E5AA26E5AA36E
+C8FCA21406A35CA25CA2123C007E5BB4FC5CA25CEAFE01387C0380D87007C9FCEA3C1EEA
+0FFCEA03F02E3F7EAA33>121 D<003FB612E0A29038C0003F90C713C0003CEC7F800038
+ECFF00A20030495A0070495AA24A5A0060495AA24A5A4A5AA2C7485A4AC7FC5B5C495A13
+075C495A131F4A1360495A495AA249C712C0485AA2485A485A1501485A48481303A24848
+EB07804848131F00FF14FF90B6FCA2232B7DAA2B>I E
+%EndDVIPSBitmapFont
+%DVIPSBitmapFont: Fk cmr17 17.28 14
+/Fk 14 117 df<170FA34D7EA24D7EA34D7EA34D7EA34C7F17DFA29338039FFC178FA293
+38070FFE1707040F7FEE0E03A2041E80EE1C01A2043C80EE3800A24C80187FA24C80183F
+A24B4880181F0303814C130FA203078193C71207A24B81030E80A24B8284A24B8284A24B
+82197F03F0824B153FA20201834B151FA202038392B8FCA24A83A292C91207020E8385A2
+4A8485023C84023882A20278840270177FA202F0844A173FA24948841A1FA24948841A0F
+A249CB7F1A074985865B496C85497E48486C4D7F000F01F8051F13F0B60407B612F0A45C
+657DE463>65 D<BB12FCA4C601FCC8120FD93FF89238007FFE011F171F190719031900A2
+1A7E1A3EA21A1EA21A1F86A486A6F20380A318E0A297C7FCA61701A417031707170F171F
+17FF91B7FCA402F8C7FC171F170F170717031701A41700A895C9FCB3A580133F90B57EB7
+12E0A4496279E156>70 D<B500FC041FB512F0A280A226003FFF0400EBFE006D6DEE3FF8
+F20FE0011D7F745A011C7F6E6C705AA26E7E81141F6E7EA26E7E82806E7FA26E7F6F7EA2
+6F7E82151F6F7EA26F7E83816F7FA26F7F707EA2707E83161F707EA2707E8482707FA270
+7F84177F717E84171F717EA2717E1980837113C0A27113E019F0187FF03FF819FC181FF0
+0FFEA2F007FF1A83847213C3A27213E31AF3197FF13FFB1AFF8585A285A28585A285133E
+1A7F017F183FA22601FFC0171F000701F0170FB67E1A07A21A03546279E163>78
+D<B812FCEFFFE018FCF0FF80C601FCC7000F13E0D93FF89138007FF8011FEE1FFCF007FF
+06017F727FF13FE0737E86737E737EA2868587A28587A96361A298C8FC6162624F5A191F
+4F5A4F5AF1FF804E90C9FCF007FEF01FF8F0FFE0050F138091B700FCCAFC18E08402F8C7
+EA1FFE943801FF80716C7EF03FF0727EF007FC727E85727F8486737EA3737EAA86AA1DE0
+86191FA3070F14017414C007071403496C8390B570EC0780B76F9038800F00736D5A9738
+3FF03E97380FFFFCCD000313F09738003FC05B6479E162>82 D<EC3FF0903803FFFE010F
+6D7E90393FC03FE090397E0007F801F86D7ED801E06D7E48486D7E48486E7E48C86C7E7F
+01F06E7E487E6D6E7EA3707EA36C5AEA03E0C9FCA6167FED7FFF020FB5FC91387FF80790
+3801FF80903807FC00EB1FF0EB7FC0495AD803FEC7FC485A120F5B485A485AA2484817E0
+A312FF5BA2160FA3161F6D141B007F153B16736D913971FC01C06C6C14E1001FEC01C1D8
+0FFC903A0780FE03806C6C903A0F00FF07002701FF807E6DB4FC27007FFFF86D5A011F01
+E0EB1FF8010190C7EA07E03B417ABF42>97 D<EC03FE91381FFFE091B512F8903901FE03
+FE903A07F0007F8049486D7ED93FC06D7E49C76C7E496E7E491403484881484814010007
+82491400000F8283485A1880123F49153FA2007F17C0A35BA212FF90B8FCA30180CAFCA9
+127F7FA3123FA27F121FEF01C06C7E17036C6C1680A26C6C15070001EE0F006D150E6C6C
+151E6D6C5C6D6C5C6D6C5CD907F0EB03E0D903FC495A902700FF803FC7FC91383FFFFC02
+0F13F00201138032417CBF3A>101 D<F03F80DA03FC903801FFE091273FFFC00713F091
+B539F01FC1F8903B03FC03FC3E03903A07F000FE784948EB7FE04948EB3FC04948011FEB
+01F049C76C6CC7FC01FE6E7EA248486E7EA2000382A2491401000782AA00035E6D1403A2
+00015EA26C6C4A5AA2017F4A5A6D6C495A6D6C495A496C49C8FCD937F013FE903973FC03
+FC0160B512F0D9E03F13C0DA03FCC9FC4848CBFCA57FA27FA27F6C7E13FF91B512FE6DEC
+FFF06D15FE6D6F7E6D16E084013F16FC01FEC700017FD803F8EC001FD807E0ED03FF4848
+030013804848167F003FEF3FC090CA121F127EF00FE012FE481707A66C170F007E18C0A2
+007F171F6C6CEE3F806C6CEE7F00000F177ED807F04B5A6C6C4B5A6C6C4B5AD8007FED1F
+C0D93FE0ECFF80D90FFED90FFEC7FC0101B612F0D9003F1480020101F0C8FC3D5E7DBF42
+>103 D<133C13FF487F487FA66C5B6C90C7FC133C90C8FCB3A2EB03C0EA07FF127FA412
+01EA007FA2133FB3B3AC497E497EB612E0A41B5F7DDE23>105 D<EB03C0EA07FFB5FCA4
+1201EA007FA2133FB3B3B3B3AD497E497EB612F0A41C647DE323>108
+D<D903C0D9FFC0EC07FED807FF010301F891381FFFC0B5010F01FE027F13F0923D3F00FF
+8001F807FC0378903B3FC003C001FEDAC1E090261FE00FC77E0001D9C3C090260FF01E6E
+7ED8007F49902607F81C6E7E02C7C75CD93FCE6E6C486E7E02CC166002DC16E002D85E02
+F8DA01FF6F7E4A5EA24A93C8FCA44A5DB3B3496C4A6C4B7E496C4A6D4A7EB6D8F007B6D8
+803FB512FCA4663F7CBE6F>I<D903C0EB7FE0D807FF903803FFFCB5010F13FFDB3F0013
+C00378EB1FE04B6D7E0001D9C1C06D7E27007FC3808002C7C71203D93FCE81170114DC14
+D802F86E7E5CA35CA35CB3B3496C4A7F496C4A7FB6D8F003B612C0A4423F7DBE49>I<90
+39078003F8D807FFEB0FFFB5013F13C092387C0FE0913881F01F9238E03FF00001EB8380
+39007F8700148FEB3F8E029CEB1FE0EE0FC00298EB030002B890C7FCA214B014F0A25CA5
+5CB3B0497EEBFFF8B612FCA42C3F7CBE33>114 D<9139FFE00180010FEBFC03017FEBFF
+073A01FF001FCFD803F8EB03EFD807E0EB01FF48487F4848147F48C8123F003E151F007E
+150F127CA200FC1507A316037EA27E7F6C7E6D91C7FC13F8EA3FFE381FFFF06CEBFF806C
+14F86C14FF6C15C06C6C14F0011F80010714FED9007F7F02031480DA003F13C015030300
+13E0167F00E0ED1FF0160F17F86C15071603A36C1501A37EA26C16F016037E17E06D1407
+6DEC0FC06D1580D8FDF0141FD8F8F8EC7F00013E14FC3AF01FC00FF80107B512E0D8E001
+148027C0003FF8C7FC2D417DBF34>I<1438A71478A414F8A31301A31303A21307130F13
+1FA2137F13FF1203000F90B6FCB8FCA3260007F8C8FCB3AE17E0AE6D6CEB01C0A316036D
+6C148016076D6C14006E6C5A91383FC01E91381FF07C6EB45A020313E09138007F802B59
+7FD733>I E
+%EndDVIPSBitmapFont
+end
+%%EndProlog
+%%BeginSetup
+%%Feature: *Resolution 600dpi
+TeXDict begin
+%%PaperSize: A4
+
+%%EndSetup
+%%Page: 1 1
+1 0 bop 1065 872 a Fk(Filtering)46 b(mRNA)e(signal)1375
+1166 y Fj(Brian)31 b(P)-8 b(.)33 b(W)-8 b(alenz)1359
+1414 y(Jan)m(uary)33 b(17,)f(2002)1554 1630 y Fi(Abstract)613
+1770 y Fh(Giv)n(en)24 b(signals)h(detected)e(b)n(y)g(c)n(haining)h
+(20-mers,)g(a)g(metho)r(d)e(is)i(presen)n(ted)f(for)498
+1861 y(deciding)j(whic)n(h)g(signals)h(p)r(oten)n(tially)f(con)n(tain)g
+(real)h(matc)n(hes.)291 2198 y Fg(1)134 b(What)45 b(is)h(a)f(signal)291
+2436 y Ff(Signal)35 b(has)h(three)g(v)-5 b(alues)35 b(asso)r(ciated)g
+(with)i(it.)63 b(The)36 b('co)n(v)n(ered',)h(the)f('matc)n(hed')g(and)
+291 2535 y(the)28 b('length'.)37 b(Co)n(v)n(ered)26 b(is)h(the)h(n)n
+(um)n(b)r(er)g(of)f(bases)g(in)h(the)g(mRNA)h(that)f(are)e(co)n(v)n
+(ered)g(b)n(y)291 2635 y(at)34 b(least)g(one)g(mer.)56
+b(Matc)n(hed)34 b(is)g(the)h(n)n(um)n(b)r(er)f(of)g(bases)g(matc)n(hed)
+g(\(n)n(um)n(b)r(er)g(of)g(mers)291 2734 y(*)29 b(size)h(of)g(a)g
+(mer\),)h(and)f(the)h(length)f(is)g(the)h(n)n(um)n(b)r(er)f(of)g(mers)g
+(in)g(the)h(mRNA)g(\(roughly)291 2834 y(equiv)-5 b(alen)n(t)22
+b(to)h(the)g(n)n(um)n(b)r(er)g(of)g(bases)f(in)h(the)h(mRNA)g(that)f
+(could)g(b)r(e)g(co)n(v)n(ered)e(b)n(y)i(a)f(mer,)291
+2934 y(but)28 b(easier)e(to)i(compute\).)291 3087 y(F)-7
+b(rom)27 b(these,)g(w)n(e)h(score)e(signals)g(using)i(t)n(w)n(o)e(v)-5
+b(alues,)28 b(co)n(v)n(erage)c(and)k(m)n(ultiplicit)n(y)-7
+b(.)291 3240 y(Co)n(v)n(erage)30 b(is)j(de\014ned)h(as)f(co)n(v)n(ered)
+e(/)i(length.)54 b(Multiplicit)n(y)34 b(is)g(de\014ned)f(as)g(matc)n
+(hed)g(/)291 3340 y(co)n(v)n(ered.)291 3677 y Fg(2)134
+b(Filter)291 3914 y Ff(Six)26 b(parameters)f(are)g(used:)37
+b(-l,)26 b(-h,)h(-v,)f(-m,)h(-mc)f(and)g(-ml,)h(corresp)r(onding)e(to)h
+Fe(L)p Ff(,)g Fe(H)7 b Ff(,)27 b Fe(V)19 b Ff(,)291 4014
+y Fe(M)9 b Ff(,)27 b Fe(M)512 4026 y Fd(c)545 4014 y
+Ff(,)h(and)f Fe(M)838 4026 y Fd(l)863 4014 y Ff(.)291
+4167 y(Default)h(v)-5 b(alues:)1606 4354 y Fe(L)22 b
+Ff(=)h(0)p Fe(:)p Ff(2)1587 4478 y Fe(H)29 b Ff(=)23
+b(0)p Fe(:)p Ff(6)1595 4603 y Fe(V)42 b Ff(=)23 b(0)p
+Fe(:)p Ff(7)1573 4727 y Fe(M)31 b Ff(=)23 b(0)p Fe(:)p
+Ff(3)1548 4852 y Fe(M)1629 4864 y Fd(c)1685 4852 y Ff(=)g(0)p
+Fe(:)p Ff(2)1556 4976 y Fe(M)1637 4988 y Fd(l)1685 4976
+y Ff(=)g(150)1702 5255 y(1)p eop
+%%Page: 2 2
+2 1 bop 739 203 a Ff(Jan)n(uary)26 b(17,)g(2002)396 b(Filtering)27
+b(mRNA)h(signal)524 b(Brian)26 b(W)-7 b(alenz)p 739 236
+2865 4 v 821 1849 a @beginspecial 0 @llx 0 @lly 433 @urx
+226 @ury 3240 @rwi @setspecial
+%%BeginDocument: mRNAfilt.eps
+%!PS-Adobe-2.0 EPSF-2.0
+%%Title: mRNAfilt.eps
+%%Creator: fig2dev Version 3.2 Patchlevel 0-beta3
+%%CreationDate: Thu Jan 17 16:15:18 2002
+%%For: walenz at dsc154p.celera.com (Brian Walenz,3604)
+%%Orientation: Portrait
+%%BoundingBox: 0 0 433 226
+%%Pages: 0
+%%BeginSetup
+%%EndSetup
+%%Magnification: 1.0000
+%%EndComments
+/$F2psDict 200 dict def
+$F2psDict begin
+$F2psDict /mtrx matrix put
+/col-1 {0 setgray} bind def
+/col0 {0.000 0.000 0.000 srgb} bind def
+/col1 {0.000 0.000 1.000 srgb} bind def
+/col2 {0.000 1.000 0.000 srgb} bind def
+/col3 {0.000 1.000 1.000 srgb} bind def
+/col4 {1.000 0.000 0.000 srgb} bind def
+/col5 {1.000 0.000 1.000 srgb} bind def
+/col6 {1.000 1.000 0.000 srgb} bind def
+/col7 {1.000 1.000 1.000 srgb} bind def
+/col8 {0.000 0.000 0.560 srgb} bind def
+/col9 {0.000 0.000 0.690 srgb} bind def
+/col10 {0.000 0.000 0.820 srgb} bind def
+/col11 {0.530 0.810 1.000 srgb} bind def
+/col12 {0.000 0.560 0.000 srgb} bind def
+/col13 {0.000 0.690 0.000 srgb} bind def
+/col14 {0.000 0.820 0.000 srgb} bind def
+/col15 {0.000 0.560 0.560 srgb} bind def
+/col16 {0.000 0.690 0.690 srgb} bind def
+/col17 {0.000 0.820 0.820 srgb} bind def
+/col18 {0.560 0.000 0.000 srgb} bind def
+/col19 {0.690 0.000 0.000 srgb} bind def
+/col20 {0.820 0.000 0.000 srgb} bind def
+/col21 {0.560 0.000 0.560 srgb} bind def
+/col22 {0.690 0.000 0.690 srgb} bind def
+/col23 {0.820 0.000 0.820 srgb} bind def
+/col24 {0.500 0.190 0.000 srgb} bind def
+/col25 {0.630 0.250 0.000 srgb} bind def
+/col26 {0.750 0.380 0.000 srgb} bind def
+/col27 {1.000 0.500 0.500 srgb} bind def
+/col28 {1.000 0.630 0.630 srgb} bind def
+/col29 {1.000 0.750 0.750 srgb} bind def
+/col30 {1.000 0.880 0.880 srgb} bind def
+/col31 {1.000 0.840 0.000 srgb} bind def
+
+end
+save
+-36.0 387.0 translate
+1 -1 scale
+
+/cp {closepath} bind def
+/ef {eofill} bind def
+/gr {grestore} bind def
+/gs {gsave} bind def
+/sa {save} bind def
+/rs {restore} bind def
+/l {lineto} bind def
+/m {moveto} bind def
+/rm {rmoveto} bind def
+/n {newpath} bind def
+/s {stroke} bind def
+/sh {show} bind def
+/slc {setlinecap} bind def
+/slj {setlinejoin} bind def
+/slw {setlinewidth} bind def
+/srgb {setrgbcolor} bind def
+/rot {rotate} bind def
+/sc {scale} bind def
+/sd {setdash} bind def
+/ff {findfont} bind def
+/sf {setfont} bind def
+/scf {scalefont} bind def
+/sw {stringwidth} bind def
+/tr {translate} bind def
+/tnt {dup dup currentrgbcolor
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add
+ 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb}
+ bind def
+/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
+ 4 -2 roll mul srgb} bind def
+/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
+/$F2psEnd {$F2psEnteredState restore end} def
+%%EndProlog
+
+$F2psBegin
+10 setmiterlimit
+n -1000 7450 m -1000 -1000 l 8812 -1000 l 8812 7450 l cp clip
+ 0.06000 0.06000 sc
+% Polyline
+7.500 slw
+n 7800 6000 m 900 6000 l gs col0 s gr
+% Polyline
+n 1800 6300 m 1800 6000 l gs col0 s gr
+% Polyline
+n 2400 6300 m 2400 6000 l gs col0 s gr
+% Polyline
+n 3600 6300 m 3600 6000 l gs col0 s gr
+% Polyline
+n 4200 6300 m 4200 6000 l gs col0 s gr
+% Polyline
+n 4800 6300 m 4800 6000 l gs col0 s gr
+% Polyline
+n 3000 6300 m 3000 6000 l gs col0 s gr
+% Polyline
+n 5400 6300 m 5400 6000 l gs col0 s gr
+% Polyline
+n 6000 6300 m 6000 6000 l gs col0 s gr
+% Polyline
+n 6600 6300 m 6600 6000 l gs col0 s gr
+% Polyline
+n 7200 6300 m 7200 6000 l gs col0 s gr
+% Polyline
+n 900 5400 m 1200 5400 l gs col0 s gr
+% Polyline
+n 900 4800 m 1200 4800 l gs col0 s gr
+% Polyline
+n 900 4200 m 1200 4200 l gs col0 s gr
+% Polyline
+n 900 3600 m 1200 3600 l gs col0 s gr
+% Polyline
+n 900 3000 m 1200 3000 l gs col0 s gr
+% Polyline
+30.000 slw
+n 1200 5400 m 2400 5400 l 4800 3600 l 7200 3600 l gs col0 s gr
+% Polyline
+7.500 slw
+ [15 60] 60 sd
+n 1200 3600 m 7800 3600 l gs col0 s gr [] 0 sd
+% Polyline
+ [15 60] 60 sd
+n 2400 2700 m 2400 5700 l gs col0 s gr [] 0 sd
+% Polyline
+n 1200 2700 m 1200 6300 l gs col0 s gr
+% Polyline
+ [15 60] 60 sd
+n 4800 2700 m 4800 5700 l gs col0 s gr [] 0 sd
+/Times-Roman ff 180.00 scf sf
+6000 3525 m
+gs 1 -1 sc (minL) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+600 5475 m
+gs 1 -1 sc (0.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+600 4875 m
+gs 1 -1 sc (0.1) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+600 4275 m
+gs 1 -1 sc (0.2) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+600 3675 m
+gs 1 -1 sc (0.3) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+600 3075 m
+gs 1 -1 sc (0.4) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1125 6450 m
+gs 1 -1 sc (0.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1725 6450 m
+gs 1 -1 sc (0.1) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2325 6450 m
+gs 1 -1 sc (0.2) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2925 6450 m
+gs 1 -1 sc (0.3) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+3525 6450 m
+gs 1 -1 sc (0.4) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+4125 6450 m
+gs 1 -1 sc (0.5) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+4725 6450 m
+gs 1 -1 sc (0.6) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5325 6450 m
+gs 1 -1 sc (0.7) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+5925 6450 m
+gs 1 -1 sc (0.8) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+6525 6450 m
+gs 1 -1 sc (0.9) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+7125 6450 m
+gs 1 -1 sc (1.0) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+1425 3525 m
+gs 1 -1 sc (V) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+2250 2850 m
+gs 1 -1 sc (L) col0 sh gr
+/Times-Roman ff 180.00 scf sf
+4650 2850 m
+gs 1 -1 sc (H) col0 sh gr
+$F2psEnd
+rs
+
+%%EndDocument
+ @endspecial 1593 2032 a(Figure)27 b(1:)37 b(The)27 b
+Fe(p)h Ff(curv)n(e)e(\(mostly\).)739 2302 y(Signals)h(are)f(\014ltered)
+i(b)n(y)f(considering)f(all)i(signals)e(for)h(a)g(single)h(mRNA.)1369
+2634 y Fe(h)23 b Ff(=)g Fe(bestC)6 b(ov)s(er)r(ag)s(e)17
+b Fc(\000)h Fe(w)r(or)r(stC)6 b(ov)s(er)r(ag)s(e)1375
+2897 y(p)23 b Ff(=)1528 2702 y Fb(8)1528 2777 y(>)1528
+2802 y(<)1528 2951 y(>)1528 2976 y(:)1601 2781 y Ff(1)p
+Fe(:)p Ff(0)778 b(if)28 b Fe(h)22 b Fc(\024)h Fe(L)1601
+2900 y Ff(1)p Fe(:)p Ff(0)18 b Fc(\000)g Ff(\(1)p Fe(:)p
+Ff(0)g Fc(\000)g Fe(V)h Ff(\))f Fc(\003)2246 2868 y Fd(h)p
+Fa(\000)p Fd(L)p 2236 2882 157 4 v 2236 2929 a(H)t Fa(\000)p
+Fd(L)2486 2900 y Ff(if)28 b Fe(L)22 b(<)h(h)g(<)f(H)1601
+3020 y(V)837 b Ff(if)28 b Fe(H)h Fc(\024)23 b Fe(h)739
+3259 y(cutL)f Ff(=)h Fe(min)p Ff(\()p Fe(bestC)6 b(ov)s(er)r(ag)s(e)17
+b Fc(\000)h Fe(p)g Fc(\001)h Fe(h;)14 b(M)9 b Ff(\))739
+3408 y(cutL)37 b(is)g(the)g(minim)n(um)g(co)n(v)n(erage)d(that)j(will)g
+(b)r(e)g(accepted.)65 b(It)37 b(is)f(deriv)n(ed)g(from)h(the)739
+3508 y(range)26 b(of)i(scores,)e(not)h(the)h(n)n(um)n(b)r(er)g(of)f
+(scores.)739 3658 y(If)i(the)g(score)f(range)f(is)i(small)f(\(<=)h
+(L\),)g(then)g(cutL)g(will)g(b)r(e)h(w)n(orstCo)n(v)n(erage,)25
+b(and)j(w)n(e)h(do)739 3757 y(no)f(\014ltering.)37 b(If)28
+b(the)h(score)d(range)h(is)g(large)g(\(>=)h(H\),)h(then)f(cutL)g(will)g
+(b)r(e)h(M)f(of)g(the)g(b)r(est)739 3857 y(score.)739
+4006 y(A)g(signal)f(is)g(sa)n(v)n(ed)f(if)i(t)n(w)n(o)f(conditions)g
+(are)g(met:)840 4156 y(1.)41 b(\()p Fe(cutL)23 b(<)p
+Ff(=)f Fe(cov)s(er)r(ag)s(e)p Ff(\))840 4305 y(2.)41
+b(\()p Fe(M)1059 4317 y Fd(c)1116 4305 y Fe(<)p Ff(=)22
+b Fe(cov)s(er)r(ag)s(e)p Ff(\))28 b(or)f(\()p Fe(M)1866
+4317 y Fd(l)1914 4305 y Fe(<)p Ff(=)c Fe(cov)s(er)r(edB)t(ases)p
+Ff(\))2150 5255 y(2)p eop
+%%Trailer
+end
+userdict /end-hook known{end-hook}if
+%%EOF
diff --git a/ESTmapper LaTeX/mrna-filter.tex b/ESTmapper LaTeX/mrna-filter.tex
new file mode 100644
index 0000000..b078d8f
--- /dev/null
+++ b/ESTmapper LaTeX/mrna-filter.tex
@@ -0,0 +1,164 @@
+\documentclass[twoside]{article}
+\usepackage{amsmath,amssymb}
+\usepackage{moreverb}
+\usepackage{fancyheadings}
+\usepackage{ulem}
+\usepackage{parskip}
+\usepackage{calc,ifthen,epsfig}
+\sloppy
+
+\begin{document}
+
+% See page 63-64, LaTeX Companion
+%
+% leftmargin controls the left margin for EVERYTHING in the list!
+%
+\newcommand{\entrylabel}[1]{\mbox{\texttt{#1:}}\hfil}
+\newenvironment{entry}
+ {\begin{list}{}%
+ {\renewcommand{\makelabel}{\entrylabel}%
+ %\setlength{\leftmargin}{1.5in}%
+ }}
+{\end{list}}
+
+% The first parbox width controls the indent on the first text line
+% The makebox width seems to do nothing.
+\newcommand{\Lentrylabel}[1]{%
+ {\parbox[b]{0pt}{\makebox[0pt][l]{\texttt{#1:}}\\}}\hfil\relax}
+\newenvironment{Lentry}
+ {\renewcommand{\entrylabel}{\Lentrylabel}\begin{entry}}
+ {\end{entry}}
+
+\title{Filtering mRNA signal}
+\author{Brian P. Walenz}
+\maketitle
+
+\pagestyle{fancy}
+
+\rhead[Brian Walenz]{Brian Walenz}
+\chead[Filtering mRNA signal]{Filtering mRNA signal}
+\lhead[\today]{\today}
+
+\normalem
+
+\begin{abstract}
+Given signals detected by chaining 20-mers, a method is presented for
+deciding which signals potentially contain real matches.
+\end{abstract}
+
+\section{What is a signal}
+
+A signal has three values associated with it: the amount `covered', the
+amount `matched' and the total `length'.
+%
+The amount covered is the number of
+bases in the mRNA that are contained in at least one mer.
+%
+The amount matched is the number of paired bases (for example, position
+$i$ in the cDNA paired with position $j$ in the genomic) covered by a mer.
+%
+The length is the number of mers in the mRNA (roughly equivalent to the
+number of bases in the mRNA that could be covered by a mer, but easier
+to compute).
+
+From these, we can derive two scores, the coverage and the multiplicity.
+The coverage, $\frac{covered}{length}$, represents the fraction of the mRNA
+that we found, while the multiplicity, $\frac{matched}{covered}$, represents
+the amount of the mRNA that we found too many times.
+
+A high multiplicity usually indicates a repeat-containing mRNA. High
+multiplicity and high coverage can indicate that the mRNA is not cDNA.
+
+\section{Filter}
+
+In order to filter signals, we need to decide, for each mRNA, which
+signals are bad, and which are good (duh!), which means that we'll
+need to look at {\em all} signals for a single mRNA.
+
+For the filter presented below, we need to know the best and worst
+coverage values that occur for any signal associated with a specific
+mRNA. Once those are known, the signals can be filtered in any order.
+This is important in the case where the signals are detected
+chromosome by chromosome. Instead of sorting all signals, we can save
+the best and worst coverage for each mRNA.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|l|}
+\hline
+Switch & Variable & Default Value & Description \\
+\hline
+\hline
+-l & $L$ & 0.2 & \text{Signal spread low range} \\
+-h & $H$ & 0.6 & \text{Signal spread high range} \\
+-v & $V$ & 0.3 & \text{Pass value} \\
+-m & $M$ & 0.3 & \text{Signal quality floor} \\
+-mc & $M_c$ & 0.2 & \text{Minimum signal quality} \\
+-ml & $M_l$ & 150 & \text{Minimum signal size} \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters, default values and descriptions}
+\label{table:defvalues}
+\end{figure}
+
+The filter has six parameters, summarized in Table~\ref{table:defvalues}.
+
+If the signals for a specific mRNA are all very similar, it is
+probable that the weaker signals are weak only because of a few
+mismatches that break 20-mers. In this case, we cannot reliably pick
+the signals that are true, and should consider all of them.
+
+On the other hand, if there is a large range in the quality of signals,
+we can safely discard low scoring signals, and still be confident that
+we will find the good stuff.
+
+Therefore, the filter will discard no signals if the range in quality
+values is small, and will gradually discard more, proportional to the
+range. So that we don't discard too much, we limit the increase in
+filtering to $V$ (0.3).
+\begin{align*}
+h &= bestCoverage - worstCoverage \\
+p &= \begin{cases}
+ 0.0 & \text{if $h \le L$} \\
+ V * \frac{h-L}{H-L} & \text{if $L < h < H$} \\
+ V & \text{if $H \le h$}
+ \end{cases} \\
+c &= min(worstCoverage + p \cdot h, M)
+\end{align*}
+
+\begin{figure}
+\begin{center}
+\epsfig{figure=mRNAfilt.eps, silent=, width=4.5in}
+\end{center}
+\caption{The $p$ curve.}
+\label{fig:pcurve}
+\end{figure}
+
+$p$ is the amount of filtering, ranging from minimum (0.0) to maximum
+($V$, a parameter).
+
+The $c$ value computed above is the filtering threshold. Signals with
+coverage below $c$ are considered weak, and are discarded.
+
+If the score range is small ($\le L$), then $p = 0$ and $c$ will be at
+most $worstCoverage$, so we do no filtering. If the score range is large
+($\ge H$), then $c$ will be $worstCoverage + V \cdot h$, capped at $M$.
+$c$ is the minimum coverage that will be accepted. It is derived from
+the range of scores, not the number of scores.
+
+Finally, it is possible that {\em all} signals are good. If we used the
+above filtering we would be discarding the low scoring (but still valid)
+signals. To overcome this, absolute limits $M_c$ and $M_l$ are enforced.
+
+A signal is saved if both of the following conditions are met:
+\begin{enumerate}
+\item ($c \le coverage$)
+\item ($M_c \le coverage$) or ($M_l \le coveredBases$)
+\end{enumerate}
+
+\section{Results}
+This filter is overly permissive, throwing out only signals that are
+obviously garbage.
+
+\end{document}
diff --git a/ESTmapper/ESTmapper.pl b/ESTmapper/ESTmapper.pl
new file mode 100644
index 0000000..85b20e4
--- /dev/null
+++ b/ESTmapper/ESTmapper.pl
@@ -0,0 +1,1462 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2002 PE Corporation (NY) through the Celera Genomics Group
+# Copyright (c) 2003, 2004 Applied Biosystems
+# Copyright (c) 2004, 2005, 2006 Brian Walenz
+
+$| = 1;
+
+# Perl version 5.005_03 is too old, it requires two args to mkdir.
+
+use strict;
+use FindBin;
+use Config; # for @signame
+use lib "$FindBin::Bin/../lib";
+
+use scheduler;
+
+my %prog;
+my %args;
+
+
+################################################################################
+#
+# Utility to run a command and check the exit status (sadly, duplicated
+# in configureESTmapper.pl).
+#
+################################################################################
+
+
+sub runCommand { # Run $cmd via system(); return 0 on success, 1 after reporting a failure on STDERR.
+ my $cmd = shift @_;
+
+ print STDERR "$cmd\n";
+
+ my $rc = 0xffff & system($cmd);
+
+ # Decode the wait status (after Programming Perl page 230). The low byte holds
+ # the signal number (0x7f) and core flag (0x80); the high byte the exit status.
+ return(0) if ($rc == 0);
+
+ # Bunch of busy work to get the names of signals. Is it really worth it?!
+ #
+ my @signame;
+ if (defined($Config{sig_name})) {
+ my $i = 0;
+ foreach my $n (split('\s+', $Config{sig_name})) {
+ $signame[$i] = $n;
+ $i++;
+ }
+ }
+
+ my $error = "ERROR: $cmd\n failed with ";
+
+ if (($rc == 0xff00) || ($rc == 0xffff)) { # 0xffff: system() returned -1, see $!
+ $error .= "$!\n";
+ } elsif (($rc & 0xff) == 0) { # no signal bits set: a plain non-zero exit
+ $rc >>= 8;
+ $error .= "exit status $rc\n";
+ } else {
+ if ($rc & 0x80) {
+ $rc &= ~0x80;
+ $error .= "coredump from ";
+ }
+ if (defined($signame[$rc])) {
+ $error .= "signal $signame[$rc]\n";
+ } else {
+ $error .= "signal $rc\n";
+ }
+ }
+
+ print STDERR $error;
+
+ return(1);
+}
+
+
+################################################################################
+#
+# Command line parsing and configuration
+#
+################################################################################
+
+
+sub setExecutables { # Fill %prog with the path of every helper program and verify each one exists.
+ my $exechome = "$FindBin::Bin";
+ # Every helper is looked up under $FindBin::Bin; keys are the names used by the rest of the script.
+ $prog{'ESTmapper'} = "$exechome/ESTmapper.pl";
+ $prog{'seagen'} = "$exechome/seagen";
+ $prog{'mergeCounts'} = "$exechome/mergeCounts";
+ $prog{'filterEST'} = "$exechome/filterEST";
+ $prog{'filterMRNA'} = "$exechome/filterMRNA";
+ $prog{'filterNULL'} = "$exechome/filterNULL";
+ $prog{'sim4db'} = "$exechome/sim4db";
+ $prog{'leaff'} = "$exechome/leaff";
+ $prog{'meryl'} = "$exechome/meryl";
+ $prog{'cleanPolishes'} = "$exechome/cleanPolishes";
+ $prog{'toFILTER'} = "$exechome/filterPolishes";
+ $prog{'sortHits'} = "$exechome/sortHits";
+ $prog{'sortPolishes'} = "$exechome/sortPolishes";
+ $prog{'parseSNPs'} = "$exechome/parseSNP";
+ $prog{'pickBest'} = "$exechome/pickBestPolish";
+ $prog{'positionDB'} = "$exechome/positionDB";
+ $prog{'terminate'} = "$exechome/terminate";
+ # Die on the first missing file. Note that -e tests existence only, not executability.
+ foreach my $e (keys %prog) {
+ die "Can't find/execute $e ('$prog{$e}')\n" if (! -e $prog{$e});
+ }
+}
+
+
+sub parseArgs (@) { # Parse options into %args, create the run directories, restore/save .runOptions.
+ my @ARGS = @_;
+
+ $args{'scriptVersion'} = "10";
+ $args{'startTime'} = time();
+
+ while (scalar(@ARGS) > 0) {
+ my $arg = shift @ARGS;
+
+ if (($arg =~ m/^-dir/) || # deprecated
+ ($arg =~ m/^-path/) || # deprecated
+ ($arg =~ m/^-outputdir/) ||
+ ($arg =~ m/^-mapdir/)) {
+ $args{'path'} = shift @ARGS;
+ } elsif (($arg =~ m/^-genomedir/) ||
+ ($arg =~ m/^-genome/)) { # deprecated
+ $args{'genomedir'} = shift @ARGS;
+
+ } elsif (($arg =~ m/^-map(est)/) ||
+ ($arg =~ m/^-map(mrna)/) ||
+ ($arg =~ m/^-map(snp)/)) {
+ $args{'runstyle'} = $1;
+ $args{'queries'} = shift @ARGS;
+ } elsif ($arg =~ m/^-restart/) {
+ $args{'runstyle'} = "restart";
+ $args{'path'} = shift @ARGS;
+ } elsif ($arg =~ m/^-help/) {
+ $args{'runstyle'} = "help";
+ } elsif ($arg =~ m/^-time/) {
+ $args{'runstyle'} = "time";
+
+ } elsif ($arg =~ m/^-verbose/) {
+ $args{'verbose'} = 1;
+ }
+
+
+ #
+ # RUN options
+ #
+ elsif ($arg =~ m/^-runlater/) {
+ $args{'runlater'} = 1;
+ }
+
+ #
+ # LSF options
+ #
+
+ #
+ # SGE options
+ #
+ elsif ($arg =~ m/^-sge$/) {
+ $args{'sgename'} = shift @ARGS;
+ } elsif (($arg =~ m/^-(sgeoptions)/) ||
+ ($arg =~ m/^-(sgesearch)/) ||
+ ($arg =~ m/^-(sgefilter)/) ||
+ ($arg =~ m/^-(sgepolish)/) ||
+ ($arg =~ m/^-(sgefinish)/)) {
+ $args{$1} = shift @ARGS;
+ }
+
+
+ #
+ # search options
+ #
+ elsif (($arg =~ m/^-(searchopts)/) ||
+ ($arg =~ m/^-(localsearches)/) ||
+ ($arg =~ m/^-(searchthreads)/) ||
+ ($arg =~ m/^-(hitsortmemory)/) ||
+ ($arg =~ m/^-(mermaskfile)/) ||
+ ($arg =~ m/^-(merignore)/)) {
+ $args{$1} = shift @ARGS;
+ }
+
+ #
+ # filter options
+ #
+ # -hitsortmemory is also read by the filter stage; it is parsed with the search options.
+ #
+ elsif ($arg =~ m/^-nofilter/) {
+ $args{'nofilter'} = 1;
+ }
+
+ #
+ # polish options
+ #
+ elsif (($arg =~ m/^-(mincoverage)/) ||
+ ($arg =~ m/^-(minidentity)/) ||
+ ($arg =~ m/^-(minlength)/) ||
+ ($arg =~ m/^-(minsim4coverage)/) ||
+ ($arg =~ m/^-(minsim4identity)/) ||
+ ($arg =~ m/^-(minsim4length)/) ||
+ ($arg =~ m/^-(relink)/) ||
+ ($arg =~ m/^-(alwaysprint)/) ||
+ ($arg =~ m/^-(batchsize)/) ||
+ ($arg =~ m/^-(numbatches)/) ||
+ ($arg =~ m/^-(localpolishes)/)) {
+ $args{$1} = shift @ARGS;
+ } elsif ($arg =~ m/^-interspecies/) {
+ $args{'interspecies'} = 1;
+ } elsif ($arg =~ m/^-aligns/) {
+ $args{'aligns'} = 1;
+ } elsif ($arg =~ m/^-noaligns/) {
+ delete $args{'aligns'};
+ } elsif ($arg =~ m/^-abort/) {
+ $args{'abort'} = 1;
+ } elsif ($arg =~ m/^-yn/) {
+ $args{'nofilter'} = 1;
+ $args{'sim4-yn'} = 1;
+ }
+
+ #
+ # finish options
+ #
+ elsif ($arg =~ m/^-cleanup/) {
+ $args{'cleanup'} = shift @ARGS;
+ } elsif ($arg =~ m/^-nocleanup/) {
+ delete $args{'cleanup'};
+ } elsif ($arg =~ m/^-savetemporary/) {
+ $args{'savetemporary'} = 1;
+ }
+
+ #
+ # Are we installed correctly?
+ #
+ elsif ($arg =~ m/^-justtestingifitworks/) {
+ exit(0);
+ }
+
+ else {
+ die "ESTmapper/configure-- unknown option '$arg'\n";
+ }
+ }
+
+ # Check we have a path!
+ #
+ ($args{'path'} eq "") and die "ERROR: ESTmapper/configure-- no directory given.\n";
+
+ #print STDERR "CONF $args{'genomedir'}\n";
+ #print STDERR "CONF $args{'queries'}\n";
+ #print STDERR "CONF $args{'path'}\n";
+
+
+ # Be tolerant of relative paths, but don't use them!
+ #
+ $args{'genomedir'} = "$ENV{'PWD'}/$args{'genomedir'}" if (defined($args{'genomedir'}) && ($args{'genomedir'} !~ m!^/!));
+ $args{'queries'} = "$ENV{'PWD'}/$args{'queries'}" if (defined($args{'queries'}) && ($args{'queries'} !~ m!^/!));
+ $args{'path'} = "$ENV{'PWD'}/$args{'path'}" if (defined($args{'path'}) && ($args{'path'} !~ m!^/!));
+
+
+ # Make some organization
+ #
+ mkdir "$args{'path'}" if (! -d "$args{'path'}");
+ mkdir "$args{'path'}/0-input" if (! -d "$args{'path'}/0-input");
+ mkdir "$args{'path'}/1-search" if (! -d "$args{'path'}/1-search");
+ mkdir "$args{'path'}/2-filter" if (! -d "$args{'path'}/2-filter");
+ mkdir "$args{'path'}/3-polish" if (! -d "$args{'path'}/3-polish");
+
+
+ # If told to restart, suck in the original configuration, but
+ # don't overwrite things already defined.
+ #
+ if ($args{'runstyle'} eq "restart") {
+ if (! -e "$args{'path'}/.runOptions") {
+ print STDERR "ESTmapper/restart-- Nothing to restart!\n";
+ exit;
+ }
+
+ delete $args{'runstyle'};
+
+ open(F, "< $args{'path'}/.runOptions") or die "Failed to open '$args{'path'}/.runOptions' to read options.\n";
+ while (<F>) {
+ chomp;
+ # The key ends at the FIRST '=', so values that contain '=' are restored intact.
+ if (m/^\s*([^\s=]+)\s*=\s*(.*)\s*$/) {
+ $args{$1} = $2 if (!defined($args{$1}));
+ } else {
+ die "Invalid runOption line '$_'\n";
+ }
+ }
+ close(F);
+ }
+
+ # Write the current set of args to the runOptions file
+ #
+ open(F, "> $args{'path'}/.runOptions") or die "Failed to open '$args{'path'}/.runOptions' to save options.\n";
+ foreach my $k (keys %args) {
+ #print STDERR "DEBUG $k=$args{$k}\n";
+ print F "$k=$args{$k}\n";
+ }
+ close(F);
+}
+
+
+sub configure { # Check the inputs, link them under $path/0-input, and record a run-information file.
+ my $path = $args{'path'};
+
+ print STDERR "ESTmapper: Performing a configure.\n";
+
+ ($args{'genomedir'} eq "") and die "ERROR: ESTmapper/configure-- no genomic sequences given.\n";
+ ($args{'queries'} eq "") and die "ERROR: ESTmapper/configure-- no cDNA sequences given.\n";
+
+ (! -f $args{'queries'}) and die "ERROR: ESTmapper/configure-- can't find the cdna sequence '$args{'queries'}'\n";
+
+ # XXX: We should check that the genome dir is valid and complete.
+ #
+ symlink "$args{'genomedir'}", "$path/0-input/genome" if (! -d "$path/0-input/genome");
+
+ # Check the input files exist, create symlinks to them, and find/build index files
+ #
+ symlink "$args{'queries'}", "$path/0-input/cDNA.fasta" if ((! -f "$path/0-input/cDNA.fasta"));
+ symlink "$args{'queries'}idx", "$path/0-input/cDNA.fastaidx" if ((! -f "$path/0-input/cDNA.fastaidx") && (-f "$args{'queries'}idx"));
+
+ if (! -f "$path/0-input/cDNA.fastaidx") {
+ print STDERR "ESTmapper/configure-- Generating the index for '$path/0-input/cDNA.fasta'\n";
+ runCommand("$prog{'leaff'} -F $path/0-input/cDNA.fasta") and die "Failed.\n";
+ }
+
+ # Create a .runInformation file, containing supposedly useful information
+ # about this run.
+ #
+ my $time = time();
+ $args{'runInfoFile'} = "$args{'path'}/.runInformation.$time";
+
+ # Write some information and the args to a run info file
+ #
+ open(F, "> $args{'runInfoFile'}") or die "Failed to open '$args{'runInfoFile'}' to save run information.\n";
+ print F "startTime: $time (", scalar(localtime($time)), ")\n";
+ print F "operator: $ENV{'USER'}\n";
+ print F "host: " . `uname -a`;
+ print F "version: $args{'scriptVersion'}\n";
+ print F "parameters:";
+ foreach my $k (keys %args) {
+ print F "$k=$args{$k}\n";
+ }
+ close(F);
+
+ # Repoint the unversioned .runInformation link at the file just written.
+ unlink "$args{'path'}/.runInformation";
+ symlink "$args{'path'}/.runInformation.$time", "$args{'path'}/.runInformation";
+
+ print STDERR "ESTmapper: configured.\n";
+}
+
+
+################################################################################
+#
+# Signal Finding
+#
+################################################################################
+
+
+sub submitFilter (@) { # Submit an SGE job that restarts ESTmapper; @_ lists job names to hold on.
+ my $watch = join ",", @_;
+ my $path = $args{'path'};
+ # Write the wrapper script that qsub will run; fail loudly if it cannot be created.
+ open(F, "> $path/1-search/filter-restart.sh") or die "Failed to open '$path/1-search/filter-restart.sh' for writing.\n";
+ print F "#!/bin/sh\n";
+ print F "#\n";
+ print F "# Attempt to (re)configure SGE. For reasons Bri doesn't know,\n";
+ print F "# jobs submitted to SGE, and running under SGE, fail to read his\n";
+ print F "# .tcshrc (or .bashrc, limited testing), and so they don't setup\n";
+ print F "# SGE (or ANY other paths, etc) properly. For the record,\n";
+ print F "# interactive SGE logins (qlogin, etc) DO set the environment.\n";
+ print F "#\n";
+ print F ". \$SGE_ROOT/\$SGE_CELL/common/settings.sh\n";
+ print F "/usr/bin/perl $prog{'ESTmapper'} -restart $path\n";
+ close(F);
+ # Build the qsub command; -hold_jid is only added when there are jobs to wait for.
+ my $cmd;
+ $cmd = "qsub -cwd -j y -o $path/stage2.sgeout ";
+ $cmd .= " $args{'sgeoptions'} " if (defined($args{'sgeoptions'}));
+ $cmd .= " $args{'sgefilter'} " if (defined($args{'sgefilter'}));
+ $cmd .= " -N \"f$args{'sgename'}\" ";
+ $cmd .= " -hold_jid $watch " if ($watch ne "");
+ $cmd .= " $path/1-search/filter-restart.sh";
+
+ die "Failed to submit job to SGE.\n" if (runCommand($cmd));
+}
+
+
+sub submitFinish (@) { # Submit an SGE job that restarts ESTmapper; @_ lists job names to hold on.
+ my $watch = join ",", @_;
+ my $path = $args{'path'};
+ # Write the wrapper script that qsub will run; fail loudly if it cannot be created.
+ open(F, "> $path/3-polish/finish-restart.sh") or die "Failed to open '$path/3-polish/finish-restart.sh' for writing.\n";
+ print F "#!/bin/sh\n";
+ print F "#\n";
+ print F "# Attempt to (re)configure SGE. For reasons Bri doesn't know,\n";
+ print F "# jobs submitted to SGE, and running under SGE, fail to read his\n";
+ print F "# .tcshrc (or .bashrc, limited testing), and so they don't setup\n";
+ print F "# SGE (or ANY other paths, etc) properly. For the record,\n";
+ print F "# interactive SGE logins (qlogin, etc) DO set the environment.\n";
+ print F "#\n";
+ print F ". \$SGE_ROOT/\$SGE_CELL/common/settings.sh\n";
+ print F "/usr/bin/perl $prog{'ESTmapper'} -restart $path\n";
+ close(F);
+ # Build the qsub command; -hold_jid is only added when there are jobs to wait for.
+ my $cmd;
+ $cmd = "qsub -cwd -j y -o $path/stage3.sgeout ";
+ $cmd .= " $args{'sgeoptions'} " if (defined($args{'sgeoptions'}));
+ $cmd .= " $args{'sgefinish'} " if (defined($args{'sgefinish'}));
+ $cmd .= " -N \"o$args{'sgename'}\" ";
+ $cmd .= " -hold_jid $watch " if ($watch ne "");
+ $cmd .= " $path/3-polish/finish-restart.sh";
+
+ die "Failed to submit job to SGE.\n" if (runCommand($cmd));
+}
+
+
+sub search { # Stage 1: write search.sh, run it per genome segment (locally, on SGE, or later), verify.
+ my $startTime = time();
+ my $path = $args{'path'};
+
+ # If we're all done, just get outta here.
+ return if (-e "$path/1-search/allDone");
+
+ my $mersize = ($args{'mersize'} or 20);
+ my $merskip = ($args{'merskip'} or 0);
+ my $searchopts = "";
+
+ $searchopts = "-maxintron 2000000 -singlelength 20 -multiplelength 30 -smallsequence 100" if ($args{'runstyle'} eq "est");
+ $searchopts = "-maxintron 2000000 -singlelength 30 -multiplelength 50 -smallsequence 0" if ($args{'runstyle'} eq "mrna");
+ $searchopts = "-maxintron 2000000 -singlecoverage 0.3 -multiplecoverage 0.3 -smallsequence 10000000 -extendminimum 100 -extendweight 2" if ($args{'runstyle'} eq "snp");
+ # Separate user options from the defaults with a space so the two never fuse.
+ $searchopts .= " $args{'searchopts'}" if (defined($args{'searchopts'}));
+
+ my $numproc = ($args{'localsearches'} or 4);
+ my $numthread = ($args{'searchthreads'} or 2);
+
+ my $hitMemory = ($args{'hitsortmemory'} or 600); # Don't change the value without 3-filter
+
+ my $cdnaInInput = int(`$prog{'leaff'} -F $path/0-input/cDNA.fasta -d`);
+
+
+ # Look for a mer masking file, or use the one supplied.
+ #
+ if (!defined($args{'mermaskfile'})) {
+ $args{'merignore'} = 1000 if (!defined($args{'merignore'}));
+ $args{'merignore'} = substr("000000$args{'merignore'}", -4);
+ $args{'mermaskfile'} = "$args{'genomedir'}/frequentMers-ge$args{'merignore'}.fasta";
+ }
+ if (($args{'mermaskfile'} ne "none") && (! -e $args{'mermaskfile'})) {
+ print STDERR "ESTmapper/search-- Didn't find mer mask file '$args{'mermaskfile'}', attempting\n";
+ print STDERR "ESTmapper/search-- create it.\n";
+ my $cmd;
+ $cmd = "$prog{'meryl'}";
+ $cmd .= " -Dt -n $args{'merignore'} ";
+ $cmd .= " -s \"$args{'genomedir'}//genome\"";
+ $cmd .= " > \"$args{'genomedir'}/frequentMers-ge$args{'merignore'}.fasta\"";
+ if (runCommand($cmd)) {
+ die "ESTmapper/search-- Failed to create mask file.\n";
+ }
+ }
+ if (($args{'mermaskfile'} ne "none") && (! -e $args{'mermaskfile'})) {
+ print STDERR "ESTmapper/search-- Can't find mer mask file '$args{'mermaskfile'}'.\n";
+ print STDERR "ESTmapper/search-- Perhaps your genome isn't installed correctly?\n";
+ print STDERR "ESTmapper/search-- Try a different mersize?\n";
+ exit(1);
+ }
+
+ # Best effort: the memory limit is only echoed in the SGE status message below.
+ open(F, "< $path/0-input/memoryLimit");
+ my $farmMemory = <F>;
+ close(F);
+ chomp $farmMemory;
+
+
+ # Create a bunch of scripts to process
+ #
+ # Rewrite the command everytime. This fixes the problem where
+ # we would, say, change the number of threads...
+ #
+ open(F, "> $path/1-search/search.sh") or die "Failed to open '$path/1-search/search.sh' for writing.\n";
+ print F "#!/bin/sh\n";
+ print F "\n";
+ print F "jid=\$SGE_TASK_ID\n";
+ print F "if [ x\$jid = x -o x\$jid = xundefined ] ; then\n";
+ print F " if [ x\$1 = x ] ; then\n";
+ print F " echo \"ERROR: I need a job-id on the command line or in \$SGE_TASK_ID\"\n";
+ print F " exit 1\n";
+ print F " fi\n";
+ print F " jid=`expr \$1 + 1`\n";
+ print F "fi\n";
+ print F "\n";
+ print F "jid=`head -\$jid $path/0-input/genome/segments | tail -1`\n";
+ print F "\n";
+ print F "if [ -e \"$path/1-search/\$jid.success\" ] ; then\n";
+ print F " exit\n";
+ print F "fi\n";
+ print F "\n";
+ print F "$prog{'seagen'} \\\n";
+ print F " -verbose \\\n" if ($args{'verbose'});
+ print F " -binary \\\n";
+ print F " -mersize $mersize \\\n";
+ print F " -numthreads $numthread \\\n";
+ print F " $searchopts \\\n";
+ print F " -cdna $path/0-input/cDNA.fasta \\\n";
+ print F " -genomic $path/0-input/genome/genome.seqStore \\\n";
+ print F " -positions $path/0-input/genome/seg\$jid.posDB \\\n";
+ print F " -mask $args{'mermaskfile'} \\\n" if ($args{'mermaskfile'} ne "none");
+ print F " -output $path/1-search/\$jid.hits \\\n";
+ print F " -count $path/1-search/\$jid.count \\\n";
+ print F "&& \\\n";
+ print F "touch $path/1-search/\$jid.success\n";
+ close(F);
+
+ chmod 0755, "$path/1-search/search.sh";
+
+
+ # Read the list of segments to figure out which segments we need to run.
+ #
+ my @searchesToRun;
+
+ open(F, "< $path/0-input/genome/segments") or die "Can't open genome segments list!\n";
+ while (<F>) {
+ chomp;
+ if (! -e "$path/1-search/$_.success") {
+ print STDERR "ESTmapper/search-- search segment $_ not computed.\n";
+ push @searchesToRun, $_;
+ }
+ }
+ close(F);
+
+
+
+ # Run searches. If the search terminated properly, the
+ # hit-counts file should exist. Run (maybe re-run) the search if
+ # it isn't there.
+ #
+ if (defined($args{'runlater'})) {
+ my $jobs = join " ", @searchesToRun;
+ print STDERR "ESTmapper/search-- Please run the jobs:\n";
+ print STDERR "ESTmapper/search-- $jobs\n";
+ print STDERR "ESTmapper/search-- using:\n";
+ print STDERR "ESTmapper/search-- $path/1-search/search.sh\n";
+ exit(0);
+ } elsif (defined($args{'sgename'})) {
+
+ if (scalar(@searchesToRun) > 0) {
+ print STDERR "ESTmapper/search-- SGE mode requested; ", scalar @searchesToRun, " processes to compute,\n";
+ print STDERR "ESTmapper/search-- SGE mode requested; each with $numthread threads,\n";
+ print STDERR "ESTmapper/search-- SGE mode requested; $farmMemory MB per process.\n";
+
+ # Don't resubmit jobs that are already done, and do
+ # submit the smallest number of jobs to finish.
+ # Bugs here should be fixed in 2-search.pl as well.
+
+ my @watchJobs;
+
+ my $fJob = shift @searchesToRun;
+ my $lJob = $fJob;
+
+ while (defined($lJob)) {
+ my $nJob = shift @searchesToRun;
+ # Close the current run when the list is exhausted or the next job is not consecutive.
+ if ((!defined($nJob)) || ($lJob + 1 != $nJob)) {
+
+ # SGE expects jobs to start at 1, but we start at 0.
+ $fJob++;
+ $lJob++;
+
+ print STDERR "Submit $fJob - $lJob (njob=$nJob)\n";
+
+ my $cmd;
+ $cmd = "qsub -cwd -j y -o $path/1-search/sgeout-\\\$TASK_ID ";
+ $cmd .= " $args{'sgeoptions'} " if (defined($args{'sgeoptions'}));
+ $cmd .= " $args{'sgesearch'} " if (defined($args{'sgesearch'}));
+ $cmd .= " -N \"s$args{'sgename'}.$fJob\" ";
+ $cmd .= " -t $fJob-$lJob ";
+ $cmd .= "$path/1-search/search.sh";
+
+ push @watchJobs, "s$args{'sgename'}.$fJob";
+
+ die "Failed to submit job to SGE.\n" if (runCommand($cmd));
+
+ $fJob = $nJob;
+ }
+ $lJob = $nJob;
+ }
+
+ # Submit the filter, and make it wait for the searches, if they were submitted.
+ #
+ submitFilter(@watchJobs);
+
+ print STDERR "ESTmapper/search-- Searches submitted. Rest of run is on the farm.\n";
+
+ exit(0);
+ }
+ } else {
+ print STDERR "ESTmapper/search-- Local mode requested; ", scalar @searchesToRun, " processes to compute,\n";
+ print STDERR "ESTmapper/search-- Local mode requested; $numproc concurrent processes,\n";
+ print STDERR "ESTmapper/search-- Local mode requested; each with $numthread threads.\n";
+
+ # Run the searches. We use the scheduler, then check
+ # everything at the end. This is a little less friendly
+ # to the user, but much easier for the implementor.
+ #
+ if (scalar(@searchesToRun) > 0) {
+ &scheduler::schedulerSetNumberOfProcesses($numproc);
+ foreach my $s (@searchesToRun) {
+ print STDERR "sh $path/1-search/search.sh $s\n";
+ &scheduler::schedulerSubmit("sh $path/1-search/search.sh $s");
+ }
+ &scheduler::schedulerFinish();
+ }
+ }
+
+
+ # See if anything failed.
+ #
+ print STDERR "ESTmapper/search-- checking search output. All should have $cdnaInInput cDNA.\n";
+
+ my $fail = 0;
+
+ open(F, "< $path/0-input/genome/segments") or die "Can't open genome segments list!\n";
+ while (<F>) {
+ chomp;
+
+ # If the hits file is NOT found, remove the count file. Then
+ # figure out how many ESTs we have signals for, and fail if
+ # it's not what we expect.
+
+ unlink "$path/1-search/$_.count" if (! -e "$path/1-search/$_.hits");
+ # A missing count file counts as zero signals ("my ... if" is undefined behaviour in Perl).
+ my $c = (-e "$path/1-search/$_.count") ? int(`wc -l < $path/1-search/$_.count`) : 0;
+
+ if ($c != $cdnaInInput) {
+ print STDERR "ESTmapper/search-- Search $_ failed, only $c signals. Output saved as *.CRASH\n";
+ rename "$path/1-search/$_.count", "$path/1-search/$_.count.CRASH";
+ rename "$path/1-search/$_.hits", "$path/1-search/$_.hits.CRASH";
+ $fail++;
+ }
+ }
+ close(F);
+
+ die "Dang." if ($fail);
+
+ # Hooray! Now we're all done!
+
+ open(F, "> $path/1-search/allDone") or die "Failed to create '$path/1-search/allDone'.\n";
+ close(F);
+
+ print STDERR "ESTmapper/search-- Script finished in ", time() - $startTime, " wall-clock seconds.\n" if (time() > $startTime + 5);
+}
+
+
+################################################################################
+#
+# Signal Filtering
+#
+################################################################################
+
+
+sub filter { # Stage 2: merge hit counts, filter the search hits by run style, then sort them.
+ my $startTime = time();
+
+ # If we're all done, just get outta here.
+ return if (-e "$args{'path'}/2-filter/filteredHits");
+
+ # If we're supposed to be running on the grid, but we aren't, restart.
+ # This can occur if the searches have finished, but the filter
+ # didn't, and we restart. (also in 5-assemble.pl)
+ #
+ if (defined($args{'sgename'}) && !defined($ENV{'SGE_TASK_ID'})) {
+ submitFilter();
+ print STDERR "ESTmapper/filter-- Restarted SGE execution.\n";
+ exit;
+ }
+
+ my $path = $args{'path'};
+ my $verbose = ($args{'verbose'}) ? "-verbose" : "";
+
+ my $hitMemory = ($args{'hitsortmemory'} or 600); # Don't change the value without 2-search
+
+ print STDERR "ESTmapper: Performing a filter.\n";
+
+
+ # Merge all the hit counts into one list -- this is needed for output filtering!
+ #
+ if (! -e "$path/2-filter/hitCounts") {
+ print STDERR "ESTmapper/filter-- Merging counts.\n";
+ if (runCommand("$prog{'mergeCounts'} $path/1-search/[0-9]*[0-9].count > $path/2-filter/hitCounts")) {
+ unlink "$path/2-filter/hitCounts";
+ die "Failed.\n";
+ }
+ }
+
+ #
+ # Setup the filtering and sorting
+ #
+
+ # No verbose for filterNULL!
+ #
+ my $fcmd;
+
+ # bpw, 20051005, this isn't the perfect EST filter, but it does
+ # nearly as good as the best filter I've seen, and produces
+ # significantly fewer false positives.
+
+ if ($args{'nofilter'} eq 1) {
+ $fcmd = "$prog{'filterNULL'} $path/1-search/*hits > $path/2-filter/filtHits";
+ } elsif ($args{'runstyle'} eq "est") {
+ $fcmd = "$prog{'filterEST'} -u 200000000000 -r 0 -log $path/2-filter/filterLog $path/1-search/*hits > $path/2-filter/filtHits";
+ } elsif ($args{'runstyle'} eq "snp") {
+ $fcmd = "$prog{'filterMRNA'} $verbose $path/1-search/*hits > $path/2-filter/filtHits";
+ } elsif ($args{'runstyle'} eq "mrna") {
+ $fcmd = "$prog{'filterMRNA'} $verbose $path/1-search/*hits > $path/2-filter/filtHits";
+ } else {
+ print STDERR "ESTmapper/filter-- nofilter = $args{'nofilter'}\n";
+ print STDERR "ESTmapper/filter-- runstyle = $args{'runstyle'}\n";
+ die "ESTmapper/filter-- Don't know how to filter!\n";
+ }
+
+ print STDERR "ESTmapper/filter-- Filtering.\n";
+ if (runCommand($fcmd)) {
+ unlink "$path/2-filter/filtHits";
+ die "Failed.\n";
+ }
+
+ my $scmd = "$prog{'sortHits'} $verbose -m $hitMemory -t $path/2-filter $path/2-filter/filtHits > $path/2-filter/filteredHits";
+
+ print STDERR "ESTmapper/filter-- Sorting.\n";
+ if (runCommand($scmd)) {
+ unlink "$path/2-filter/filteredHits";
+ die "Failed.\n";
+ }
+
+ die "ESTmapper/filter-- FATAL: filter and sort produced no hits?\n" if (-z "$path/2-filter/filteredHits");
+
+ print STDERR "ESTmapper: Filter script finished in ", time() - $startTime, " wall-clock seconds.\n" if (time() > $startTime + 5);
+}
+
+
+################################################################################
+#
+# Signal Polishing
+#
+################################################################################
+
+
+sub polish {
+ my $startTime = time();
+
+ # If we're all done, just get outta here.
+ return if (-e "$args{'path'}/3-polish/allDone");
+
+ my $path = $args{'path'};
+
+ my $mini = ($args{'minidentity'} or 95);
+ my $minc = ($args{'mincoverage'} or 50);
+ my $minl = ($args{'minlength'} or 0);
+
+ my $minsim4i = ($args{'minsim4identity'} or 90);
+ my $minsim4c = ($args{'minsim4coverage'} or 45);
+ my $minsim4l = ($args{'minsim4length'} or 0);
+
+ my $relink = "-H $args{'relink'}" if ($args{'relink'});
+ my $always = "-alwaysprint $args{'alwaysprint'}" if ($args{'alwaysprint'});
+
+ my $batchsize = ($args{'batchsize'} or 0);
+ my $numbatches = ($args{'numbatches'} or 256);
+
+ my $numproc = ($args{'localpolishes'} or 4);
+
+ my $aligns = "-aligns" if ($args{'aligns'});
+ my $abort = "-Mp 0.25 -Ma 10000" if ($args{'abort'});
+ my $interspecies = "-interspecies" if ($args{'interspecies'});
+
+
+
+ # Save the parameters, these are used on later invocations of
+ # polish, and in filter to make sure the user isn't an idiot.
+ #
+ if (-e "$path/3-polish/parameters") {
+ print STDERR "ESTmapper/polish-- Using original parameters.\n";
+
+ open(F, "< $path/3-polish/parameters");
+ $numbatches = int(<F>);
+ $batchsize = int(<F>);
+ $mini = <F>; chomp $mini;
+ $minc = <F>; chomp $minc;
+ $minl = <F>; chomp $minl;
+ $minsim4i = <F>; chomp $minsim4i;
+ $minsim4c = <F>; chomp $minsim4c;
+ $minsim4l = <F>; chomp $minsim4l;
+ $relink = <F>; chomp $relink;
+ $always = <F>; chomp $always;
+ $aligns = <F>; chomp $aligns;
+ $abort = <F>; chomp $abort;
+ $interspecies = <F>; chomp $interspecies;
+ close(F);
+
+ print STDERR "ESTmapper/polish-- Polish quality suitable for $minsim4i percent identity and\n";
+ print STDERR "ESTmapper/polish-- $minsim4c percent coverage\n";
+ print STDERR "ESTmapper/polish-- To rerun polishes at a different quality level,\n";
+ print STDERR "ESTmapper/polish-- remove the 3-polish directory.\n";
+ } else {
+
+ # Do a little error checking; if both $batchsize and
+ # $numbatches are zero, set $batchsize to make 256 batches.
+ #
+ if (($batchsize == 0) && ($numbatches == 0)) {
+ $numbatches = 256;
+ }
+
+ # If $batchsize is not specified, compute it.
+ #
+ if ($batchsize == 0) {
+ $batchsize = int(`wc -l < $path/2-filter/filteredHits` / $numbatches) + 1;
+ $batchsize = 10000 if ($batchsize < 10000);
+ }
+
+ # Adjust the sim4 qualities based on the final quality desired
+ #
+ $mini = 0 if ($mini < 0);
+ $minc = 0 if ($minc < 0);
+ $minl = 0 if ($minl < 0);
+
+ $minsim4i = $mini - 5 if ($mini - 5 < $minsim4i);
+ $minsim4c = $minc - 5 if ($minc - 5 < $minsim4c);
+ $minsim4l = $minl if ($minl < $minsim4l);
+
+ $minsim4i = 0 if ($minsim4i < 0);
+ $minsim4c = 0 if ($minsim4c < 0);
+ $minsim4l = 0 if ($minsim4l < 0);
+
+ # Save the parameters
+ #
+ open(F, "> $path/3-polish/parameters");
+ print F "$numbatches\n$batchsize\n";
+ print F "$mini\n$minc\n$minl\n";
+ print F "$minsim4i\n$minsim4c\n$minsim4l\n";
+ print F "$relink\n$always\n$aligns\n$abort\n$interspecies\n";
+ close(F);
+ }
+
+
+ # Build the sim4 command
+ #
+ open(F, "> $path/3-polish/polish.sh");
+ print F "#!/bin/sh\n";
+ print F "\n";
+ print F "jid=\$SGE_TASK_ID\n";
+ print F "if [ x\$jid = x -o x\$jid = xundefined ] ; then\n";
+ print F " if [ x\$1 = x ] ; then\n";
+ print F " echo \"ERROR: I need a job-id on the command line or in \$SGE_TASK_ID\"\n";
+ print F " exit 1\n";
+ print F " fi\n";
+ print F " jid=`expr \$1 + 1`\n";;
+ print F "fi\n";
+ print F "\n";
+ print F "jid=`head -\$jid $path/3-polish/partitions | tail -1`\n";
+ print F "\n";
+ print F "if [ -e \"$path/3-polish/\$jid.success\" ] ; then\n";
+ print F " exit\n";
+ print F "fi\n";
+ print F "\n";
+ print F "$prog{'sim4db'} \\\n";
+ print F " -cdna $path/0-input/cDNA.fasta \\\n";
+ print F " -genomic $path/0-input/genome/genome.seqStore \\\n";
+ print F " $aligns \\\n" if ($aligns ne "");
+ print F " $always \\\n" if ($always ne "");
+ print F " $relink \\\n" if ($relink ne "");
+ print F " $abort \\\n" if ($abort ne "");
+ print F " $interspecies \\\n" if ($interspecies ne "");
+ print F " -cut 0.6 \\\n";
+ print F " -mincoverage $minsim4c \\\n";
+ print F " -minidentity $minsim4i \\\n";
+ print F " -minlength $minsim4l \\\n";
+ print F " -script $path/3-polish/\$jid.sim4script \\\n";
+ print F " -output $path/3-polish/\$jid.sim4db \\\n";
+ print F " -YN $path/3-polish/\$jid.yn \\\n" if ($args{'sim4-yn'} == 1);
+ print F "&& \\\n";
+ print F "touch $path/3-polish/\$jid.success\n";
+ close(F);
+
+
+ # Splits the filteredHits into several pieces, and outputs a script
+ # that runs sim4db on those pieces.
+ #
+ if (! -e "$path/3-polish/partitions") {
+ print STDERR "ESTmapper/polish-- Creating scripts with $batchsize lines in each.\n";
+
+ my @idxs;
+ my $idx = "0000";
+
+ open(H, "< $path/2-filter/filteredHits");
+ while (!eof(H)) {
+ my $c = 0;
+
+ open(F, "> $path/3-polish/$idx.sim4script");
+ while (($c < $batchsize) && (!eof(H))) {
+ $_ = <H>;
+ print F $_;
+ $c++;
+ }
+ close(F);
+
+ push @idxs, "$idx\n";
+ $idx++;
+ }
+ close(H);
+
+ print STDERR "ESTmapper/polish-- Created $idx scripts.\n";
+
+ open(S, "> $path/3-polish/partitions");
+ print S @idxs;
+ close(S);
+ }
+
+
+ # Build a list of things to run.
+ #
+ my @jobsToRun;
+
+ open(F, "< $path/3-polish/partitions");
+ while (<F>) {
+ chomp;
+ push @jobsToRun, $_ if (! -e "$path/3-polish/$_.success");
+ }
+ close(F);
+
+
+ # Wipe any summaries, cDNA-* and polished files if we need to polish more stuff.
+ #
+ if (scalar(@jobsToRun) > 0) {
+ unlink "$path/cDNA-good.fasta";
+ unlink "$path/cDNA-goodshort.fasta";
+ unlink "$path/cDNA-lowquality.fasta";
+ unlink "$path/cDNA-missing.fasta";
+ unlink "$path/cDNA-repeat.fasta";
+ unlink "$path/cDNA-zero.fasta";
+ unlink "$path/polishes-aborted";
+ unlink "$path/polishes-good";
+ unlink "$path/polishes-goodshort";
+ unlink "$path/polishes-lowquality";
+ unlink "$path/summary";
+
+ # Display what parameters we are using
+ #
+ print STDERR "ESTmapper/polish-- more polishes to compute.\n";
+ print STDERR "ESTmapper/polish-- minidentity = $mini ($minsim4i)\n";
+ print STDERR "ESTmapper/polish-- mincoverage = $minc ($minsim4c)\n";
+ print STDERR "ESTmapper/polish-- minlength = $minl ($minsim4l)\n";
+ print STDERR "ESTmapper/polish-- relink = $relink\n";
+ print STDERR "ESTmapper/polish-- always = $always\n";
+ print STDERR "ESTmapper/polish-- aligns = $aligns\n";
+ print STDERR "ESTmapper/polish-- abort = $abort\n";
+ print STDERR "ESTmapper/polish-- interspecies = $interspecies\n";
+
+
+ # Run things, or tell the user to do it for us.
+ #
+ if (defined($args{'runlater'})) {
+ print STDERR "ESTmapper/polish-- Please run the jobs in\n";
+ print STDERR "ESTmapper/polish-- $path/3-polish/run.sh\n";
+ exit(0);
+ } elsif (defined($args{'sgename'})) {
+ print STDERR "ESTmapper/polish-- Submitting to SGE.\n";
+
+ # Don't resubmit jobs that are already done, and do
+ # submit the smallest number of jobs to finish.
+ # Bugs here should be fixed in 2-search.pl as well.
+
+ my @watchJobs;
+
+ my $fJob = shift @jobsToRun;
+ my $lJob = $fJob;
+
+ while (defined($lJob)) {
+ my $nJob = shift @jobsToRun;
+
+ if (($lJob + 1 != $nJob) || (!defined($nJob))) {
+
+ # SGE expects jobs to start at 1, but we start at 0.
+ $fJob++;
+ $lJob++;
+
+ print STDERR "Sumbit $fJob - $lJob (njob=$nJob)\n";
+
+ my $cmd;
+ $cmd = "qsub -cwd -j y -o $path/3-polish/sgeout-\\\$TASK_ID ";
+ $cmd .= " $args{'sgeoptions'} " if (defined($args{'sgeoptions'}));;
+ $cmd .= " $args{'sgepolish'} " if (defined($args{'sgepolish'}));
+ $cmd .= " -N \"p$args{'sgename'}.$fJob\" ";
+ $cmd .= " -t $fJob-$lJob ";
+ $cmd .= "$path/3-polish/polish.sh";
+
+ push @watchJobs, "p$args{'sgename'}.$fJob";
+
+ die "Failed to submit job to SGE.\n" if (runCommand($cmd));
+
+ $fJob = $nJob;
+ }
+ $lJob = $nJob;
+ }
+
+ submitFinish(@watchJobs);
+
+ print STDERR "ESTmapper/polish-- Finish submitted. See ya later!\n";
+
+ exit(0);
+ } else {
+ print STDERR "ESTmapper/polish-- Running locally, $numproc at a time.\n";
+
+ &scheduler::schedulerSetNumberOfProcesses($numproc);
+
+ foreach my $cmd (@jobsToRun) {
+ &scheduler::schedulerSubmit("/bin/sh $path/3-polish/polish.sh $cmd");
+ }
+
+ &scheduler::schedulerFinish();
+
+ #unlink "$path/3-polish/run.sh";
+ }
+ }
+
+
+ # Make sure that all the polishes are finished and OK.
+ # If not, print dire warnings and exit.
+ #
+ my $fail = 0;
+
+ open(F, "< $path/3-polish/partitions") or die "Failed to open '$path/3-polish/partitions'\n";;
+ while (<F>) {
+ chomp;
+ if (! -e "$path/3-polish/$_.success") {
+ $fail++;
+ print STDERR "ESTmapper/polish-- segment $_ failed.\n";
+ }
+ }
+ close(F);
+
+ die "Dang." if ($fail);
+
+ # Hooray! Now we're all done!
+
+ open(F, "> $args{'path'}/3-polish/allDone");
+ close(F);
+
+ print STDERR "ESTmapper: Polish script finished in ", time() - $startTime, " wall-clock seconds.\n" if (time() > $startTime + 5);
+}
+
+
+################################################################################
+#
+# Output
+#
+################################################################################
+
+
+# This is way too complicated.
+#
+# 1) Collect output from 3-polish, put into polishes-good
+# 2) Filter -> polishes-best
+#
+# Given as input a single polishes file and a cdna file,
+# we need an executable that:
+# Generate stats on mapping, good and best, missing, zero
+# Filter cDNA to good, missing, zero
+
+
+# assembleOutput -- the last stage run by every run style.
+#
+# Uses the global %args (path, minidentity, mincoverage, minlength,
+# cleanup, runstyle, sgename, savetemporary) and %prog, and:
+#   1) compares the requested quality levels against those saved in
+#      3-polish/parameters, wiping stale outputs if the filter changed;
+#   2) filters every *.sim4db polish into polishes-good / polishes-aborted;
+#   3) picks polishes-best for the 'mrna' and 'est' run styles;
+#   4) splits the input cDNA into best / good / zero-hit / missing files;
+#   5) writes the summary file;
+#   6) removes 1-search, 2-filter and 3-polish unless savetemporary is 1.
+#
+# Dies when a quality value is out of range, when 3-polish/parameters
+# is missing, or when one of the external commands fails.
+#
+sub assembleOutput {
+    my $startTime = time();
+
+    my $path = $args{'path'};
+    my $mini = ($args{'minidentity'} or 95);
+    my $minc = ($args{'mincoverage'} or 50);
+    my $minl = ($args{'minlength'} or 0);
+
+    # The parentheses matter: 'or' binds more loosely than '=', so
+    # without them the 100000 default is never assigned.
+    my $intronLimit = ($args{'cleanup'} or 100000);
+
+    print STDERR "ESTmapper: Performing an assembleOutput.\n";
+
+    (($mini < 0) || ($mini > 100)) and die "ERROR: ESTmapper/assembleOutput-- supply a value 0 <= x <= 100 for minidentity!\n";
+    (($minc < 0) || ($minc > 100)) and die "ERROR: ESTmapper/assembleOutput-- supply a value 0 <= x <= 100 for mincoverage!\n";
+    ($minl < 0) and die "ERROR: ESTmapper/assembleOutput-- supply a value x >= 0 for minlength!\n";
+
+
+
+    # Check that the filtering is compatible with the polishing.
+    #
+    if (-e "$path/3-polish/parameters") {
+        open(F, "< $path/3-polish/parameters");
+
+        # The first two lines (number of batches, batch size) are not needed here.
+        $_ = <F>;
+        $_ = <F>;
+
+        my $miniL = int(<F>);  # Quality values used for last filtering
+        my $mincL = int(<F>);
+        my $minlL = int(<F>);
+
+        my $miniP = int(<F>);  # Quality values used for polishing
+        my $mincP = int(<F>);
+        my $minlP = int(<F>);
+        close(F);
+
+        if ($mini < $miniP) {
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Percent identity quality level too low for existing polishing!\n";
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Polished at percent align-sequence identity = %3d, requested filtration at %3d.\n", $miniP, $mini;
+        }
+        if ($minc < $mincP) {
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Coverage quality level too low for existing polishing!\n";
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Polished at percent query-sequence identity = %3d, requested filtration at %3d.\n", $mincP, $minc;
+        }
+        if ($minl < $minlP) {
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Length quality level too low for existing polishing!\n";
+            printf STDERR "ESTmapper/assembleOutput-- WARNING: Polished at length = %3d, requested filtration at %3d.\n", $minlP, $minl;
+        }
+
+        # If the filter quality has changed, we need to refilter.  Remove
+        # the previously filtered outputs and print a message.
+        #
+        if (($mini != $miniL) ||
+            ($minc != $mincL) ||
+            ($minl != $minlL)) {
+            print STDERR "ESTmapper/assembleOutput-- filtering criteria changed; refiltering.\n";
+
+            printf STDERR "ESTmapper/assembleOutput-- identity: percent align-sequence identity: old=%3d new=%3d\n", $miniL, $mini;
+            printf STDERR "ESTmapper/assembleOutput-- coverage: percent query-sequence identity: old=%3d new=%3d\n", $mincL, $minc;
+            printf STDERR "ESTmapper/assembleOutput-- length: length in bp of match: old=%3d new=%3d\n", $minlL, $minl;
+
+            unlink "$path/polishes-good";
+            unlink "$path/polishes-best";
+            unlink "$path/polishes-lowquality";
+            unlink "$path/summary";
+        }
+    } else {
+        die "ESTmapper/assemblyOutput-- ERROR: Couldn't find polishing parameters. Script error.\n";
+    }
+
+
+
+    # If we're supposed to be running on the grid (an SGE job name was
+    # given) but SGE_TASK_ID is not set, resubmit and stop.  This can
+    # occur if the searches have finished, but the filter didn't, and
+    # we restart.
+    #
+    if (defined($args{'sgename'}) && !defined($ENV{'SGE_TASK_ID'})) {
+        submitFinish();
+        print STDERR "ESTmapper/filter-- Restarted LSF execution.\n";
+        exit;
+    }
+
+
+
+    if (! -e "$path/polishes-good") {
+        print STDERR "ESTmapper/assembleOutput-- filtering polishes by quality.\n";
+
+        print STDERR "ESTmapper/assembleOutput-- identity: percent align-sequence identity: $mini\n";
+        print STDERR "ESTmapper/assembleOutput-- coverage: percent query-sequence identity: $minc\n";
+        print STDERR "ESTmapper/assembleOutput-- length: length in bp of match: $minl\n";
+
+        # Find all the polishes, run them through the cleaner (only when
+        # 'cleanup' was requested), and filter by quality.
+        #
+        my $cmd;
+        $cmd  = "find $path/3-polish/ -name '*.sim4db' -print | sort | xargs -n 100 cat | ";
+        $cmd .= "$prog{'cleanPolishes'} -threshold $intronLimit -savejunk | " if (defined($args{'cleanup'}));
+        $cmd .= "$prog{'toFILTER'} -c $minc -i $mini -l $minl -o $path/polishes-good -j $path/polishes-aborted > /dev/null";
+
+        if (runCommand($cmd)) {
+            unlink "$path/polishes-good";
+            unlink "$path/polishes-aborted";
+            die "Failed.\n";
+        }
+
+        # Everything derived from the old polishes-good is now stale.
+        unlink "$path/polishes-best";
+        unlink "$path/cDNA-good.fasta";
+        unlink "$path/cDNA-missing.fasta";
+        unlink "$path/cDNA-repeat.fasta";
+        unlink "$path/cDNA-zero.fasta";
+        unlink "$path/summary";
+    }
+
+
+    if (! -e "$path/polishes-best") {
+        if ($args{'runstyle'} eq "mrna") {
+            print STDERR "ESTmapper/assembleOutput-- Picking the best mRNA polish.\n";
+            if (runCommand("$prog{'sortPolishes'} -m 400 -c < $path/polishes-good | $prog{'pickBest'} -mrna > $path/polishes-best")) {
+                unlink "$path/polishes-best";
+                die "Failed.";
+            }
+        } elsif ($args{'runstyle'} eq "est") {
+            print STDERR "ESTmapper/assembleOutput-- Picking the best EST polish.\n";
+            if (runCommand("$prog{'sortPolishes'} -m 400 -c < $path/polishes-good | $prog{'pickBest'} -est > $path/polishes-best")) {
+                unlink "$path/polishes-best";
+                die "Failed.";
+            }
+        } else {
+            print STDERR "ESTmapper/assembleOutput-- Not mRNA and not EST, so not picking the best polish.\n";
+        }
+    }
+
+    #
+    #  Segregate the sequences
+    #
+
+    # XXXX if the filter prints a list of repeats, we should add those here!
+
+    if (! -e "$path/cDNA-good.fasta") {
+
+        # Line N (zero based) of hitCounts is compared against zero; the
+        # index of every zero line is written to zero-hit-iid.
+        my $iid = 0;
+        open(F, "< $path/2-filter/hitCounts");
+        open(G, "> $path/zero-hit-iid");
+        while (<F>) {
+            if ($_ == 0) {
+                print G "$iid\n";
+            }
+            $iid++;
+        }
+        close(G);
+        close(F);
+
+        my $cmd;
+        $cmd  = "$prog{'terminate'}";
+        $cmd .= " -P $path/polishes-best $path/cDNA-best.fasta";
+        $cmd .= " -P $path/polishes-good $path/cDNA-good.fasta";
+        $cmd .= " -I $path/zero-hit-iid $path/cDNA-zero.fasta";
+        $cmd .= " -O $path/cDNA-missing.fasta";
+        $cmd .= " -i $path/0-input/cDNA.fasta";
+        print $cmd;
+        if (runCommand($cmd)) {
+            rename "$path/cDNA-good.fasta", "$path/cDNA-good.fasta.FAILED";
+            rename "$path/cDNA-missing.fasta", "$path/cDNA-missing.fasta.FAILED";
+            rename "$path/cDNA-zero.fasta", "$path/cDNA-zero.fasta.FAILED";
+            die "Failed.\n";
+        }
+
+        # Remove the temporary list from where it was written ($path),
+        # not from the current directory.
+        unlink "$path/zero-hit-iid";
+    }
+
+    #
+    #  Summarize
+    #
+
+    if ((! -e "$path/summary") || (-z "$path/summary")) {
+        my ($mat, $est, $scf);
+
+        open(F, "> $path/summary") or die "ESTmapper/assembleOutput-- Failed to open '$path/summary' for writing: $!\n";
+
+        print STDERR "ESTmapper/assembleOutput-- counting 'good' matches.\n";
+        ($mat, $est, $scf) = summarizePolishes("$path/polishes-good");
+        print F "GOOD: >= $mini% identity, >= $minc% composite, >= $minl bp\n";
+        if ($mat > 0) {
+            print F "cDNA-genomic matches $mat matches ($est different cDNA and $scf genomic)\n";
+            print F "Matches per cDNA ", int(10000 * $mat / $est) / 10000.0, " matches/cDNA\n";
+            print F "Matches per genomic ", int(10000 * $mat / $scf) / 10000.0, " matches/genomic\n";
+        } else {
+            print F "cDNA-genomic matches None.\n";
+        }
+        print F "\n";
+
+        print STDERR "ESTmapper/assembleOutput-- counting cDNA.\n";
+        print F "cDNA COUNTS:\n";
+
+        # cDNA-repeat.fasta is optional.  A plain conditional expression is
+        # used because 'my $x = ... if (...)' has unreliable semantics in Perl.
+        my $haverept = (-e "$path/cDNA-repeat.fasta");
+
+        my $cnttotl = int(`grep -c '^>' $path/0-input/cDNA.fasta`);
+        my $cntgood = int(`grep -c '^>' $path/cDNA-good.fasta`);
+        my $cntmiss = int(`grep -c '^>' $path/cDNA-missing.fasta`);
+        my $cntrept = $haverept ? int(`grep -c '^>' $path/cDNA-repeat.fasta`) : 0;
+        my $cntzero = int(`grep -c '^>' $path/cDNA-zero.fasta`);
+
+        # Percentage of the input cDNA; zero for an empty input so that
+        # the report never divides by zero.
+        my $percent = sub { return(($cnttotl > 0) ? (100 * $_[0] / $cnttotl) : 0); };
+
+        printf F "cDNA: %8d\n", $cnttotl;
+        printf F "cDNA-good: %8d (%8.4f%%)\n", $cntgood, $percent->($cntgood);
+        printf F "cDNA-missing: %8d (%8.4f%%)\n", $cntmiss, $percent->($cntmiss);
+        printf F "cDNA-repeat: %8d (%8.4f%%)\n", $cntrept, $percent->($cntrept) if ($haverept);
+        printf F "cDNA-zero: %8d (%8.4f%%)\n", $cntzero, $percent->($cntzero);
+
+        # Flush the report now instead of leaving the handle open.
+        close(F);
+    }
+
+
+    #
+    #  All done!
+    #
+    if ($args{'savetemporary'} != 1) {
+        if (runCommand("rm -rf $path/1-search $path/2-filter $path/3-polish")) {
+            print STDERR "ESTmapper/assembleOutput-- WARNING: Failed to remove temporary directories.\n";
+        }
+    }
+
+
+    print STDERR "ESTmapper: assembleOutput script finished in ", time() - $startTime, " wall-clock seconds.\n" if (time() > $startTime + 5);
+}
+
+
+
+######################################################################
+#
+# Generates a report on a set of polishes.
+#
+# number of cDNA-scaffold matches
+# number of different cDNA sequences in the set
+# number of different scaffolds in the set
+#
+sub summarizePolishes {
+    # Takes a list of polish file names and returns a three element list:
+    #   the number of 'sim4begin' lines (matches),
+    #   the number of distinct 'edef=' lines (cDNA deflines), and
+    #   the number of distinct 'ddef=' lines (genomic deflines).
+    #
+    # A file that cannot be opened is reported on STDERR and skipped,
+    # so it contributes nothing to the counts.
+    #
+    my (@files) = @_;
+
+    my %est;
+    my %scf;
+    my $mat  = 0;
+    my $ests = 0;
+    my $scfs = 0;
+
+    foreach my $infile (@files) {
+        my $input;
+
+        if (!open($input, "<", $infile)) {
+            print STDERR "WARNING: summarizePolishes couldn't open '$infile': $!\n";
+            next;
+        }
+
+        while (<$input>) {
+            if (m/^sim4begin$/) {
+                $mat++;
+            } elsif (m/^edef=/) {
+                $ests++;
+                $est{$_} = 1;
+            } elsif (m/^ddef=/) {
+                $scfs++;
+                $scf{$_} = 1;
+            }
+        }
+
+        close($input);
+    }
+
+    # Every match is expected to carry exactly one edef and one ddef line.
+    if (($ests != $mat) || ($scfs != $mat)) {
+        print STDERR "WARNING: summarizePolishes counted\n";
+        print STDERR " $mat matches\n";
+        print STDERR " $ests cDNA deflines\n";
+        print STDERR " $scfs scaffold deflines\n";
+        print STDERR " The number of deflines and the number of matches should be the same!\n";
+    }
+
+    return($mat, (scalar (keys %est)), (scalar (keys %scf)));
+}
+
+
+################################################################################
+#
+# Utilities for Main
+#
+################################################################################
+
+
+sub parseSNP {
+    # Runs the SNP parser over the sorted 'good' polishes, unless
+    # snps-parsed already exists.  SNP specific options are picked out
+    # of @ARGV and handed on to the parser.
+    #
+    my $dir = $args{'path'};
+
+    return if (-e "$dir/snps-parsed");
+
+    print STDERR "ESTmapper-- Parsing the SNPs\n";
+
+    # Sort, if needed.
+    #
+    if (! -e "$dir/polishes-good.sorted") {
+        print STDERR "ESTmapper-- Sorting polishes by sequence ID; using 2GB memory maximum.\n";
+        my $sortFailed = runCommand("$prog{'sortPolishes'} -m 2000 -c < $dir/polishes-good > $dir/polishes-good.sorted");
+        if ($sortFailed) {
+            unlink "$dir/polishes-good.sorted";
+            die "Failed to sort the polishes.\n";
+        }
+    }
+
+    # How each ESTmapper option is spelled for the parser; %s is the
+    # option's value.
+    #
+    my @known = ("-snpdelimiter", "-snpsizetag", "-snppostag", "-snpoffset", "-snpoutformat");
+    my %spelling = (
+        "-snpdelimiter" => '-d "%s"',
+        "-snpsizetag"   => '-s "%s"',
+        "-snppostag"    => '-p "%s"',
+        "-snpoffset"    => '-o %s',
+        "-snpoutformat" => '-format %s',
+    );
+    my %chosen = map { $_ => "" } @known;
+
+    # Walk a copy of the command line; a recognized option consumes the
+    # word after it, anything else is stepped over one word at a time.
+    #
+    my @words = @ARGV;
+    while (scalar @words > 0) {
+        my $word = shift @words;
+        next if (!exists($spelling{$word}));
+        my $value = shift @words;
+        $chosen{$word} = sprintf($spelling{$word}, $value);
+    }
+
+    # PARSE!
+    #
+    my $parse = join(" ",
+                     $prog{'parseSNPs'},
+                     @chosen{@known},
+                     "-F $dir/snps-failed",
+                     "-O $dir/snps-parsed",
+                     "< $dir/polishes-good.sorted",
+                     "> $dir/summary-snps");
+
+    if (runCommand($parse)) {
+        unlink "$dir/snps-failed";
+        unlink "$dir/snps-parsed";
+        unlink "$dir/summary-snps";
+        die "Failed to parse SNP locations from polishes.\n";
+    }
+}
+
+
+# Converts a number of seconds into (hours, minutes, seconds).
+#
+# The input is truncated to whole seconds and split with integer
+# arithmetic, so the third value is the seconds left over after the
+# hours and minutes are removed (0..59 for non-negative input), not
+# the original total.
+#
+sub sTOhms ($) {
+    my ($total) = @_;
+
+    $total = int($total);
+
+    my $h = int($total / 3600);
+    my $m = int(($total % 3600) / 60);
+    my $s = $total % 60;
+
+    return($h,$m,$s);
+}
+
+
+################################################################################
+#
+# Main
+#
+################################################################################
+
+
+setExecutables();
+parseArgs(@ARGV);
+
+# The three known run styles execute the same five stages.  They differ
+# only in the arguments preset beforehand, and in 'snp' parsing the SNPs
+# afterwards.  Any other run style just gets a message.
+#
+if (($args{'runstyle'} eq "est") ||
+    ($args{'runstyle'} eq "mrna") ||
+    ($args{'runstyle'} eq "snp")) {
+
+    # Decided before any stage runs, like the original branch selection.
+    my $wantSNPs = ($args{'runstyle'} eq "snp");
+
+    if ($args{'runstyle'} eq "mrna") {
+        $args{'relink'} = 1000;
+        $args{'abort'}  = 1;
+    }
+
+    if ($wantSNPs) {
+        $args{'minidentity'} = 95;
+        $args{'mincoverage'} = 80;
+    }
+
+    configure();
+    search();
+    filter();
+    polish();
+    assembleOutput();
+
+    parseSNP() if ($wantSNPs);
+} else {
+    print STDERR "Basic help N/A.\n";
+}
+
+# Report the elapsed time, unless no measurable time has passed.
+if (time() != $args{'startTime'}) {
+    print STDERR "ESTmapper: script finished everything in ", time() - $args{'startTime'}, " wall-clock seconds.\n";
+}
+
+
+# Append the end time to the run information file, if there is one.
+if (-e $args{'runInforFile'}) {
+    my $now = time();
+
+    open(F, ">> $args{'runInforFile'}");
+    print F "endTime: $now (", scalar(localtime($now)), ")\n";
+    close(F);
+}
+
+exit(0);
diff --git a/ESTmapper/Make.include b/ESTmapper/Make.include
new file mode 100644
index 0000000..96a7a7b
--- /dev/null
+++ b/ESTmapper/Make.include
@@ -0,0 +1,21 @@
+# -*- makefile -*-
+
+# Build description for the ESTmapper directory.  '$/' is used throughout
+# as the prefix for paths in this directory; presumably it is set by the
+# makefile that includes this fragment -- TODO confirm.
+
+# Resolved paths (with a trailing slash) of the sibling library
+# directories, located relative to this directory.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+
+# Add the library directories to the include path of every .d and .o
+# target in this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBKMER/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBSIM4/} -I${LIBUTL/})
+
+# C++ sources and the executables built from them.
+$/.CXX_SRCS := $/mergeCounts.C $/terminate.C
+$/.CXX_EXES := $/mergeCounts $/terminate
+
+# Perl scripts and modules that belong to this directory.
+$/.PERL_EXES := $/ESTmapper.pl $/configureESTmapper.pl $/runConcurrently.pl
+$/.PERL_LIBS := $/scheduler.pm
+
+# Link dependencies: mergeCounts needs only its own object, terminate
+# also depends on the sim4, seq, bio and util static libraries.
+$/mergeCounts: $/mergeCounts.o
+$/terminate: $/terminate.o ${LIBSIM4/}libsim4.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Files listed for the clean and real-clean steps.
+$/.CLEAN := $/*.o
+$/.REAL-CLEAN := $/mergeCounts $/terminate
diff --git a/ESTmapper/configureESTmapper.pl b/ESTmapper/configureESTmapper.pl
new file mode 100644
index 0000000..7691ccf
--- /dev/null
+++ b/ESTmapper/configureESTmapper.pl
@@ -0,0 +1,416 @@
+#!/usr/bin/perl
+
+use strict;
+use FindBin;
+use Config; # for @signame
+use lib "$FindBin::Bin/util";
+
+# Helper programs are expected next to this script.
+my $exechome = "$FindBin::Bin";
+my $leaff = "$exechome/leaff";
+my $posdb = "$exechome/positionDB";
+my $meryl = "$exechome/meryl";
+
+# Command line settings and their defaults; see the usage message
+# printed by the main code for the matching options.
+my $genome = undef; # -genome, the genome fasta file (required)
+my $genomedir = undef; # -genomedir, where the configuration is saved (required)
+my $mersize = 20; # -mersize
+my $merskip = 0; # -merskip
+my $memory = 1000; # -memory, in MB
+my $segments = 0; # -segments, 0 means size the segments from $memory
+my $local = 1; # run the jobs right now; cleared by -sge
+my $sge = undef; # -sge, extra arguments handed to qsub
+my $sgename = "EMconfig"; # -sgename, the SGE job name
+
+
+################################################################################
+#
+# Utility to run a command and check the exit status (sadly, duplicated
+# in ESTmapper.pl).
+#
+################################################################################
+
+
+sub runCommand {
+    # Echoes $cmd to STDERR, runs it with system(), and returns 0 if it
+    # exited with status zero.  Otherwise a description of the failure
+    # is printed to STDERR and 1 is returned.
+    #
+    my $cmd = shift @_;
+
+    print STDERR "$cmd\n";
+
+    my $rc = system($cmd);
+
+    return(0) if ($rc == 0);
+
+    my $error = "ERROR: $cmd\n failed with ";
+
+    # The status is decoded as described for system() in perlfunc: -1
+    # means the command could not be started, the low seven bits hold
+    # the terminating signal, bit 0x80 flags a core dump, and the high
+    # byte is the exit status.
+
+    if ($rc == -1) {
+        $error .= "$!\n";
+    } elsif (($rc & 0x7f) == 0) {
+        $error .= "exit status " . ($rc >> 8) . "\n";
+    } else {
+        my $signal = $rc & 0x7f;
+
+        # Signal names, indexed by signal number, when Config has them.
+        my @signame;
+        @signame = split('\s+', $Config{sig_name}) if (defined($Config{sig_name}));
+
+        $error .= "coredump from " if ($rc & 0x80);
+
+        if (defined($signame[$signal])) {
+            $error .= "signal $signame[$signal]\n";
+        } else {
+            $error .= "signal $signal\n";
+        }
+    }
+
+    print STDERR $error;
+
+    return(1);
+}
+
+
+################################################################################
+#
+# Main
+#
+################################################################################
+
+
+# Parse the command line.  Options that simply store the following word
+# are table driven; the remaining ones are handled one by one.
+#
+{
+    my %stringOption = (
+        "-genome"    => \$genome,
+        "-genomedir" => \$genomedir,
+        "-sgename"   => \$sgename,
+    );
+    my %integerOption = (
+        "-mersize"   => \$mersize,
+        "-merskip"   => \$merskip,
+        "-memory"    => \$memory,
+        "-segments"  => \$segments,
+    );
+
+    while (scalar(@ARGV)) {
+        my $arg = shift @ARGV;
+
+        if (exists($stringOption{$arg})) {
+            ${$stringOption{$arg}} = shift @ARGV;
+        } elsif (exists($integerOption{$arg})) {
+            ${$integerOption{$arg}} = int(shift @ARGV);
+        } elsif ($arg eq "-sge") {
+            $local = undef;
+            $sge   = shift @ARGV;
+        } elsif ($arg eq "-local") {
+            $local = 1;
+        } elsif ($arg eq "-h") {
+            # Forget the required options, so the usage is printed, and
+            # stop parsing.
+            undef $genome;
+            undef $genomedir;
+            undef @ARGV;
+        } elsif ($arg eq "-justtestingifitworks") {
+            exit(0);
+        } else {
+            die "ERROR: unknown arg '$arg'\n";
+        }
+    }
+}
+
+# Both -genome and -genomedir are required; without them, explain and fail.
+#
+if (!defined($genome) || !defined($genomedir)) {
+    print STDERR join("\n",
+                      "usage: $0 -genome g.fasta -genomedir /some/path [args]",
+                      " -genome g.fasta the genome to map to",
+                      " -genomedir d the directory to save the configuration in",
+                      "",
+                      " -mersize m use m-mers (default 20)",
+                      " -merskip s skip s m-mers between mers (default 0, use all mers)",
+                      " -memory M use M MB memory for the search processes (default 1000MB)",
+                      " -segments S use S search processes (default, based on memory size)",
+                      " -sge compute the configuration on the grid; args are passed to qsub",
+                      " -sgename sge job name (default 'EMconfig')",
+                      " -local compute the configuration right now (the default)",
+                      "",
+                      " This precomputes search tables for ESTmapper.",
+                      " Both -genome and -genomedir must be specified.",
+                      " One of -memory and -segments should be specified.",
+                      "",
+                      "Example:",
+                      " configureESTmapper.pl -genome B35LC.fasta -genomedir B35LC -memory 900 -sge \"-pe thread 2\"",
+                      "",
+                      "");
+    exit(1);
+}
+
+# Make both paths absolute.  The working directory comes from $PWD
+# when the environment provides it, and from `pwd` otherwise.  (The
+# `pwd` fallback used to sit after an unconditional "$ENV{'PWD'}/"
+# prefix, so it could never run.)
+#
+{
+    my $cwd = $ENV{'PWD'};
+
+    if (!defined($cwd) || ($cwd eq "")) {
+        $cwd = `pwd`;
+        chomp $cwd;
+    }
+
+    $genome    = "$cwd/$genome"    if ($genome    !~ m!^/!);
+    $genomedir = "$cwd/$genomedir" if ($genomedir !~ m!^/!);
+}
+
+system("mkdir -p $genomedir") if (! -d $genomedir);
+
+die "Can't find genome '$genome'\n" if (! -e $genome);
+die "Can't find output directory '$genomedir'\n" if (! -d $genomedir);
+
+print STDERR "Configuring ESTmapper:\n";
+print STDERR " merSize $mersize\n";
+print STDERR " merSkip $merskip\n";
+print STDERR " ${memory}MB\n" if (defined($memory));
+print STDERR " $segments segments\n" if (defined($segments));
+
+# The genome is referenced through a symlink inside the output directory.
+symlink "${genome}", "$genomedir/genome.fasta" if ((! -f "$genomedir/genome.fasta"));
+
+print STDERR "configureESTmapper-- Initializing positionDB creation.\n";
+
+# Build the seqStore once.  The output of leaff is kept in
+# genome.seqStore.out, which is read again later in this script.
+#
+if (! -e "$genomedir/genome.seqStore") {
+    if (runCommand("$leaff -f $genomedir/genome.fasta --seqstore $genomedir/genome.seqStore > $genomedir/genome.seqStore.out 2>&1")) {
+        unlink "$genomedir/genome.seqStore";
+        die "Failed.\n";
+    }
+}
+
+# Decide how large each segment is, and write the list of segments.
+#
+my $acgtInFile     = 0;
+my $acgtPerSegment = 0;
+my $segmentOverlap = 10000000;
+
+# The number of ACGT letters is taken from the output leaff wrote when
+# building the seqStore; the last matching line wins.
+#
+open(F, "< $genomedir/genome.seqStore.out") or die "Can't read $genomedir/genome.seqStore.out: $!\n";
+while (<F>) {
+    if (m/\s+(\d+)\s+ACGT\s+letters/) {
+        $acgtInFile = $1;
+    }
+}
+close(F);
+
+print STDERR "Found $acgtInFile ACGT in the input.\n";
+die "No ACGT found?\n" if ($acgtInFile <= 0);
+
+# XXX: Magic Number! 12 bytes per base!
+
+if ($memory > 0) {
+    $acgtPerSegment = int($memory / 12 * 1000000) + 1;
+    print STDERR "configureESTmapper-- packing to preserve ${memory}MB memory limit ($acgtPerSegment mers per segment)\n";
+}
+
+# An explicit segment count overrides the memory based size.
+#
+if ($segments > 0) {
+    $acgtPerSegment = int($acgtInFile / $segments + $segmentOverlap) + 1;
+    print STDERR "configureESTmapper-- packing to preserve $segments processor limit ($acgtPerSegment mers per segment)\n";
+}
+
+# Each segment starts ($acgtPerSegment - $segmentOverlap) after the
+# previous one.  If that step is not positive the loop below never
+# reaches the end of the genome, so refuse such a configuration.
+#
+if ($acgtPerSegment <= $segmentOverlap) {
+    my $minMemory = int($segmentOverlap * 12 / 1000000);
+    die "configureESTmapper-- ERROR: $acgtPerSegment mers per segment does not exceed the segment overlap of $segmentOverlap; use -memory larger than ${minMemory}MB, or use -segments.\n";
+}
+
+$memory = int($acgtPerSegment * 12 / 1000000);
+
+open(F, "> $genomedir/memoryLimit") or die "Can't write $genomedir/memoryLimit\n";
+print F "$memory\n";
+close(F);
+
+
+my $merBeg = 0;
+my $merEnd = 0;
+my $segId  = "000";
+
+# 'segments' lists the segment names; 'create.dat' adds the first and
+# last mer of each.  $segId is a string, so ++ steps "000" to "001".
+#
+open(F, "> $genomedir/segments")   or die "Can't write $genomedir/segments: $!\n";
+open(S, "> $genomedir/create.dat") or die "Can't write $genomedir/create.dat: $!\n";
+while ($merBeg < $acgtInFile) {
+    $merEnd = $merBeg + $acgtPerSegment;
+
+    print F "$segId\n";
+    print S "$segId $merBeg $merEnd\n";
+
+    $merBeg += $acgtPerSegment - $segmentOverlap;
+    $segId++;
+}
+close(F);
+close(S);
+
+print STDERR "configureESTmapper-- Created $segId groups with maximum memory requirement of ${memory}MB.\n";
+
+die "Created no groups?\n" if ($segId eq "000");
+
+
+# Configure meryl, unless genome.merylArgs is already present.  All
+# output of the configuration run goes to meryl.config.out.
+#
+if (! -e "$genomedir/genome.merylArgs") {
+    my $configure = join(" ",
+                         $meryl,
+                         "-B -L 5 -f -m $mersize -segments $segId -configbatch",
+                         "-s $genomedir/genome.seqStore",
+                         "-o $genomedir/genome",
+                         "> $genomedir/meryl.config.out 2>&1");
+
+    runCommand($configure) and die "Failed.\n";
+}
+
+
+# Create the script that builds the positionDB's and meryl partitions
+#
+# If there is only one segment ($segId == "000") then meryl doesn't
+# use the batch mechanism; the meryl in create.sh writes the final
+# output.
+#
+# The script takes its 1-based job index from SGE_TASK_ID, or from its
+# first argument, and looks up the matching line of create.dat.  The
+# positionDB is built under a temporary name and renamed on success.
+# If the meryl count fails, both the batch files and the final
+# genome.mcidx / genome.mcdat are removed.
+
+open(F, "> $genomedir/create.sh") or die "Can't write $genomedir/create.sh: $!\n";
+print F "#!/bin/sh\n";
+print F "\n";
+print F "jobid=\$SGE_TASK_ID\n";
+print F "if [ x\$jobid = x -o x\$jobid = xundefined ]; then\n";
+print F " jobid=\$1\n";
+print F "fi\n";
+print F "if [ x\$jobid = x ]; then\n";
+print F " echo Error: I need SGE_TASK_ID set, or a job index on the command line.\n";
+print F " exit 1\n";
+print F "fi\n";
+print F "jobp=`cat $genomedir/create.dat | head -n \$jobid | tail -n 1`\n";
+print F "\n";
+print F "seg=`echo \$jobp | awk '{ print \$1 }'`\n";
+print F "beg=`echo \$jobp | awk '{ print \$2 }'`\n";
+print F "end=`echo \$jobp | awk '{ print \$3 }'`\n";
+print F "\n";
+print F "if [ ! -e \"$genomedir/seg\$seg.posDB\" ] ; then\n";
+print F " $posdb \\\n";
+print F " -mersize $mersize \\\n";
+print F " -merbegin \$beg \\\n";
+print F " -merend \$end \\\n";
+print F " -sequence \"$genomedir/genome.seqStore\" \\\n";
+print F " -output \"$genomedir/seg\$seg.building.posDB\" \\\n";
+print F " > \"$genomedir/seg\$seg.building.posDB.err\" 2>&1 \\\n";
+print F " && \\\n";
+print F " rm -f \"$genomedir/seg\$seg.building.posDB.err\" \\\n";
+print F " && \\\n";
+print F " mv \"$genomedir/seg\$seg.building.posDB\" \\\n";
+print F " \"$genomedir/seg\$seg.posDB\"\n";
+print F "fi\n";
+print F "\n";
+print F "bat=`expr \$jobid - 1`\n";
+print F "\n";
+print F "if [ ! -e \"$genomedir/genome.batch\$bat.mcdat\" -o ! -e \"$genomedir/genome.mcdat\" ] ; then\n";
+print F " $meryl \\\n";
+print F " -countbatch \$bat \\\n";
+print F " -o \"$genomedir/genome\" \\\n";
+print F " || \\\n";
+print F " rm -f \"$genomedir/genome.batch\$bat.mcidx\" \\\n";
+print F " \"$genomedir/genome.batch\$bat.mcdat\" \\\n";
+print F " \"$genomedir/genome.mcidx\" \\\n";
+print F " \"$genomedir/genome.mcdat\"\n";
+print F "fi\n";
+close(F);
+
+
+# Create the script that merges meryl outputs
+#
+# It has two steps, each skipped when its output exists and each
+# removing its output on failure: merge the batches into genome.mcidx
+# and genome.mcdat, then write frequentMers-ge1000.fasta.
+#
+my @merylScript = (
+    "#!/bin/sh",
+    "",
+    "if [ ! -e \"$genomedir/genome.mcidx\" ] ; then",
+    " $meryl \\",
+    " -mergebatch \\",
+    " -o \"$genomedir/genome\" \\",
+    " || \\",
+    " rm -f \"$genomedir/genome.mcidx\" \\",
+    " \"$genomedir/genome.mcdat\"",
+    "fi",
+    "",
+    "if [ ! -e \"$genomedir/frequentMers-ge1000.fasta\" ] ; then",
+    " $meryl \\",
+    " -Dt -n 1000 \\",
+    " -s \"$genomedir/genome\" \\",
+    " > \"$genomedir/frequentMers-ge1000.fasta\" \\",
+    " || \\",
+    " rm -f \"$genomedir/frequentMers-ge1000.fasta\"",
+    "fi",
+);
+
+open(F, "> $genomedir/meryl.sh");
+foreach my $line (@merylScript) {
+    print F "$line\n";
+}
+close(F);
+
+
+
+########################################
+#
+# run the jobs.
+#
+# Either every segment is built here, one after another, followed by
+# the meryl merge ($local), or the same scripts are submitted to SGE
+# with qsub ($sge).  Segment names are three digit strings running
+# from "000" up to, but not including, $segId.
+#
+# NOTE(review): the substr(..., -3) below keeps only three digits, so
+# both loops appear to assume fewer than 1000 segments -- confirm.
+#
+if ($local) {
+ my $seg = "000";
+
+ while ($seg ne $segId) {
+ # Copy $seg (a string) into $s (an integer).
+ my $s = int($seg);
+
+ print STDERR "Creating $seg out of $segId\n";
+
+ # Build the segment if either of its two outputs is missing.
+ # create.sh wants a job index starting at 1, hence the increment.
+ if ((! -e "$genomedir/seg$seg.posDB") || (! -e "$genomedir/genome.batch$s.mcdat")) {
+ $s++;
+ runCommand("/bin/sh $genomedir/create.sh $s") and die "Segment $seg failed.\n";
+ }
+
+ # Step to the next name, padded back to three digits.
+ $seg++;
+ $seg = substr("000$seg", -3);
+ }
+ runCommand("/bin/sh $genomedir/meryl.sh") and die "Meryl failed.\n";
+} elsif ($sge) {
+
+ # Check if we need to submit pieces of the array, or if we can submit the whole thing.
+ #
+ # @ap is indexed by segment number and set to 1 for every segment
+ # whose posDB is missing; $wholeThing counts those segments.
+ #
+ my @ap;
+ my $wholeThing = 0;
+
+ system("mkdir $genomedir/sgeout") if (! -d "$genomedir/sgeout");
+
+ # The build jobs share one name, made unique with the current time,
+ # which the merge job below uses in its -hold_jid option.
+ my $sgebuildname = "$sgename." . time();
+
+ my $seg = "000";
+ while ($seg ne $segId) {
+ if (-e "$genomedir/seg$seg.posDB") {
+ #print STDERR "Segment $seg finished successfully!\n";
+ } else {
+ #print STDERR "Segment $seg failed.\n";
+ $ap[$seg] = 1;
+ $wholeThing++;
+ }
+
+ $seg++;
+ $seg = substr("000$seg", -3);
+ }
+
+ # After the loop $seg equals $segId, so this numeric comparison asks
+ # whether every segment is missing.
+ if ($wholeThing == $seg) {
+ # Yippee! Submit all at once!
+ #
+ if (runCommand("qsub -cwd -j y -o $genomedir/sgeout/seg\\\$TASK_ID.out -t 1-$segId $sge -N $sgebuildname $genomedir/create.sh")) {
+ die "SGE submission failed?\n";
+ }
+ } elsif ($wholeThing > 0) {
+ # Dang, we need to submit individually....or we can take five
+ # minutes and figure out ranges to submit.
+ #
+ # $st and $ed are the first and last task of the current run of
+ # missing segments, in the 1-based numbering used for -t.
+ #
+ my $st;
+ my $ed;
+ my $it = 0;
+
+ # +2 so that we run off the end -- ensuring that we submit
+ # even the last batch of jobs.
+
+ while ($it < $segId + 2) {
+ if (!defined($st) && ($ap[$it] == 1)) {
+ # SGE wants to start at 1, we start at 0.
+ $st = $it + 1;
+ }
+ # An unset @ap entry compares equal to 0 here, which ends the run.
+ if (defined($st) && !defined($ed) && ($ap[$it] == 0)) {
+ # SGE wants to start at 1, we start at 0.
+ $ed = $it;
+ }
+ if (defined($st) && defined($ed)) {
+ #print STDERR "submit $st - $ed\n";
+ if (runCommand("qsub -cwd -j y -o $genomedir/sgeout/seg\\\$TASK_ID.out -t $st-$ed $sge -N $sgebuildname $genomedir/create.sh")) {
+ die "SGE submission failed?\n";
+ }
+ undef $st;
+ undef $ed;
+ }
+
+ $it++;
+ }
+ } else {
+ print STDERR "All segments computed successfully!\n";
+ }
+ # The merge is always submitted, held on the build job name.
+ if (runCommand("qsub -cwd -j y -o $genomedir/sgeout/meryl.out $sge -hold_jid $sgebuildname -N $sgename $genomedir/meryl.sh")) {
+ die "SGE submission failed?\n";
+ }
+} else {
+ die "HELP! I don't know how to run jobs!\n";
+}
+
diff --git a/ESTmapper/mergeCounts.C b/ESTmapper/mergeCounts.C
new file mode 100644
index 0000000..459133a
--- /dev/null
+++ b/ESTmapper/mergeCounts.C
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+//  Adds up, line by line, the integer counts stored in the files named
+//  on the command line, and writes one sum per line to stdout.
+//
+//  All files must have the same number of lines; if some files end
+//  before others, an error is reported and the exit status is 1.  A
+//  file that cannot be opened or read also ends the program with
+//  status 1.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc == 1) {
+    fprintf(stderr, "usage: %s <count-1> <count-2> <....>\n", argv[0]);
+    fprintf(stderr, " This is part of the ESTmapper; you don't want to run it by hand.\n");
+    exit(1);
+  }
+
+  int numFiles = argc-1;
+
+  FILE **Fs = new FILE * [numFiles];
+  for (int i=1; i<argc; i++) {
+
+    //  Success is decided by the returned pointer; errno is only
+    //  meaningful after fopen() has reported a failure.
+    Fs[i-1] = fopen(argv[i], "r");
+    if (Fs[i-1] == NULL) {
+      fprintf(stderr, "%s: ERROR: couldn't open %s: %s\n", argv[0], argv[i], strerror(errno));
+      exit(1);
+    }
+  }
+
+  char buf[256];
+  int  eof = 0;
+
+  //  'eof' counts the files that had no more lines in this round.  The
+  //  loop stops at the first round in which any file ran out.
+  while (eof == 0) {
+    int count = 0;
+    for (int i=0; i<numFiles; i++) {
+      if (fgets(buf, 256, Fs[i]) == NULL) {
+
+        //  NULL is either end of file or a read error; the latter must
+        //  not be mistaken for a line, or for a clean end.
+        if (ferror(Fs[i])) {
+          fprintf(stderr, "%s: ERROR: couldn't read %s: %s\n", argv[0], argv[i+1], strerror(errno));
+          exit(1);
+        }
+        eof++;
+      } else {
+
+        //  A line was read.  It is counted even if it is a last line
+        //  without a trailing newline.
+        count += atoi(buf);
+      }
+    }
+    if (eof == 0)
+      fprintf(stdout, "%d\n", count);
+  }
+
+  //  Unless every file ran out in the same round, the files have
+  //  different lengths.
+  if (eof != numFiles) {
+    fprintf(stderr, "%s: ERROR: Short read on a count file.\n", argv[0]);
+    exit(1);
+  }
+
+  //  Let the OS cleanup for us...
+
+  exit(0);
+}
diff --git a/ESTmapper/runConcurrently.pl b/ESTmapper/runConcurrently.pl
new file mode 100644
index 0000000..ef22e7b
--- /dev/null
+++ b/ESTmapper/runConcurrently.pl
@@ -0,0 +1,16 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use scheduler;
+
+# Read commands from STDIN, one per line, and run them concurrently.
+# The first command line argument, when given, replaces the default
+# limit of 4 simultaneous processes.
+my $concurrency = (scalar(@ARGV) > 0) ? $ARGV[0] : 4;
+
+&scheduler::schedulerSetNumberOfProcesses($concurrency);
+
+while (my $command = <STDIN>) {
+  &scheduler::schedulerSubmit($command);
+}
+
+&scheduler::schedulerFinish();
diff --git a/ESTmapper/scheduler.pm b/ESTmapper/scheduler.pm
new file mode 100644
index 0000000..cfdf39d
--- /dev/null
+++ b/ESTmapper/scheduler.pm
@@ -0,0 +1,103 @@
+#!/usr/local/bin/perl
+#
+# Functions for running multiple processes at the same time.
+#
+
+package scheduler;
+
+use strict;
+use POSIX "sys_wait_h";
+
+# Called by "use scheduler;"
+# The body is empty, so no symbols are exported into the caller's
+# namespace.
+sub import () {
+}
+
+# Limit on the number of child processes running at once; assigned by
+# schedulerSetNumberOfProcesses().
+my $numberOfProcesses = 0;
+# Commands waiting to be started; schedulerSubmit() appends to the end and
+# schedulerFinish() takes from the front.
+my @processQueue = ();
+
+# Set the limit on how many queued commands schedulerFinish() keeps
+# running at the same time.
+sub schedulerSetNumberOfProcesses {
+  my ($limit) = @_;
+  $numberOfProcesses = $limit;
+}
+
+# Append one or more commands to the queue run by schedulerFinish().
+# A trailing newline is removed from each command.
+#
+# The arguments are copied before chomp: @_ aliases the caller's values, so
+# chomping it in place would silently edit the caller's variables and would
+# die with "Modification of a read-only value" for a string literal.
+sub schedulerSubmit {
+  my @commands = @_;
+  chomp @commands;
+  push @processQueue, @commands;
+}
+
+# Fork a child that exec()s $process and return the child's pid to the
+# parent.  A fork that fails with EAGAIN is retried once a second; any other
+# fork failure dies.
+sub forkProcess {
+  my $process = shift @_;
+  my $pid;
+
+  # From Programming Perl, page 167.  The bare block is required: redo
+  # can only restart a loop or a bare block, not a labelled if statement.
+  FORK: {
+    if ($pid = fork) {
+      return($pid);        # Parent, returns child id
+    } elsif (defined $pid) {
+      # Child, runs the process.  exec() only returns when it failed; report
+      # why and leave immediately so the child never continues as a second
+      # copy of the parent.
+      exec($process)
+        or print STDERR "scheduler::forkProcess()-- can't exec '$process': $!\n";
+      POSIX::_exit(127);
+    } elsif ($!{EAGAIN}) {
+      # EAGAIN, supposedly a recoverable fork error.  Tested through %!
+      # rather than the message text, which varies by OS and locale.
+      sleep 1;
+      redo FORK;
+    } else {
+      die "Can't fork: $!\n";
+    }
+  }
+
+  die "scheduler::forkProcess()-- Shouldn't be here.\n";
+}
+
+# Run every queued command, keeping at most $numberOfProcesses of them
+# running at once, and return after all started children have been waited
+# for.  Each command is echoed to STDERR when it starts; a START and an END
+# banner (with the elapsed seconds) are printed around the whole run.
+sub schedulerFinish {
+  my @processesRunning;
+  my $remain = scalar(@processQueue);
+
+  # With a limit below one nothing would ever be started and the loop
+  # below would never end, so always allow at least one process.
+  my $slots = ($numberOfProcesses < 1) ? 1 : $numberOfProcesses;
+
+  my $t = localtime();
+  my $d = time();
+
+  print STDERR "----------------------------------------START CONCURRENT $t\n";
+
+  while ($remain > 0) {
+
+    # Reap any processes that have finished: keep only the pids for which
+    # a non-blocking waitpid() did not return a reaped child.
+    @processesRunning = grep { waitpid($_, &WNOHANG) <= 0 } @processesRunning;
+
+    # Run processes in any available slots
+    while ((scalar(@processesRunning) < $slots) &&
+           (scalar(@processQueue) > 0)) {
+      my $process = shift @processQueue;
+      print STDERR "$process\n";
+      push @processesRunning, forkProcess($process);
+    }
+
+    $remain = scalar(@processQueue);
+
+    # If still stuff out there, wait for something to finish and drop it
+    # from the running list.
+    if ($remain > 0) {
+      my $child = waitpid(-1, 0);
+      @processesRunning = grep { $_ != $child } @processesRunning;
+    }
+  }
+
+  # The queue is empty; wait for whatever is still running.
+  while (scalar(@processesRunning) > 0) {
+    waitpid(shift @processesRunning, 0);
+  }
+
+  $t = localtime();
+  print STDERR "----------------------------------------END CONCURRENT $t (", time() - $d, " seconds)\n";
+}
+
+1;
diff --git a/ESTmapper/terminate.C b/ESTmapper/terminate.C
new file mode 100644
index 0000000..9436452
--- /dev/null
+++ b/ESTmapper/terminate.C
@@ -0,0 +1,160 @@
+#include "util++.H"
+#include "bio++.H"
+#include "sim4.H"
+
+#include "seqCache.H"
+
+// Terminates an ESTmapper run.
+//
+// Splits a fasta file into multiple fasta files based on the first
+// occurrence of the iid. So, if the iid is in polishes and
+// list-of-iid, the sequence is written to fasta1. If the iid isn't
+// in the input (polishes or list-of-iid), put it into fasta3.
+// Any number of -p and -i can be specified.
+//
+// -P polishes fasta1
+// -I list-of-iid fasta2
+// -O fasta3
+// -i input.fasta
+//
+// -P polishes MUST be sorted by cDNA iid. Relatively easy to fix this,
+// just read all the polishes when building an iidReaderWriter, storing the
+// iid's we see into an array.
+
+//  Pairs one source of sequence iids -- a sim4 polishes file or a text
+//  file of iids -- with the output file that receives the sequences whose
+//  iid was seen in that source.
+class iidReaderWriter {
+public:
+  //  Opens 'infile' (through sim4polishReader when 'ispolishes' is true,
+  //  with fopen() otherwise) and opens 'otfile' for writing.  Exits with
+  //  status 1 when fopen() fails on either file.
+  iidReaderWriter(char *infile, char *otfile, bool ispolishes) {
+    isPolishes = ispolishes;
+    inPolishes = 0L;
+    inFile     = 0L;
+    otFile     = 0L;
+    iid        = 0;
+    iids       = 0L;
+    iidsLen    = 0;
+
+    if (isPolishes) {
+      inPolishes = new sim4polishReader(infile);
+    } else {
+      //  fopen() signals failure through its return value, not errno.
+      inFile = fopen(infile, "r");
+      if (inFile == 0L)
+        fprintf(stderr, "iidReaderWriter-- can't open '%s': %s\n", infile, strerror(errno)), exit(1);
+    }
+
+    otFile = fopen(otfile, "w");
+    if (otFile == 0L)
+      fprintf(stderr, "iidReaderWriter-- can't open '%s': %s\n", otfile, strerror(errno)), exit(1);
+  };
+
+  ~iidReaderWriter() {
+
+    delete [] iids;
+
+    if (isPolishes)
+      delete inPolishes;
+    else
+      fclose(inFile);
+
+    fclose(otFile);
+  };
+
+  //  True if 'targetiid' was seen by load().  An iid outside the range
+  //  given to load() -- or any iid before load() is called -- is not ours.
+  bool  thisIID(uint32 targetiid) {
+    return((targetiid < iidsLen) && (iids[targetiid]));
+  };
+
+  //  Writes S to the output file as a two line fasta record.
+  void  writeSequence(seqInCore *S) {
+    fprintf(otFile, ">%s\n%s\n", S->header(), S->sequence());
+  };
+
+  //  Reads the whole input and remembers every iid in it.  'maxiid' is the
+  //  number of flags allocated; iids at or above it are reported on stderr
+  //  and ignored instead of being written past the end of the array.
+  void  load(uint32 maxiid) {
+    delete [] iids;
+
+    iids    = new bool [maxiid];
+    iidsLen = maxiid;
+
+    for (uint32 i=0; i<maxiid; i++)
+      iids[i] = false;
+
+    if (isPolishes) {
+      sim4polish *p = inPolishes->nextAlignment();
+      while (p) {
+        remember(p->_estID);
+        delete p;
+        p = inPolishes->nextAlignment();
+      }
+    } else {
+      //  Loop on the conversion count: this keeps a final iid that has no
+      //  trailing newline and stops, rather than spinning, on a token that
+      //  is not a number.
+      while (fscanf(inFile, uint32FMT, &iid) == 1)
+        remember(iid);
+    }
+  };
+
+private:
+  //  Flags one iid as present, range checked against the allocated array.
+  void  remember(uint32 id) {
+    if (id < iidsLen) {
+      iids[id] = true;
+    } else {
+      fprintf(stderr, "iidReaderWriter-- ignoring out of range iid ");
+      fprintf(stderr, uint32FMT, id);
+      fprintf(stderr, "\n");
+    }
+  };
+
+  bool               isPolishes;   //  input is polishes, not an iid list
+  sim4polishReader  *inPolishes;   //  input, when isPolishes
+  FILE              *inFile;       //  input, when !isPolishes
+  FILE              *otFile;       //  output fasta
+  uint32             iid;          //  scratch for fscanf() in load()
+  bool              *iids;         //  iids[i] is true if iid i was seen
+  uint32             iidsLen;      //  number of entries in iids
+};
+
+
+//  Splits the sequences of the -i input among several output files.
+//
+//  Each -P (polishes) or -I (iid list) option names an input and an output;
+//  a sequence is written to the output of the first such option, in command
+//  line order, whose input mentions its iid.  Sequences claimed by none of
+//  them go to the -O file.  At least one -P/-I, plus -O and -i, are
+//  required.
+int
+main(int argc, char **argv) {
+  uint32              iidRWlen = 0;
+  uint32              iidRWmax = 128;
+  iidReaderWriter   **iidRW    = new iidReaderWriter* [iidRWmax];
+
+  FILE               *defaultOut = 0L;
+
+  seqCache           *F = 0L;
+  seqInCore          *S = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    bool  isP = (strcmp(argv[arg], "-P") == 0);
+    bool  isI = (strcmp(argv[arg], "-I") == 0);
+
+    //  An option missing its operands, or more -P/-I options than iidRW
+    //  can hold, falls through to the error branch instead of reading past
+    //  the end of argv or iidRW.
+    if ((isP || isI) && (arg + 2 < argc) && (iidRWlen < iidRWmax)) {
+      iidRW[iidRWlen++] = new iidReaderWriter(argv[arg+1], argv[arg+2], isP);
+      arg+=2;
+    } else if ((strcmp(argv[arg], "-O") == 0) && (arg + 1 < argc)) {
+      defaultOut = fopen(argv[++arg], "w");
+      if (defaultOut == 0L)
+        fprintf(stderr, "Can't open '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else if ((strcmp(argv[arg], "-i") == 0) && (arg + 1 < argc)) {
+      F = new seqCache(argv[++arg]);
+    } else {
+      fprintf(stderr, "ESTmapper utility function -- not for human use.\n");
+      exit(1);
+    }
+    arg++;
+  }
+  if ((iidRWlen == 0) || (defaultOut == 0L) || (F == 0L)) {
+    fprintf(stderr, "spec error.\n");
+    exit(1);
+  }
+
+  for (uint32 i=0; i<iidRWlen; i++)
+    iidRW[i]->load(F->getNumberOfSequences());
+
+  for (uint32 sid=0; ((S = F->getSequenceInCore(sid)) != 0L); sid++) {
+    bool   found = false;
+    uint32 iid   = S->getIID();
+
+    //  First reader/writer that knows this iid gets the sequence.
+    for (uint32 i=0; i<iidRWlen; i++) {
+      if (iidRW[i]->thisIID(iid)) {
+        found = true;
+        iidRW[i]->writeSequence(S);
+        break;
+      }
+    }
+
+    //  Unclaimed sequences go to the default output, in the same record
+    //  format iidReaderWriter::writeSequence() uses (with the leading '>').
+    //  NOTE(review): assumes header() does not itself include the '>',
+    //  as writeSequence() already does -- confirm against seqInCore.
+    if (found == false)
+      fprintf(defaultOut, ">%s\n%s\n", S->header(), S->sequence());
+
+    delete S;
+  }
+
+  return(0);
+}
+
diff --git a/Make.include b/Make.include
new file mode 100644
index 0000000..bfef44b
--- /dev/null
+++ b/Make.include
@@ -0,0 +1,30 @@
+# -*- makefile -*-
+
+# Resolved (realpath) locations of the library directories under the
+# current path $/, each with a trailing slash.
+LIBBIO/ :=$(realpath $/libbio/)/
+LIBSEQ/ :=$(realpath $/libseq/)/
+LIBKMER/ :=$(realpath $/libkmer/)/
+LIBMERYL/ :=$(realpath $/libmeryl/)/
+LIBSIM4/ :=$(realpath $/libsim4/)/
+LIBUTL/ :=$(realpath $/libutl/)/
+
+# Bring in the Make.include of every tool directory ...
+$(eval $(call Include,$/ESTmapper/))
+
+$(eval $(call Include,$/atac-driver/))
+$(eval $(call Include,$/seatac/))
+
+$(eval $(call Include,$/leaff/))
+$(eval $(call Include,$/meryl/))
+$(eval $(call Include,$/seagen/))
+$(eval $(call Include,$/sim4dbutils/))
+$(eval $(call Include,$/sim4db/))
+$(eval $(call Include,$/snapper/))
+$(eval $(call Include,$/tapper/))
+
+# ... and of every library directory named above.
+$(eval $(call Include,${LIBSIM4/}))
+$(eval $(call Include,${LIBKMER/}))
+$(eval $(call Include,${LIBMERYL/}))
+$(eval $(call Include,${LIBBIO/}))
+$(eval $(call Include,${LIBSEQ/}))
+$(eval $(call Include,${LIBUTL/}))
+
+# Extra file for this directory's .REAL-CLEAN list.
+$/.REAL-CLEAN := $/Make.compilers
diff --git a/Make.rules b/Make.rules
new file mode 100644
index 0000000..f904b13
--- /dev/null
+++ b/Make.rules
@@ -0,0 +1,316 @@
+# -*- makefile -*-
+
+# this might be useful for some future work if we want to make
+# actions more variable.
+# $(call .Make-rule,target,command) expands to a rule for 'target' with
+# no prerequisites and 'command' as its single recipe line.
+define .Make-rule
+$1:
+ $2
+
+endef
+
+############################################################
+# useful functions for the install methods mentioned below
+############################################################
+# $(call .FUN-install-copy,file1 file2 file3,dir1/ dir2/)
+# copies whichever file exists into each of ${INSTALL/}dir
+# mkdir-ing as necessary.
+# $1 = files, $2 = destination directories below ${INSTALL/}.
+# Does nothing when either list is empty; names in $1 that are not regular
+# files are skipped.  An existing destination file is removed before the
+# copy (cp -fp).
+define .FUN-install-copy
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+# because SOME PLATFORMS (like cygwin) use a special .exe extension
+# in executables, we have to do a little hack here. We assume that
+# Make.compilers has set a variable called .EXE
+# $1 = executables, $2 = destination directories below ${INSTALL/}.
+# For each name F, F${.EXE} is copied when ${.EXE} is non-empty and that
+# file exists, and F itself is copied when it exists; both copies can
+# happen for the same F.
+define .FUN-install-copy-exe
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ "${.EXE}" != "" -a -f $$$${F}${.EXE} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout}${.EXE} && cp -fp $$$${F}${.EXE} $$$${Fout}${.EXE} ; \
+ done ; \
+ fi ; \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+# we do another cygwin inspired hack to deal with that fact that
+# .so shlibs need to be turned into .dll files.
+# $1 = shared libraries, $2 = destination directories below ${INSTALL/}.
+# The installed name is the source basename with a trailing ".so" removed
+# and ${.SO} appended.
+define .FUN-install-copy-shlib
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F} .so`${.SO} ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+# use this one for executable scripts with #! substitution
+# echo ":Mangling $$$${F} to $$$${Fout}:" ;
+# $1 = scripts, $2 = destination directories below ${INSTALL/},
+# $3 = interpreter command for the #! line.
+# Each copy is made executable, then edited in place with ${PERL}: if its
+# first line starts with "#!", that line is replaced by "#!" followed by $3.
+define .FUN-install-copy-script
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; sheb='$$(strip $3)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ chmod ugo+x $$$${Fout} && \
+ ${PERL} -npi \
+ -e"if(0==\$$$$i++){s|^#!.*|#!$$$${sheb}|}" $$$${Fout}; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+
+
+############################################################
+# C and C++ stuff
+############################################################
+# Building depends goals for C/CXX things
+
+# C_SRCS and CXX_SRCS are collected together and turned into
+# associated *.d dependency files.
+
+# WISHLIST: does not propagate failure to the parent make
+# for some reason. It really should. I think the pipe to
+# sed masks the exit code.
+# One <source>.d file per entry of every directory's C_SRCS list.  The
+# recipe pipes the ${CCDEP} output through sed, which prefixes the source
+# file's directory to every output line that contains a ':'.
+__DEPGOALS__ += $$(patsubst %,%.d,$${${1:.=.C_SRCS}})
+ALL_C_DEPS :=$(foreach x,${//},$(patsubst %,%.d,${${x:.=.C_SRCS}}))
+${ALL_C_DEPS}:%.d:%
+ @ echo "making $@"
+ @ dir=`echo $< | sed -e's~[^/]*$$~~'`; \
+ ${CCDEP} ${CDEPFLAGS} ${CFLAGS} $< | \
+ sed -e"/:/s!^!$${dir}!" > $@
+
+# Same as above for the CXX_SRCS lists, using ${CXXDEP}.
+__DEPGOALS__ += $$(patsubst %,%.d,$${${1:.=.CXX_SRCS}})
+ALL_CXX_DEPS :=$(foreach x,${//},$(patsubst %,%.d,${${x:.=.CXX_SRCS}}))
+${ALL_CXX_DEPS}:%.d:%
+ @ echo "making $@"
+ @ dir=`echo $< | sed -e's~[^/]*$$~~'`; \
+ ${CXXDEP} ${CXXDEPFLAGS} ${CXXFLAGS} $< | \
+ sed -e"/:/s!^!$${dir}!" > $@
+
+
+###### generic pattern rules for subgoals
+# don't want .o's getting deleted as intermediates
+# Object files (suffix ${.O}) are built from .c with ${CC} and from
+# .cc, .cpp and .C with ${CXX}.
+# NOTE(review): ${-CC} and ${-CXX} are not defined in this file; presumably
+# recipe prefixes set in Make.compilers -- confirm.
+.PRECIOUS: %${.O}
+.SUFFIXES: ${.O}
+
+%${.O}: %.c
+ ${-CC} ${CC} ${CFLAGS} ${CFLAGS_COMPILE} -o $@ -c $<
+
+%${.O}: %.cc
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+%${.O}: %.cpp
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+%${.O}: %.C
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+
+# linking commands use the $+ to get duplicated prereqs for linking
+## EXE targets
+# Link rules for every executable named in a directory's C_EXES or
+# CXX_EXES list.  No prerequisites are given here; the recipe links
+# whatever prerequisites other makefiles attach to the target ($+).
+ALL_C_EXES :=$(strip $(foreach x,${//},${${x:.=.C_EXES}}))
+${ALL_C_EXES}:
+ ${-CC} ${CLD} ${CLDFLAGS} -o $@ $+ ${CLIBS}
+__SUBGOALS__+=$${${1:.=.C_EXES}}
+
+ALL_CXX_EXES :=$(strip $(foreach x,${//},${${x:.=.CXX_EXES}}))
+${ALL_CXX_EXES}:
+ ${-CXX} ${CXXLD} ${CXXLDFLAGS} -o $@ $+ ${CXXLIBS}
+__SUBGOALS__+=$${${1:.=.CXX_EXES}}
+
+# Per-directory install rule: copies that directory's C_EXES and CXX_EXES
+# into ${INSTALL/}bin/ and hooks the copy onto its install-copy target.
+define .RULE-install-copy-C-CXX-EXES
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-EXES}
+${1:.=.install-copy-C-CXX-EXES}:
+ $(call .FUN-install-copy-exe,$${${1:.=.C_EXES}} $${${1:.=.CXX_EXES}},bin/)
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-EXES,$x)))
+
+
+## LIB targets
+# Archive rules for every library named in a directory's C_LIBS or
+# CXX_LIBS list: the old archive is removed, then rebuilt with ${AR} from
+# all prerequisites ($^).
+ALL_C_LIBS :=$(strip $(foreach x,${//},${${x:.=.C_LIBS}}))
+${ALL_C_LIBS}:
+ ${-CC} ${RM} $@ && ${AR} ${ARFLAGS} $@ $^
+__SUBGOALS__+=$${${1:.=.C_LIBS}}
+
+ALL_CXX_LIBS :=$(strip $(foreach x,${//},${${x:.=.CXX_LIBS}}))
+${ALL_CXX_LIBS}:
+ ${-CXX} ${RM} $@ && ${AR} ${ARFLAGS} $@ $^
+__SUBGOALS__+=$${${1:.=.CXX_LIBS}}
+
+## Shared targets
+# AIX has really weird shared lib building flags. Unfortunately, I could
+# not think of a way out of this hack.
+# Shared library rules for the C_SHLIBS and CXX_SHLIBS lists.  The
+# variable named ${_OS_}_SHLIB_FLAGS is set to empty here and then passed
+# to the link command alongside ${SHLIB_FLAGS}.
+${_OS_}_SHLIB_FLAGS:=
+ALL_C_SHLIBS :=$(strip $(foreach x,${//},${${x:.=.C_SHLIBS}}))
+${ALL_C_SHLIBS}:
+ ${-CC} ${RM} $@ && ${CC} ${CLDFLAGS} ${SHLIB_FLAGS} ${${_OS_}_SHLIB_FLAGS} -o $@ $^ ${CLIBS}
+
+ALL_CXX_SHLIBS :=$(strip $(foreach x,${//},${${x:.=.CXX_SHLIBS}}))
+${ALL_CXX_SHLIBS}:
+ ${-CXX} ${RM} $@ && ${CXX} ${CXXLDFLAGS} ${SHLIB_FLAGS} ${${_OS_}_SHLIB_FLAGS} -o $@ $^ ${CXXLIBS}
+__SUBGOALS__+=$${${1:.=.C_SHLIBS}} $${${1:.=.CXX_SHLIBS}}
+
+# Per-directory install rule: copies that directory's C_LIBS and CXX_LIBS
+# into ${INSTALL/}lib/ followed by the directory's optional LIB/ setting.
+# The LIB/ lookup is a substitution reference and needs its colon
+# (${1:.=.LIB/}); without it the reference names a nonexistent variable
+# and always expands to nothing.
+define .RULE-install-copy-C-CXX-LIBS
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-LIBS}
+${1:.=.install-copy-C-CXX-LIBS}:
+ $(call .FUN-install-copy,$${${1:.=.C_LIBS}} $${${1:.=.CXX_LIBS}}, lib/$${${1:.=.LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-LIBS,$x)))
+
+# Per-directory install rule: copies that directory's C_SHLIBS and
+# CXX_SHLIBS into ${INSTALL/}lib/ followed by the directory's optional
+# LIB/ setting.  The LIB/ lookup is a substitution reference and needs its
+# colon (${1:.=.LIB/}); without it the reference always expands to nothing.
+define .RULE-install-copy-C-CXX-SHLIBS
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-SHLIBS}
+${1:.=.install-copy-C-CXX-SHLIBS}:
+ $(call .FUN-install-copy-shlib, $${${1:.=.C_SHLIBS}} $${${1:.=.CXX_SHLIBS}}, lib/$${${1:.=.LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-SHLIBS,$x)))
+
+
+
+# Per-directory install rule: copies that directory's C_INCS and CXX_INCS
+# into ${INSTALL/}include/ followed by the directory's INCLUDE/ setting.
+define .RULE-install-copy-C-CXX-INCS
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-INCS}
+${1:.=.install-copy-C-CXX-INCS}:
+ $(call .FUN-install-copy,$${${1:.=.C_INCS}} $${${1:.=.CXX_INCS}}, include/$${${1:.=.INCLUDE/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-INCS,$x)))
+
+
+
+############################################################
+# latex and some ps/pdf stuff
+############################################################
+
+# I'm not sure if I should mark these as precious or not
+#.PRECIOUS: %.pdf %.ps %.dvi %.aux %.bbl
+
+# Each recipe changes into the source file's directory first.  ${LATEX}
+# and ${PDFLATEX} are run twice in a row on the same file.
+%.dvi: %.tex
+ ${-LATEX} cd `dirname $<` && ${LATEX} `basename $<` && ${LATEX} `basename $<`
+
+%.aux: %.tex
+ ${-LATEX} cd `dirname $<` && ${LATEX} `basename $<` && ${LATEX} `basename $<`
+
+%.bbl: %.aux
+ ${-LATEX} cd `dirname $<` && ${BIBTEX} `basename ${<:.aux=}`
+
+
+# Targets collected from every directory's TEX_PS and TEX_PDF lists.
+ALL_TEX_PS :=$(strip $(foreach x,${//},${${x:.=.TEX_PS}}))
+ALL_TEX_PDF :=$(strip $(foreach x,${//},${${x:.=.TEX_PDF}}))
+
+${ALL_TEX_PS}: %.ps: %.dvi
+ ${-LATEX} cd `dirname $<` && ${DVIPS} -o `basename $@` `basename $<`
+
+${ALL_TEX_PDF}: %.pdf: %.tex %.aux
+ ${-LATEX} cd `dirname $<` && ${PDFLATEX} `basename $<` && ${PDFLATEX} `basename $<`
+__SUBGOALS__+=$${${1:.=.TEX_PS}} $${${1:.=.TEX_PDF}}
+
+ # install rules
+# Per-directory install rule: copies that directory's TEX_PS and TEX_PDF
+# files into ${INSTALL/}doc/ followed by the directory's DOC/ setting.
+define .RULE-install-copy-TEX_PSPDF
+${1:.=.install-copy}: ${1:.=.install-copy-TEX_PSPDF}
+${1:.=.install-copy-TEX_PSPDF}:
+# TEX_PS go to doc/
+ $(call .FUN-install-copy,$${${1:.=.TEX_PS}},doc/$${${1:.=.DOC/}})
+# TEX_PDF go to doc/
+ $(call .FUN-install-copy,$${${1:.=.TEX_PDF}},doc/$${${1:.=.DOC/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-TEX_PSPDF,$x)))
+
+
+
+############################################################
+# Python
+############################################################
+
+# python exes and libs are not subgoals.
+# Otherwise, they'd be deleted when we cleaned.
+# if we ever introduce some notion of 'file prep', beyond
+# adding the she-bang line, then we might want to do something
+# different here.
+#__SUBGOALS__+=$${${1:.=.PY_EXES}} $${${1:.=.PY_LIBS}}
+
+# Per-directory install rule: PY_EXES go to bin/ with their #! line set to
+# "${PYTHON} ${PYTHON_FLAGS}"; PY_LIBS are copied unchanged to lib/
+# followed by the directory's PY_LIB/ setting.
+define .RULE-install-copy-PYTHON
+${1:.=.install-copy}: ${1:.=.install-copy-PYTHON}
+${1:.=.install-copy-PYTHON}:
+ $(call .FUN-install-copy-script, $${${1:.=.PY_EXES}}, bin/, ${PYTHON} ${PYTHON_FLAGS})
+ $(call .FUN-install-copy, $${${1:.=.PY_LIBS}}, lib/$${${1:.=.PY_LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-PYTHON,$x)))
+
+
+############################################################
+# Perl
+############################################################
+# Same sort of spiel as python
+#__SUBGOALS__+=$${${1:.=.PERL_EXES}} $${${1:.=.PERL_LIBS}}
+
+# Per-directory install rule: PERL_EXES go to bin/ with their #! line set
+# to "${PERL} ${PERL_FLAGS}"; PERL_LIBS are copied unchanged to lib/
+# followed by the directory's PERL_LIB/ setting.
+define .RULE-install-copy-PERL
+${1:.=.install-copy}: ${1:.=.install-copy-PERL}
+${1:.=.install-copy-PERL}:
+ $(call .FUN-install-copy-script, $${${1:.=.PERL_EXES}}, bin/, ${PERL} ${PERL_FLAGS})
+ $(call .FUN-install-copy, $${${1:.=.PERL_LIBS}}, lib/$${${1:.=.PERL_LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-PERL,$x)))
+
+############################################################
+# sh
+############################################################
+# Same sort of spiel as python
+#__SUBGOALS__+=$${${1:.=.SH_EXES}} $${${1:.=.SH_LIBS}}
+
+# Per-directory install rule: SH_EXES go to bin/ with their #! line set to
+# "${SH} ${SH_FLAGS}"; SH_LIBS are copied unchanged to lib/ followed by
+# the directory's SH_LIB/ setting.
+define .RULE-install-copy-SH
+${1:.=.install-copy}: ${1:.=.install-copy-SH}
+${1:.=.install-copy-SH}:
+ $(call .FUN-install-copy-script, $${${1:.=.SH_EXES}}, bin/, ${SH} ${SH_FLAGS})
+ $(call .FUN-install-copy, $${${1:.=.SH_LIBS}}, lib/$${${1:.=.SH_LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-SH,$x)))
+
+
+############################################################
+# share -- a random catchall for scripts and whatnot that
+# we should have real rules for but we don't right now
+############################################################
+#__SUBGOALS__+=$${${1:.=.SHARES}}
+
+# Per-directory install rule: copies that directory's SHARES files into
+# ${INSTALL/}share/ followed by the directory's SHARE/ setting.
+define .RULE-install-copy-SHARE
+${1:.=.install-copy}: ${1:.=.install-copy-SHARE}
+${1:.=.install-copy-SHARE}:
+ $(call .FUN-install-copy,$${${1:.=.SHARES}}, share/$${${1:.=.SHARE/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-SHARE,$x)))
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..822b810
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,194 @@
+# -*- makefile -*-
+
+### NOTE: make-3.80 has a bug in eval which causes it not to return
+### an empty target, but instead it sometimes returns very weird
+### stuff that makes make segv, I think.
+### You need to get it patched at:
+### http://savannah.gnu.org/bugs/?func=detailbug&bug_id=1517&group_id=71
+
+# 'default' is the first target in the file, so a plain make builds 'all'.
+default: all
+
+# Directory of the first makefile read (this one), with a trailing slash.
+MAKEFILE/ :=$(dir $(firstword $(MAKEFILE_LIST)))
+
+# If Make.compilers is not found next to this Makefile, run configure.sh
+# before including it.  The script's output is captured in a variable that
+# is not used again in this file.
+ifneq ($(shell ls -d ${MAKEFILE/}/Make.compilers 2> /dev/null), ${MAKEFILE/}/Make.compilers)
+ MAKECOMPILERSNOTHING := $(shell ${MAKEFILE/}configure.sh)
+endif
+
+include ${MAKEFILE/}Make.compilers
+
+
+##### non-recursive make magic
+# all directories which have been included
+// :=
+# current path (empty)
+/ :=
+
+# recursive directories to be filtered out of //
+# and handled differently
+# NOTE(review): MakeRecursive assigns with ':=' rather than '+=', so each
+# expansion replaces //-RECURSIVE instead of adding to it -- confirm that
+# only one recursive directory is intended.
+//-RECURSIVE :=
+define MakeRecursive
+//-RECURSIVE :=$$/.
+endef
+
+## Include -hack
+# Include is wrapped in something which will push and pop /
+# properly while adding newly discovered directories to //
+# and keeping track of who is who's children (.SUBS).
+#
+# Each directory so included has its Make.include file included.
+# Those Make.include files can use $/ prepended to local names
+# to prevent name pollution, and define their own subtargets.
+#
+# $(call Include,dir1/ dir2/ ...) expands Include_File once per word.
+define Include
+ $(foreach x,$(strip ${1}),$(call Include_File,$x))
+endef
+
+# $(call Include_File,dir/) -- intended to be passed to $(eval).
+# Nothing happens when "dir/." is listed in WITHOUT_ or when dir/ has no
+# Make.include.  Otherwise "dir/." is appended to the including directory's
+# .SUBS list and to //, dir's own .SUBS list is emptied, and dir/Make.include
+# is included with / set to dir/.  Afterwards / is set back to the value
+# it had when Include_File was expanded.
+define Include_File
+ ifeq ($(filter ${1}.,${WITHOUT_}),)
+ ifeq ($(wildcard ${1}Make.include),${1}Make.include)
+ $/.SUBS +=${1}.
+ // +=${1}.
+ ${1}.SUBS :=
+ / :=${1}
+ include ${1}Make.include
+ / :=$/
+ endif
+ endif
+
+endef
+
+##### System specific includes
+
+## WITHOUT
+# If the user specifies a WITHOUT, then those paths are not
+# followed.
+ifndef WITHOUT
+ WITHOUT:=
+endif
+# Each WITHOUT word with a '.' appended, which is the form Include_File
+# compares directory names against.
+WITHOUT_:=$(patsubst %,%.,$(strip ${WITHOUT}))
+
+## First Make.include inclusion
+# invoke the toplevel include file.
+# We use 'Include_File' instead of 'Include' since $/ is empty
+$(eval $(call Include_File,$/))
+
+#### Targets which have been declared RECURSIVE are removed
+# from // and processed separately.
+// :=$(filter-out ${//-RECURSIVE},${//})
+
+# //-RECURSIVE now holds the paths which are being done legacy style
+# and // holds the paths which are going to be part of the system.
+
+### Building subgoals in Make.rules
+# subgoals and depends are done with the deferred '=' not the ':='.
+# This is because we want dynamic scoping.
+__SUBGOALS__=
+__DEPGOALS__=
+
+# The leading '-' means a missing Make.rules is not an error.
+-include ${MAKEFILE/}Make.rules
+
+# now we bring in the depends files as defined by the Includes
+# and the patterns in Make.rules
+$(eval DEPENDS:=$(foreach x,${//},$(call __DEPGOALS__,$x)))
+ifneq ($(strip ${DEPENDS}),)
+ # this conditional gets us a way out if things go way wrong
+ # (no dependency files are read when any goal ends in "-clean")
+ ifeq ($(filter %-clean,${MAKECMDGOALS}),)
+ -include ${DEPENDS}
+ endif
+endif
+
+### the standard make targets, applied to all subdirectory targets
+# We define the basic form of the all, clean, ... rules on a
+# per-path basis (the $/.all, $/.clean, ... targets). This allows
+# selective targeting.
+## rules for each subtarget
+# current subtargets are :
+# all, ls, clean, real-clean, depends-clean, install
+# with all, clean, real-clean, install being required targets
+# of recursive makes.
+
+# 'clean' cleans the top directory, which in turn depends on the clean
+# target of each of its .SUBS.  Each directory removes its CLEAN list and
+# its subgoals and, when C_TMP_COMPILE or CXX_TMP_COMPILE is set, those
+# names inside the directory as well.
+clean: ${//-RECURSIVE:.=.clean} $/.clean
+define .RULE-clean
+${1:.=.clean}: $${${1:.=.SUBS}:.=.clean}
+ ${RM} $${${1:.=.CLEAN}} ${__SUBGOALS__}
+ifneq ($(strip ${C_TMP_COMPILE} ${CXX_TMP_COMPILE}),)
+ (cd $1 && ${RM} -r ${C_TMP_COMPILE} ${CXX_TMP_COMPILE})
+endif
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-clean,$x)))
+
+
+# 'depends-clean' removes each directory's Make.depends file and its
+# generated dependency (.d) files, recursing through .SUBS.
+depends-clean: $/.depends-clean
+${//-RECURSIVE:.=.depends-clean}:
+define .RULE-depends-clean
+${1:.=.depends-clean}: $${${1:.=.SUBS}:.=.depends-clean}
+ ${RM} ${1:.=Make.depends} ${__DEPGOALS__}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-depends-clean,$x)))
+
+# 'real-clean' removes everything clean and depends-clean remove, plus
+# each directory's REAL-CLEAN list.
+real-clean: ${//-RECURSIVE:.=.real-clean} $/.real-clean
+define .RULE-real-clean
+${1:.=.real-clean}: $${${1:.=.SUBS}:.=.real-clean}
+ ${RM} $${${1:.=.CLEAN}} ${__SUBGOALS__} ${1:.=Make.depends} ${__DEPGOALS__} $${${1:.=.REAL-CLEAN}}
+#ifneq ($(strip ${INSTALL/}),)
+# ${RM} -r ${INSTALL/}
+#endif
+ifneq ($(strip ${C_TMP_COMPILE} ${CXX_TMP_COMPILE}),)
+ (cd $1 && ${RM} -r ${C_TMP_COMPILE} ${CXX_TMP_COMPILE})
+endif
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-real-clean,$x)))
+
+# 'all' for a directory depends on the 'all' target of each of its .SUBS
+# and on the directory's own subgoals; the rule has no recipe.
+all: ${//-RECURSIVE:.=.all} $/.all
+define .RULE-all
+${1:.=.all}: $${${1:.=.SUBS}:.=.all} ${__SUBGOALS__}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-all,$x)))
+
+## INSTALL:
+# Define targets which give a basic recursive traversal to hang
+# things from for anything install related that I haven't thought of
+
+# this does the copy part of the install
+# The rule defined here only chains a directory's install-copy target to
+# those of its .SUBS; the actual copy commands are attached to the same
+# targets by the .RULE-install-copy-* definitions in Make.rules.
+${//-RECURSIVE:.=.install-copy}:
+install-copy: ${//-RECURSIVE:.=.install-copy} $/.install-copy
+define .RULE-install-copy
+${1:.=.install-copy}: $${${1:.=.SUBS}:.=.install-copy}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy,$x)))
+
+# define the actual install target as being a combination of the
+# all target plus the pre-/install-copy targets
+
+# A directory's 'install' is its 'all' followed by its 'install-copy'.
+install: ${//-RECURSIVE:.=.install} $/.install
+define .RULE-install
+${1:.=.install}: ${1:.=.all} ${1:.=.install-copy}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install,$x)))
+
+
+
+# a sort of debugging thing. Let's me check on which targets
+# actually didn't get made, or got partially made
+# 'ls' runs "ls ${LSOPTS}" on each directory's subgoals, skipping
+# directories that have none.  The recipe always exits 0, so subgoals
+# that have not been built do not fail the make.
+LSOPTS:=-l
+ls: $/.ls
+${//-RECURSIVE:.=.ls}:
+define .RULE-ls
+${1:.=.ls}: $${${1:.=.SUBS}:.=.ls}
+ @ files='$$(strip ${__SUBGOALS__})'; \
+ if [ -n "$$$${files}" ] ; then \
+ ls ${LSOPTS} $$$${files} ; \
+ fi ; exit 0;
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-ls,$x)))
+
+
+
+
diff --git a/Makefile.wiki b/Makefile.wiki
new file mode 100644
index 0000000..c8bab56
--- /dev/null
+++ b/Makefile.wiki
@@ -0,0 +1,1400 @@
+
+== Overview of the proposed new IR build system ==
+
+The proposed build system for IR projects is defined by the set of
+files under <tt>cds/IR/build</tt>. These files consist of a <tt>Makefile</tt>
+and several ancillary files which provide platform specific and file
+type specific definitions and rules.
+
+The build itself is always directed through the one <tt>Makefile</tt> in
+the build directory. What I describe hereafter as '''the build''' is
+really <tt>gmake</tt> invoked with this <tt>Makefile</tt>, either from
+within the <tt>cds/IR/build</tt> directory or referenced explicitly
+with a <tt>-f</tt> option to <tt>gmake</tt>.
+
+When the build is started, the ancillary files are examined to
+establish definitions, rules, and to provide a naming utility. The
+build then looks for a file named <tt>Make.include</tt> in the directory
+of invocation. This <tt>Make.include</tt> file should contain build
+information for files pertinent to its directory and possibly the
+inclusions of other <tt>Make.include</tt> files in lower directories.
+It is only after the tree (and it better be a tree or something very
+bad will happen) of <tt>Make.include</tt>'s is read that any building
+starts, so that full dependency information is available before any
+action is taken.
+
+Automatic include files such as the <tt>*.d</tt> files which typically
+hold C and C++ dependencies are included, possibly after being rebuilt
+(if a build rule exists for them and if they do not exist or are out of
+date). Thus, the first action of the build is typically the creation of
+automatically generated dependency information, with subsequent
+modifications of sources causing only a minimal rebuild of these files.
+
+The build properly proceeds with the creation of all targets which
+have been defined. Targets are declared by placing them on the lists of
+variables which are defined to be targets by the build rules. It
+is possible to subgoal the building at a file or directory level of
+granularity.
+
+== Invoking the build system ==
+
+There are two different directories which are of importance in the
+build system. The first is the fixed build directory,
+<tt>cds/IR/build/</tt>, which holds the <tt>Makefile</tt>, ancillary
+files, and the installed files. The second directory of importance is
+the directory of invocation, the place where <tt>gmake</tt> is executed.
+These two directories coincide when one executes <tt>gmake</tt> from the
+build directory. Keep these things distinct because all inclusions
+start from and all actions take place in the directory of invocation,
+not the build directory.
+
+The first <tt>Make.include</tt> is read in the directory of invocation.
+If the convention has been followed of having <tt>Make.include</tt>
+files in every node of the source tree which refer only to
+subdirectories, then only those <tt>Make.include</tt> files at the
+directory of invocation and lower will be read into the build system.
+This means that automatic rebuilding of dependencies external to the
+directory of invocation will not be handled fully (files will not be
+updated, but they will be found if they exist). Thus, if one wishes
+to focus one's attention to a single subdirectory to build in, one
+should make the subdirectory the directory of invocation.
+
+<pre>
+$ cd that/subdir/
+$ gmake -f ${cvs}/cds/IR/build/Makefile all
+</pre>
+
+This will result in much less work that the build will have to do in
+order to figure out what actions to take, because it will examine fewer
+constraints.
+
+On the other hand, if one invokes from a higher directory, then
+any sibling dependencies of the target will be properly examined and
+necessary actions at that level will be taken.
+
+<pre>
+$ cd ${cvs}/cds/IR/build
+$ gmake that/subdir/.all
+</pre>
+
+This will result in all the subgoals in a subdirectory (and lower)
+being built, with any necessary updates of other directories being
+handled automatically. All rules which can be found are examined
+although only those actions necessary for the given subtarget(s) are
+taken.
+
+By default, the build system builds all things under the
+<tt>${cvs}/cds/IR</tt> tree which are currently checked out and which
+support the build system by providing <tt>Make.include</tt> files.
+
+<pre>
+$ cd ${cvs}/cds/IR/build
+$ gmake
+</pre>
+
+The targets currently supported by the build system are
+
+{| class="wikitable" border="1"
+|-
+| <tt>all</tt> || build all subgoals
+|-
+| <tt>clean</tt> || remove subgoals
+|-
+| <tt>depends-clean</tt> || remove any automatically generated dependency (<tt>*.d</tt>) files
+|-
+| <tt>real-clean</tt> || do both <tt>depends-clean</tt> and <tt>clean</tt>
+|-
+| <tt>install</tt> || do all and copy subgoals and other files to the install subdirectory of the build directory
+|-
+| <tt>ls</tt> || do '<tt>ls -l</tt>' of subgoals
+|-
+|}
+
+These targets are also supported in a subtarget specific form, such as
+<tt>that/subdir/.install</tt>.
+
+The compile options can be modified for debugging or profiling, to
+compile with GNU compilers, or to modify the installation directory.
+
+{| class="wikitable" border="1"
+|-
+| <tt>WITH_OPT=debug</tt> || compile with no optimization and maximum debugging
+|-
+| <tt>WITH_OPT=profile</tt> || compile for profiling with optimization and minimal debugging
+|-
+| <tt>WITH_GNU=1</tt> || compile with gnu compilers
+|-
+| <tt>WITH_THREADS=1</tt> || compile with threading enabled
+|-
+| <tt>INSTALL_TAG=<foo></tt> || append <tt>-<foo></tt> to the name of the install directory
+|-
+| <tt>MAKE_COMPILERS=<foo></tt> || use the file <tt><foo></tt> in place of <tt>Make.compilers</tt> (this option is of dubious value).
+|-
+| <tt>WITHOUT='dir1/ dir2/'</tt>|| cancel the inclusion of any <tt>Make.include</tt> files in the given directories (another dubious option).
+|-
+|}
+
+These options go on the <tt>gmake</tt> command line.
+
+<pre>
+$ gmake WITH_OPT=debug WITH_GNU=1 INSTALL_TAG=release all
+</pre>
+
+== Anatomy of a Make.include ==
+
+<tt>Make.include</tt> files define variables which hold names of files,
+and they are not
+necessarily sitting in the same place where the build is invoked. With
+these two considerations, there is defined, for each
+<tt>Make.include</tt> file a variable named by the single character
+'<tt>/</tt>' which holds the relative path of the current directory.
+Any file in the current directory can be referenced by prepending
+<tt>$/</tt> to it. The contents of <tt>/</tt> ought to be unique in the
+namespace of the build, so any variable defined as <tt>$/.MYVAR</tt>
+cannot conflict with any other variable in the build namespace (re-using
+the syntax of hidden files for variables may be a bad thing, but it
+has not caused problems yet).
+
+So, a simple <tt>Make.include</tt> might look like:
+
+<pre>
+$/.C_SRCS := $/hello.c
+$/.C_EXES := $/hello
+$/hello: $/hello.o
+$/.CLEAN := $/*.o
+</pre>
+
+Note the use of the '<tt>:=</tt>' assignment instead of the '<tt>=</tt>'
+assignment. The reason for this is that <tt>gmake</tt> has two '''flavors'''
+of variable, the traditional one, which is lazily evaluated, defined by
+'<tt>=</tt>', and one which is immediately evaluated, defined by '<tt>:=</tt>'.
+If the RHS of the <tt>$/.C_SRCS</tt> assignment were to be lazily
+evaluated, then the <tt>$/</tt> component of the name would expand to
+whatever value <tt>/</tt> holds at the end of the whole traversal
+(which is an empty string if everything goes right). This would
+produce errors. I recommend the use of '<tt>:=</tt>' in just about
+every possible case, unless you are trying to be tricky on purpose.
+
+There are a number of special variables, used on a per-<tt>/</tt>
+basis by the <tt>Make.rules</tt> ancillary file. These variables can
+be assigned to in the <tt>Make.include</tt> files to specify the various
+types of files and the actions required to build them. The current
+list of variables is as follows:
+
+{| class="wikitable" border="1"
+|-
+| <tt>$/.C_SRCS</tt> || C sources which need to have their dependencies analyzed
+|-
+| <tt>$/.C_INCS</tt> || C header files
+|-
+| <tt>$/.C_LIBS</tt> || C library subgoals
+|-
+| <tt>$/.C_SHLIBS</tt> || C shared library subgoals
+|-
+| <tt>$/.C_EXES</tt> || C program subgoals
+|-
+| <tt>$/.CXX_<x></tt> || same as <tt>$/.C_<x></tt> but for C++
+|-
+| <tt>$/.TEX_PS</tt> || Postscript subgoals to be built from LaTeX files
+|-
+| <tt>$/.TEX_PDF</tt> || PDF subgoals to be built from LaTeX files
+|-
+| <tt>$/.SHARES</tt> || a catchall category for things which are just to be copied
+|-
+| <tt>$/.SH_LIBS</tt> || sh script libraries
+|-
+| <tt>$/.SH_EXES</tt> || sh script executables
+|-
+| <tt>$/.PERL_LIBS</tt> || Perl script libraries
+|-
+| <tt>$/.PERL_EXES</tt> || Perl script executables
+|-
+| <tt>$/.PY_LIBS</tt> || Python script libraries
+|-
+| <tt>$/.PY_EXES</tt> || Python script executables
+|-
+| <tt>$/.LIB/</tt> || subdirectory of <tt>lib/</tt> where <tt>$/.C_LIBS</tt> and <tt>$/.CXX_LIBS</tt> are installed
+|-
+| <tt>$/.INCLUDE/</tt> || subdirectory of <tt>include/</tt> where <tt>$/.C_INCS</tt> and <tt>$/.CXX_INCS</tt> are installed
+|-
+| <tt>$/.DOC/</tt> || subdirectory of <tt>doc/</tt> where <tt>$/.TEX_PS</tt> and <tt>$/.TEX_PDF</tt> are installed
+|-
+| <tt>$/.SHARE/</tt> || subdirectory of <tt>share/</tt> where <tt>$/.SHARES</tt> are installed
+|-
+| <tt>$/.SH_LIB/</tt> || subdirectory of <tt>scripts/</tt> where <tt>$/.SH_LIBS</tt> are installed
+|-
+| <tt>$/.PERL_LIB/</tt> || subdirectory of <tt>scripts/</tt> where <tt>$/.PERL_LIBS</tt> are installed
+|-
+| <tt>$/.PY_LIB/</tt> || subdirectory of <tt>scripts/</tt> where <tt>$/.PY_LIBS</tt> are installed
+|-
+| <tt>$/.CLEAN</tt> || files and patterns to be removed during a <tt>clean</tt>
+|-
+| <tt>$/.REAL-CLEAN</tt> || files and patterns to be removed during a <tt>real-clean</tt>
+|-
+|}
+
+Note: while the current build system removes all subgoals, intermediate
+files are not removed automatically. If C/C++ programs are being built
+then patterns like <tt>$/*.o</tt> should be put in the <tt>$/.CLEAN</tt> variable
+or they will not get removed. It is debatable whether this should be left
+up to each <tt>Make.include</tt> file to take care of on its own.
+
+There are a couple of routine tasks which are done in a fashion a
+little unusual to those accustomed to more traditional uses of make.
+One of these tasks is the specification of additional
+flags used when building the C/C++ programs. For C programs, this is
+done by adding to the <tt>CFLAGS</tt> variable in a target specific
+manner.
+
+<pre>
+$/myprogram.o $/myprogram.c.d: CFLAGS += -DTEST -I/usr/local/lib
+</pre>
+
+If you wish to have this take effect for all files defined in the
+current <tt>Make.include</tt> you can use a pattern rule.
+
+<pre>
+$/%.o $/%.d: CFLAGS +=-DTEST -I/usr/local/lib
+</pre>
+
+One '''gotcha''' is in the use of locally defined variables
+(like anything involving <tt>$/</tt>). It seems
+that any variable expansion on the target specific '+=' is
+delayed until after all rules have been traversed, at which point
+<tt>$/</tt> is very likely to have the wrong value. This does not
+happen with a target specific '<tt>:=</tt>' so it may be a bug in what is
+a fairly new <tt>gmake</tt> feature. The work-around invokes another
+fairly new <tt>gmake</tt> feature.
+
+<pre>
+$(eval $/%.o $/%.d: CFLAGS +=-I$/include)
+</pre>
+
+The second routine task is that of specifying external libraries to
+link to when building C/C++ executables. If the libraries are
+external to the whole build, then one would use the usual <tt>-L-l</tt>
+flags in a target specific variable modification.
+
+<pre>
+$/myprogram: CLDFLAGS+=-L/usr/local/foodir
+$/myprogram: CLIBS +=-lfoo
+</pre>
+
+One must again wrap any variable expressions which are likely to
+be overwritten by other includes
+with <tt>$(eval )</tt> to force immediate variable expansion. If the
+library is being built by the build system, using the <tt>-L-l</tt> flags would
+create a '''dependency leak''', as the build system would not know
+that the library must be updated before the link.
+To avoid creating this leak, one should declare the dependency explicitly.
+
+<pre>
+$/myprogram: ${THELIBDIR/}libfoo.a
+</pre>
+
+Here no variable expansion needs to be forced, since dependency lines
+expand variables immediately. By making it a dependent, <tt>libfoo.a</tt>
+will appear in the series of arguments to the linker for <tt>$/myprogram</tt>
+and the leak is avoided.
+
+=== Two examples ===
+
+Here is a walkthrough of the <tt>Make.include</tt> for
+<tt>AtacPipeline</tt>, which builds a variety of executables and
+libraries.
+
+<pre>
+$/.CXX_EXES :=$/heavyChains
+$/.CXX_SHLIBS :=$/localAlignerInterfacemodule.so $/halignmodule.so $/hellomodule.so
+</pre>
+
+One program and three shared libraries are subgoals to be created.
+
+<pre>
+$/.CXX_SRCS:=$/GF_ALN_dpaligner.cc $/GF_ALN_local.cc \
+ $/GF_ALN_overlap.cc $/GF_ALN_qvaligner.cc \
+ $/GF_ALN_loverlapper.cc $/GF_ALN_pieceOlap.cc \
+ $/halign.cc $/halignDriver.cc $/halignmodule.cc \
+ $/heavyChains.cc \
+ $/localAlignerInterface.cc $/localAlignerInterfacemodule.cc \
+ $/hellomodule.cc $/byemodule.cc $/holignmodule.cc
+</pre>
+
+The source files are declared.
+
+<pre>
+$/.CLEAN := $/*.o $/*.pyc
+</pre>
+
+Since this directory will build both C++ programs and python programs,
+intermediates for both must be <tt>clean</tt>-ed.
+
+<pre>
+$/.PY_EXES :=$/AtacDriver.py
+$/.PY_LIBS :=$(filter-out ${$/.PY_EXES},$(wildcard $/*.py))
+</pre>
+
+A python executable is declared and the python libraries are any file in
+this directory ending in '.py' which is not on the list of executables.
+
+<pre>
+$/.PY_LIB/ :=AtacPipeline/
+</pre>
+
+The python libraries are to be installed under <tt>scripts/AtacPipeline</tt>.
+
+<pre>
+$/heavyChains : $/heavyChains.o
+
+$/localAlignerInterfacemodule.so : \
+ $/localAlignerInterfacemodule.o $/localAlignerInterface.o \
+ $/GF_ALN_overlap.o $/GF_ALN_local.o \
+ $/GF_ALN_loverlapper.o $/GF_ALN_pieceOlap.o \
+ $/GF_ALN_dpaligner.o $/GF_ALN_qvaligner.o
+
+$/hellomodule.so: $/hellomodule.o
+
+$/halignmodule.so: $/halignmodule.o $/halign.o
+</pre>
+
+The linking dependencies for each of the targets are specified.
+
+<pre>
+$(eval $/%.d $/%.o: CXXFLAGS+=${PYINC})
+</pre>
+
+The shared libraries being built are actually python extensions, so they
+will be including python header files. The <tt>${PYINC}</tt> path is
+specified in the <tt>Make.compilers</tt> file and is not expected to
+change (so the <tt>$(eval )</tt> wrapper is a bit paranoid, but harmless).
+
+There are some extra flags which are needed for building python
+extensions at the end of this file when on AIX, but they are very
+exceptional, and an explanation of them here is of little value.
+
+<pre>
+$(eval $/localAlignerInterfacemodule.so: AIX_SHLIB_FLAGS+=-einitlocalAlignerInterface -Wl,-bI:$/AIX_python-module-exports)
+$(eval $/halignmodule.so: AIX_SHLIB_FLAGS+=-einithalign -Wl,-bI:$/AIX_python-module-exports)
+$(eval $/hellomodule.so: AIX_SHLIB_FLAGS+=-einithello -Wl,-bI:$/AIX_python-module-exports)
+</pre>
+
+Our next example is the <tt>Make.include</tt> for
+<tt>MatchExtender</tt> which builds a series of C++ programs which
+depend on external libraries.
+
+<pre>
+FRAMEWORK/ :=$(call MakePath,$/../Framework/)
+RASCAL/ :=$(call MakePath,$/../../../RASCAL/src/)
+</pre>
+
+External paths are defined by the <tt>MakePath</tt> function. This
+function is explained later.
+
+<pre>
+$/.CXX_EXES := $/testFastaReader $/MatchExtender $/MismatchCounter
+</pre>
+
+Three C++ programs are to be built.
+
+<pre>
+ind_src := $/IndexedFastaReader.cc
+test_src := $/testFastaReader.cc
+mch_src := $/MEMatch.cc
+me_src := $/MatchExtenderAtac.cc $/MatchExtender.cc
+mc_src := $/MismatchCounterAtac.cc $/MismatchCounter.cc
+
+$/.CXX_SRCS := ${ind_src} ${test_src} ${mch_src} ${me_src}
+</pre>
+
+The sources are partitioned into four groups.
+
+<pre>
+$/.CLEAN :=$/*.o $/*~ $/core
+</pre>
+
+On a <tt>clean</tt> we remove object files, emacs backups, and any cores.
+
+<pre>
+$/testFastaReader: ${ind_src:.cc=.o} ${test_src:.cc=.o}
+$/MatchExtender: ${ind_src:.cc=.o} ${mch_src:.cc=.o} ${me_src:.cc=.o}
+$/MismatchCounter: ${ind_src:.cc=.o} ${mch_src:.cc=.o} ${mc_src:.cc=.o}
+</pre>
+
+Program dependencies are defined as combinations of the various
+groups defined above, with their '.cc' extensions turned to '.o'.
+
+<pre>
+${$/.CXX_EXES}: \
+ ${RASCAL/}seq/libRASCAL_seq.a ${RASCAL/}base/libRASCAL_base.a \
+ ${FRAMEWORK/}libATAC.a
+
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${RASCAL/}. -I${FRAMEWORK/}.)
+</pre>
+
+All programs must link to several external libraries and use their
+header files.
+
+=== The Include function ===
+
+The build system has wrapped the usual <tt>include</tt> syntax of
+<tt>gmake</tt> with a function called <tt>Include</tt> which can be
+invoked from within a <tt>Make.include</tt> file.
+
+<pre>
+$(eval $(call Include,$/subdir1/ $/subdir2/))
+</pre>
+
+Its effect is to check if there exists a <tt>Make.include</tt> file
+in each of its directory arguments, and if so, to traverse that file.
+The contents of those <tt>Make.include</tt> files are evaluated
+and added to the current build definitions. The variable <tt>/</tt> is pushed
+and popped appropriately.
+
+=== The MakePath function ===
+
+The build system supplies a function <tt>MakePath</tt> which is
+meant to be called in <tt>Make.include</tt> files to canonicalize pathnames.
+The problem it addresses is the one of <tt>gmake</tt>'s inability to recognize
+the sameness of expressions like <tt>src/../src/foo</tt> and <tt>src/foo</tt>.
+
+Suppose we had a set of files and directories as follows:
+
+<pre>
+X/
+ Make.include Y/ Z/
+X/Y/
+ Make.include y.c
+X/Z/
+ Make.include z.c
+</pre>
+
+Where we build a library <tt>liby.a</tt> in <tt>Y/</tt> which is needed to
+compile the program <tt>z</tt> in <tt>Z/</tt>. The contents of
+<tt>X/Make.include</tt> is
+
+<pre>
+$(eval $(call Include, $/Y/ $/Z/))
+</pre>
+
+and the contents of <tt>Y/Make.include</tt> is
+
+<pre>
+$/.C_SRCS :=$/y.c
+$/.C_LIBS :=$/liby.a
+$/liby.a: $/y.o
+</pre>
+
+Then a natural choice for <tt>Z/Make.include</tt> would be
+
+<pre>
+${Y/} :=$/../Y/
+$/.C_SRCS :=$/z.c
+$/.C_EXES :=$/z
+$/z: $/z.o ${Y/}liby.a
+</pre>
+
+If <tt>liby.a</tt> is already built by the time <tt>z</tt> is
+built, then there is no problem. If not, and if the build
+is invoked in <tt>Z/</tt>, one will get some error about not
+knowing how to build <tt>../Y/liby.a</tt>, which is to be expected.
+However, if the build is invoked in <tt>X/</tt> then one gets
+a similar error about not knowing how to build <tt>X/../Y/liby.a</tt>.
+The build, invoked from <tt>X/</tt>, does know how to build <tt>Y/liby.a</tt>,
+but does not understand that <tt>X/../Y/liby.a</tt> is the same thing.
+
+The function
+
+<pre>
+$(call MakePath,P)
+</pre>
+
+takes a path <tt>P</tt> to an existing directory and returns the
+shortest (redundant dots and double dots collapsed) path to <tt>P</tt>
+relative to the directory of invocation, in a fashion consistent with
+the pathname conventions used elsewhere in the build system (trailing
+'/' and '.' referred to by an empty string).
+
+Thus, the right version of the <tt>Z/Make.include</tt> file is,
+
+<pre>
+${Y/} :=$(call MakePath,$/../Y/)
+$/.C_SRCS :=$/z.c
+$/.C_EXES :=$/z
+$/z: $/z.o ${Y/}liby.a
+</pre>
+
+<tt>MakePath</tt> will issue a warning if the directory sought
+is not found, and return an empty string.
+
+The current implementation of the <tt>MakePath</tt> function is kind of
+kludgey, involving a shell-call to either a C program or a PERL
+program. I have not found a better implementation yet for this
+functionality.
+
+=== Legacy builds ===
+
+It is inevitable, because some parts of the code tree
+came from external sources or are complicated legacy codes, that one
+wants to still be able to integrate the usual '''recursive make'''
+procedure for some directory which circumvents the build system
+and its dependency checking.
+
+Here is an example of a simple <tt>Make.include</tt> which does this.
+
+<pre>
+$(eval $(call MakeRecursive))
+
+$/md5lib/md5c.o: $/.all
+
+$/.all:
+ cd `dirname $@` && ${MAKE} all
+
+$/.real-clean $/.clean:
+ cd `dirname $@` && ${MAKE} clean
+
+$/.install:
+</pre>
+
+The first line calls a special build system function,
+<tt>MakeRecursive</tt> which declares that this <tt>Make.include</tt> file
+is opting out of the usual build system and will define its own
+subtargets. The next line announces a target being supplied,
+the <tt>$/md5lib/md5c.o</tt> object file. This is optional, but
+gives the build system some idea of how to order multiple recursive
+makes based on possible mutual dependencies. The next lines specify
+rules for subdirectory specific subtargets (<tt>all</tt>, <tt>clean</tt>,
+<tt>real-clean</tt>, <tt>install</tt>) all of which are mandatory for
+recursive <tt>Make.include</tt>'s. Each of these rules is just a
+recursive build invocation after changing into the appropriate
+directory, or an empty rule, signifying no action.
+
+
+== Anatomy of the Makefile ==
+
+Here we go line by line through the <tt>Makefile</tt> (CVS revision
+1.29) and discuss the function of every part.
+
+<pre>
+default: all
+</pre>
+
+First a default target is created. The first goal listed is always
+the default target. Typically people use <tt>all</tt> for this. Since
+we do not know what <tt>all</tt> will mean until much later in the file,
+we cannot define <tt>all</tt> yet.
+
+<pre>
+ifndef MAKEFILE/
+ MAKEFILE/ :=$(dir $(firstword $(MAKEFILE_LIST)))
+endif
+</pre>
+
+The auxiliary files are looked for in the directory where the
+<tt>Makefile</tt> was found. We extract this information from the
+built-in variable <tt>MAKEFILE_LIST</tt>. The <tt>MAKEFILE/</tt> variable
+points to the build directory.
+
+<pre>
+ifdef MAKE_COMPILERS
+ include ${MAKE_COMPILERS}
+else
+ include ${MAKEFILE/}Make.compilers
+endif
+</pre>
+
+We load the <tt>Make.compilers</tt> file, which is more of a
+configuration file, since it contains definitions not just of the
+compilers but also of basic utilities and of locations of important
+libraries such as X11 and LAPACK. One design goal was to have
+all platform specifics captured by a single file so
+that porting to a new platform would require only the adjustment
+of this file. This file can be overridden by a user supplied
+<tt>MAKE_COMPILERS</tt> argument, though it is probably a mistake
+to use this feature as anything but a temporary device.
+
+<pre>
+include ${MAKEFILE/}Make.path
+</pre>
+
+The <tt>Make.path</tt> file supplies a crucial utility in canonicalizing
+directory names.
+
+We now begin the directory traversal part, where subdirectories are
+explored and build information is collected.
+
+<pre>
+// :=
+/ :=
+//-RECURSIVE :=
+define MakeRecursive
+//-RECURSIVE :=$$/.
+endef
+</pre>
+
+Three important variables are being initialized here. The variable
+<tt>//</tt> holds the list of all directories which have been traversed
+which have not opted out of the build system. The directories are
+kept in '''dotted''' form (i.e. <tt>.</tt>, <tt>subdir/.</tt>). The
+<tt>//-RECURSIVE</tt> variable holds those directories (in dotted form)
+which have been traversed and have opted out of the build system. The
+variable <tt>/</tt> is the current relative path variable, which is
+meant to be used by traversed <tt>Make.include</tt> files.
+
+<pre>
+define Include
+ $(foreach x,$(strip ${1}),$(call Include_File,$x))
+endef
+
+define Include_File
+ ifeq ($(filter ${1}.,${WITHOUT_}),)
+ ifeq ($(wildcard ${1}Make.include),${1}Make.include)
+ $/.SUBS +=${1}.
+ // +=${1}.
+ ${1}.SUBS :=
+ / :=${1}
+ include ${1}Make.include
+ / :=$/
+ endif
+ endif
+
+endef
+
+ifndef WITHOUT
+ WITHOUT:=
+endif
+WITHOUT_:=$(patsubst %,%.,$(strip ${WITHOUT}))
+</pre>
+
+The normal <tt>include</tt> syntax is wrapped in a function which will
+maintain <tt>/</tt> properly while adding newly traversed directories to
+<tt>//</tt> and keeping track of each directory's children (kept in
+<tt>$/.SUBS</tt>). Each directory
+is traversed if its <tt>Make.include</tt> file exists and it is not in
+the set of specially suppressed directories (contained in <tt>${WITHOUT}</tt>). Traversed directories
+have their <tt>Make.include</tt> files included. Within those
+<tt>Make.include</tt> files, <tt>/</tt> will hold the relative path
+to the directory. The <tt>Include</tt> function is meant for external
+use, while the <tt>Include_File</tt> is a technicality and should not
+be employed except within this file.
+
+<pre>
+$(eval $(call Include_File,$/))
+</pre>
+
+We include the <tt>Make.include</tt> file which sits in the directory
+of invocation (as opposed to the one in the build directory). Since traversal
+starts in this directory, the only build information which
+will be considered is that from this directory and its descendants,
+allowing a user to build within a limited source directory, if they
+do not wish to check lateral dependencies for some reason (e.g.
+efficiency).
+
+<pre>
+// :=$(filter-out ${//-RECURSIVE},${//})
+</pre>
+
+After traversal, <tt>//</tt> holds all directories which have been
+traversed. We now remove from it all those paths which have opted
+out. At this point, <tt>//</tt> holds those traversed directories which are
+considered to be properly participating in the build and
+<tt>//-RECURSIVE</tt> holds those which will be built in a more or less
+'''legacy''' fashion. At this point, <tt>/</tt> should be an
+empty string (even though it does appear below).
+
+A second design goal was the separation of the specification of
+build rules from the primary <tt>Makefile</tt> so that new file types
+and build commands could be added to the build system by appending
+them to <tt>Make.rules</tt>. Actions are dictated by file types.
+
+<pre>
+__SUBGOALS__=
+__DEPGOALS__=
+</pre>
+
+The <tt>__SUBGOALS__</tt> variable is intended to hold all those targets
+which must be made for the <tt>all</tt> target. The <tt>__DEPGOALS__</tt>
+holds patterns for automatic dependency files which are to be included.
+These variables will be
+dynamically scoped (the one exception we make to the usual static scoping).
+This allows for a variable capture which we exploit later. The
+<tt>__SUBGOALS__</tt> and <tt>__DEPGOALS__</tt> variables are
+appended to in the <tt>Make.rules</tt> file.
+
+<pre>
+-include ${MAKEFILE/}Make.rules
+</pre>
+
+If the <tt>Make.rules</tt> file exists in the directory of the
+<tt>Makefile</tt> then it is included. If it does not exist, the system will
+use the default rules built-in to <tt>make</tt>, which have a
+chance of working right (a snowball's chance in hell).
+
+<pre>
+$(eval DEPENDS:=$(foreach x,${//},$(call __DEPGOALS__,$x)))
+ifneq ($(strip ${DEPENDS}),)
+ ifeq ($(filter %-clean,${MAKECMDGOALS}),)
+ include ${DEPENDS}
+ endif
+endif
+</pre>
+
+The <tt>__DEPGOALS__</tt> pattern is evaluated on every directory and
+expanded into a set of files in the variable <tt>DEPENDS</tt>. Unless
+one of the command goals of the build contains the suffix <tt>-clean</tt>
+(<tt>real-clean</tt> or <tt>depends-clean</tt>, but not <tt>clean</tt>),
+these files will be included. The '''clean''' conditional exists to
+prevent certain kinds of wedged conditions the build system could get
+into, as well as to allow the clean targets to be processed without
+building any automatically created <tt>DEPENDS</tt> files.
+
+We now define the standard make targets, which are applied to all
+subdirectory targets. The basic target, <tt>TARG</tt> is also defined
+on a per-subdirectory basis with targets of the form <tt>$/.TARG</tt>
+with <tt>TARG</tt> being nearly an alias for <tt>.TARG</tt> (aside from
+<tt>//-RECURSIVE</tt> directories). This allows
+the user to selectively build only those subgoals which are in a
+single directory. Target <tt>TARG</tt> for <tt>//-RECURSIVE</tt> builds are
+done before the <tt>$/.TARG</tt> target. Building all legacy targets
+first seems like a good idea.
+
+<pre>
+clean: ${//-RECURSIVE:.=.clean} $/.clean
+define .RULE-clean
+${1:.=.clean}: $${${1:.=.SUBS}:.=.clean}
+ ${RM} $${${1:.=.CLEAN}} ${__SUBGOALS__}
+ (cd $1 && ${RM} -r ${C_TMP_COMPILE} ${CXX_TMP_COMPILE})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-clean,$x)))
+</pre>
+
+The <tt>clean</tt> target executes for recursive directories first
+and then for <tt>.clean</tt>. The <tt>$/.clean</tt> target for
+each subdirectory depends on the <tt>$/.clean</tt> target of its
+children and executes by removing those files or patterns which were listed
+in the <tt>$/.CLEAN</tt> variable of that directory, any subgoals of
+that directory, and any temporary compiler files which may have been
+created in that directory (e.g. <tt>so_locations/</tt>).
+
+<pre>
+depends-clean: $/.depends-clean
+${//-RECURSIVE:.=.depends-clean}:
+define .RULE-depends-clean
+${1:.=.depends-clean}: $${${1:.=.SUBS}:.=.depends-clean}
+ ${RM} ${1:.=Make.depends} ${__DEPGOALS__}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-depends-clean,$x)))
+</pre>
+
+Similar to <tt>clean</tt>, except that we remove only the dependency files
+which may have been built to satisfy the earlier <tt>include ${DEPENDS}</tt>
+line.
+
+<pre>
+real-clean: ${//-RECURSIVE:.=.real-clean} $/.real-clean
+define .RULE-real-clean
+${1:.=.real-clean}: $${${1:.=.SUBS}:.=.real-clean}
+ ${RM} $${${1:.=.CLEAN}} ${__SUBGOALS__}
+ (cd $1 && ${RM} -r ${C_TMP_COMPILE} ${CXX_TMP_COMPILE})
+ ${RM} ${1:.=Make.depends} ${__DEPGOALS__}
+ ${RM} $${${1:.=.REAL-CLEAN}}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-real-clean,$x)))
+</pre>
+
+A combination of the previous two <tt>clean</tt> targets.
+
+<pre>
+all: ${//-RECURSIVE:.=.all} $/.all
+define .RULE-all
+${1:.=.all}: $${${1:.=.SUBS}:.=.all} ${__SUBGOALS__}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-all,$x)))
+</pre>
+
+The <tt>all</tt> target depends on all subdirectory <tt>all</tt>'s
+and all subgoals for this directory.
+
+The last major section of the <tt>Makefile</tt> is the installer.
+Installation currently proceeds by depending on the subgoals
+and upon a copy of those built subgoals to a special directory,
+<tt>INSTALL/</tt> which is determined in the <tt>Make.compilers</tt>
+file. Because different directories may wish to do different
+kinds of pre and post installation actions, the <tt>.install</tt>
+targets have been written to provide a number of hooks. It is
+up to the <tt>Make.rules</tt> file to make use of those hooks.
+
+<pre>
+${//-RECURSIVE:.=.install-copy}:
+install-copy: ${//-RECURSIVE:.=.install-copy} $/.install-copy
+define .RULE-install-copy
+${1:.=.install-copy}: $${${1:.=.SUBS}:.=.install-copy}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy,$x)))
+</pre>
+
+All <tt>.install</tt> targets have a <tt>.install-copy</tt> target defined
+which depends on the <tt>.install-copy</tt>'s of the children. A dummy
+target is defined for legacy builds to prevent certain kinds of
+build problems, but it is never normally invoked.
+
+<pre>
+install: ${//-RECURSIVE:.=.install} $/.install
+define .RULE-install
+${1:.=.install}: ${1:.=.all} ${1:.=.install-copy}
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install,$x)))
+</pre>
+
+An install in a directory is equivalent to doing an install in
+the legacy directories, a build in the current directory (and
+its children) and an install copy in the current directory
+(and its children).
+
+The benefit of this separation of tasks for install is that
+the <tt>install-copy</tt> phase of the build can be invoked as a
+separate target to selectively copy targets into the install
+directory, which might be needed in some special cases. One
+major disadvantage of this separation is that because <tt>install-copy</tt>
+does not depend on <tt>all</tt>, a multithreaded invocation of
+<tt>gmake</tt> (i.e. <tt>gmake -j4</tt>) is not guaranteed to perform
+<tt>install-copy</tt> after <tt>all</tt>.
+
+== Anatomy of the Make.rules ==
+
+The execution of commands other than cleaning commands is determined
+by the file <tt>Make.rules</tt> (cvs revision 1.25). This file defines
+file types and actions to be taken to rebuild files.
+
+<pre>
+define .FUN-install-copy
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ echo ":Copying $$$${F} to $$$${Fout}:" ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+define .FUN-install-copy-exe
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F}${.EXE} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ echo ":Copying $$$${F}${.EXE} to $$$${Fout}${.EXE}:" ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout}${.EXE} && cp -fp $$$${F}${.EXE} $$$${Fout}${.EXE} ; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+define .FUN-install-copy-script
+ @ files='$$(strip $1)'; dirs='$$(strip $2)'; sheb='$$(strip $3)'; \
+ if [ -n "$$$${files}" -a -n "$$$${dirs}" ] ; then \
+ for F in $$$${files} ; do \
+ if [ -f $$$${F} ] ; then \
+ for D in $$$${dirs} ; do \
+ Fout=$${INSTALL/}$$$${D}`basename $$$${F}` ; \
+ echo ":Mangling $$$${F} to $$$${Fout}:" ; \
+ mkdir -p `dirname $$$${Fout}` && \
+ rm -f $$$${Fout} && cp -fp $$$${F} $$$${Fout} ; \
+ chmod ugo+x $$$${Fout} && \
+ ${PERL} -npi \
+ -e"if(0==\$$$$i++){s|^#!.*|#! $$$${sheb}|}" $$$${Fout}; \
+ done ; \
+ fi ; \
+ done ; \
+ fi
+endef
+</pre>
+
+These are three similar helper functions.
+The first of these copies its first
+argument, <tt>files</tt>, into all of the directories specified in the
+second argument, <tt>dirs</tt> (which are assumed to be subdirs of
+<tt>INSTALL/</tt>). It checks for existence and tries to create
+directories as it needs. The second function is similar to the first
+but it is for executable binaries, which require a special suffix
+(e.g. <tt>.exe</tt>) on some platforms. The third function is similar to the
+first, but it also takes a third argument <tt>sheb</tt> which is the
+'''shebang''' line for a script. It replaces the shebang line of the
+script with the contents of the <tt>sheb</tt> variable.
+
+The rest of the file is the set of rule blocks, each block dealing with
+a certain file type.
+
+The first section, which is the largest, is the one for C and C++.
+
+<pre>
+__DEPGOALS__ += $$(patsubst %,%.d,$${${1:.=.C_SRCS}})
+ALL_C_DEPS :=$(foreach x,${//},$(patsubst %,%.d,${${x:.=.C_SRCS}}))
+${ALL_C_DEPS}:%.d:%
+ @ echo "making $@"
+ dir=`echo $< | sed -e's~[^/]*$$~~'`; \
+ ${CCDEP} ${CDEPFLAGS} ${CFLAGS} $< | \
+ sed -e"/:/s!^!$${dir}!" > $@
+
+__DEPGOALS__ += $$(patsubst %,%.d,$${${1:.=.CXX_SRCS}})
+ALL_CXX_DEPS :=$(foreach x,${//},$(patsubst %,%.d,${${x:.=.CXX_SRCS}}))
+${ALL_CXX_DEPS}:%.d:%
+ @ echo "making $@"
+ dir=`echo $< | sed -e's~[^/]*$$~~'`; \
+ ${CXXDEP} ${CXXDEPFLAGS} ${CXXFLAGS} $< | \
+ sed -e"/:/s!^!$${dir}!" > $@
+</pre>
+
+This section specifies the compiler dependencies which must be
+detected. Dependency files are made for all source files
+(set to <tt>$/.C_SRCS</tt> and <tt>$/.CXX_SRCS</tt> presumably in
+the <tt>$/Make.include</tt> file). These
+names are added to the <tt>__DEPGOALS__</tt> to be included later
+in the <tt>Makefile</tt>. We also have the rule for constructing
+dependency files from source files.
+
+<pre>
+.PRECIOUS: %.o
+
+%.o: %.c
+ ${-CC} ${CC} ${CFLAGS} ${CFLAGS_COMPILE} -o $@ -c $<
+
+%.o: %.cc
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+%.o: %.cpp
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+%.o: %.C
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} -o $@ -c $<
+
+</pre>
+
+Pattern-driven rules are specified for several kinds of
+object code builds.
+
+<pre>
+ALL_C_EXES :=$(strip $(foreach x,${//},${${x:.=.C_EXES}}))
+${ALL_C_EXES}:
+ ${-CC} ${CC} ${CLDFLAGS} -o $@ $+ ${CLIBS}
+__SUBGOALS__+=$${${1:.=.C_EXES}}
+
+ALL_CXX_EXES :=$(strip $(foreach x,${//},${${x:.=.CXX_EXES}}))
+${ALL_CXX_EXES}:
+ ${-CXX} ${CXX} ${CXXLDFLAGS} -o $@ $+ ${CXXLIBS}
+__SUBGOALS__+=$${${1:.=.CXX_EXES}}
+</pre>
+
+We add to the subgoals the executable programs <tt>$/.C_EXES</tt> and
+<tt>$/.CXX_EXES</tt>. They are constructed by a link command.
+
+<pre>
+define .RULE-install-copy-C-CXX-EXES
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-EXES}
+${1:.=.install-copy-C-CXX-EXES}:
+ $(call .FUN-install-copy-exe,$${${1:.=.C_EXES}} $${${1:.=.CXX_EXES}},bin/)
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-EXES,$x)))
+</pre>
+
+We add to each <tt>.install-copy</tt> rule the action that executable
+binaries be copied to the subdirectory <tt>bin/</tt>.
+
+<pre>
+ALL_C_LIBS :=$(strip $(foreach x,${//},${${x:.=.C_LIBS}}))
+${ALL_C_LIBS}:
+ ${-CC} ${RM} $@ && ${AR} ${ARFLAGS} $@ $^
+__SUBGOALS__+=$${${1:.=.C_LIBS}}
+
+ALL_CXX_LIBS :=$(strip $(foreach x,${//},${${x:.=.CXX_LIBS}}))
+${ALL_CXX_LIBS}:
+ ${-CXX} ${RM} $@ && ${AR} ${ARFLAGS} $@ $^
+__SUBGOALS__+=$${${1:.=.CXX_LIBS}}
+
+${_OS_}_SHLIB_FLAGS:=
+ALL_C_SHLIBS :=$(strip $(foreach x,${//},${${x:.=.C_SHLIBS}}))
+${ALL_C_SHLIBS}:
+ ${-CC} ${RM} $@ && ${CC} ${CLDFLAGS} ${SHLIB_FLAGS} ${${_OS_}_SHLIB_FLAGS} -o $@ $^ ${CLIBS}
+
+ALL_CXX_SHLIBS :=$(strip $(foreach x,${//},${${x:.=.CXX_SHLIBS}}))
+${ALL_CXX_SHLIBS}:
+ ${-CXX} ${RM} $@ && ${CXX} ${CXXLDFLAGS} ${SHLIB_FLAGS} ${${_OS_}_SHLIB_FLAGS} -o $@ $^ ${CXXLIBS}
+__SUBGOALS__+=$${${1:.=.C_SHLIBS}} $${${1:.=.CXX_SHLIBS}}
+</pre>
+
+Additional C,C++ subgoals include libraries and shared libraries. It is
+unfortunate that AIX has a fairly different means of producing shared
+libraries than other operating systems. This is the only place in the
+rules where the <tt>_OS_</tt> variable (defined in <tt>Make.compilers</tt>)
+is a factor in determining the rule. If more situations
+like this arise, it may be necessary to redesign the interactions
+between <tt>Make.rules</tt> and <tt>Make.compilers</tt>.
+
+<pre>
+define .RULE-install-copy-C-CXX-LIBS
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-LIBS}
+${1:.=.install-copy-C-CXX-LIBS}:
+ $(call .FUN-install-copy,$${${1:.=.C_LIBS}} $${${1:.=.CXX_LIBS}}, \
+ lib/$${${1.=.LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-CXX-LIBS,$x)))
+
+define .RULE-install-copy-C-CXX-SHLIBS
+${1:.=.install-copy}: ${1:.=.install-copy-CXX-SHLIBS}
+${1:.=.install-copy-CXX-SHLIBS}:
+ $(call .FUN-install-copy,$${${1:.=.C_SHLIBS}} $${${1:.=.CXX_SHLIBS}}, \
+ lib/$${${1.=.LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-CXX-SHLIBS,$x)))
+</pre>
+
+Libraries and shared libraries are copied to the subdirectory
+<tt>lib/$/.LIB/</tt>, i.e. to <tt>lib/</tt> or some subdirectory of
+<tt>lib/</tt> specified by the variable <tt>$/.LIB/</tt>, which is
+presumably set in the <tt>$/Make.include</tt>.
+
+<pre>
+define .RULE-install-copy-C-CXX-INCS
+${1:.=.install-copy}: ${1:.=.install-copy-C-CXX-INCS}
+${1:.=.install-copy-C-CXX-INCS}:
+ $(call .FUN-install-copy,$${${1:.=.C_INCS}} $${${1:.=.CXX_INCS}}, \
+ include/$${${1:.=.INCLUDE/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-C-CXX-INCS,$x)))
+</pre>
+
+If include files are defined in <tt>$/.C_INCS</tt> or <tt>$/.CXX_INCS</tt>
+then these are copied directly to <tt>include/</tt> or one of its
+subdirectories, specified by <tt>$/.INCLUDE/</tt>.
+
+There is a section which builds ps and pdf documents from
+LaTeX files.
+
+<pre>
+%.dvi: %.tex
+ ${-LATEX} cd `dirname $<` && ${LATEX} `basename $<` && ${LATEX} `basename $<`
+
+%.aux: %.tex
+ ${-LATEX} cd `dirname $<` && ${LATEX} `basename $<` && ${LATEX} `basename $<`
+
+%.bbl: %.aux
+ ${-LATEX} cd `dirname $<` && ${BIBTEX} `basename ${<:.aux=}`
+</pre>
+
+These are the commands to invoke LaTeX, based on file pattern.
+
+<pre>
+ALL_TEX_PS :=$(strip $(foreach x,${//},${${x:.=.TEX_PS}}))
+ALL_TEX_PDF :=$(strip $(foreach x,${//},${${x:.=.TEX_PDF}}))
+
+${ALL_TEX_PS} ${ALL_TEX_PDF:.pdf=.ps}: %.ps: %.dvi
+ ${-LATEX} cd `dirname $<` && ${DVIPS} -o `basename $@` `basename $<`
+
+${ALL_TEX_PDF}: %.pdf: %.ps
+ ${-LATEX} ${PS2PDF} $< $@
+__SUBGOALS__+=$${${1:.=.TEX_PS}} $${${1:.=.TEX_PDF}}
+</pre>
+
+The <tt>$/.TEX_PS</tt> and <tt>$/.TEX_PDF</tt> files are added to the
+subgoals. The commands to actually construct ps and pdf files
+have been defined.
+
+<pre>
+define .RULE-install-copy-TEX_PSPDF
+${1:.=.install-copy}: ${1:.=.install-copy-TEX_PSPDF}
+${1:.=.install-copy-TEX_PSPDF}:
+ $(call .FUN-install-copy,$${${1:.=.TEX_PS}},doc/$${${1:.=.DOC/}})
+ $(call .FUN-install-copy,$${${1:.=.TEX_PDF}},doc/$${${1:.=.DOC/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-TEX_PSPDF,$x)))
+</pre>
+
+The ps and pdf files are copied to <tt>doc/</tt> or the
+<tt>$/.DOC/</tt> subdirectory of <tt>doc/</tt>.
+
+<pre>
+define .RULE-install-copy-PYTHON
+${1:.=.install-copy}: ${1:.=.install-copy-PYTHON}
+${1:.=.install-copy-PYTHON}:
+ $(call .FUN-install-copy-script,$${${1:.=.PY_EXES}},\
+ scripts/,\
+ ${PYTHON} ${PYTHON_FLAGS})
+ $(call .FUN-install-copy,$${${1:.=.PY_LIBS}}, \
+ scripts/$${${1:.=.PY_LIB/}})
+
+endef
+$(eval $(foreach x,${//},$(call .RULE-install-copy-PYTHON,$x)))
+</pre>
+
+Python scripts require only copying, but with the shebang mangling
+on the <tt>$/.PY_EXES</tt> files. The <tt>$/.PY_EXES</tt> files get
+copied (and shebang-ed) to <tt>scripts/</tt>, and the
+<tt>$/.PY_LIBS</tt> get copied to <tt>scripts/$/.PY_LIB/</tt>.
+
+Similar versions of this rule block exist for <tt>perl</tt> and <tt>sh</tt>
+libraries and executables.
+
+== Anatomy of Make.compilers ==
+
+The <tt>Make.compilers</tt> file (cvs revision 1.50) sets many platform
+dependent variables as well as compiling modes such as debugging or
+profiling. Additionally, the paths for various libraries and
+utilities are set. The first part of the file ascertains the platform
+and build mode, and the rest of the file sets variables based on them.
+This is done in blocks broken down by application rather than platform
+or mode. This will not be a line by line walkthrough. Instead we
+will list important variables being set and what they mean for the
+other parts of the build system.
+
+<pre>
+VALID_OPERATING_SYSTEM:=$(strip \
+ TRU64 \
+ AIX \
+ cygwin \
+ SunOS|foster-city \
+ SunOS|francisco's \
+ FreeBSD \
+ FreeBSD|Randy \
+ Linux|RH7 \
+ Linux|RH9 \
+ default|I-will-take-my-chances \
+)
+</pre>
+
+Our current thinking is that a platform consists of both an operating
+system and possible additional specifications. In our current work
+situation, the admins have defined an environment variable called
+<tt>OPERATING_SYSTEM</tt> for us which we now use as a platform
+identifier, despite the obvious misuse of the word. The <tt>VALID_OPERATING_SYSTEM</tt>
+variable lists those values of <tt>OPERATING_SYSTEM</tt> which the build system will respect.
+
+<pre>
+ifdef OPERATING_SYSTEM
+ ifneq ($(filter ${OPERATING_SYSTEM},${VALID_OPERATING_SYSTEM}),)
+ _OS|FULL_:=${OPERATING_SYSTEM}
+ _OS_:=$(filter-out |%, $(subst |, |,${_OS|FULL_}))
+ else
+ $(error You are trying to use the build system on a platform where the \
+environment variable OPERATING_SYSTEM is set to an unrecognized value. \
+You should either set \
+OPERATING_SYSTEM to a recognized value, possibly after editing the \
+Make.compilers file of the build system. Currently, the recognized values \
+for OPERATING_SYSTEM are: ${VALID_OPERATING_SYSTEM} )
+ endif
+else
+ $(error You are trying to use the build system on a platform where the \
+environment variable OPERATING_SYSTEM is not set. You should either set \
+OPERATING_SYSTEM to a recognized value, possibly after editing the \
+Make.compilers file of the build system. Currently, the recognized values \
+for OPERATING_SYSTEM are: ${VALID_OPERATING_SYSTEM} )
+endif
+</pre>
+
+The <tt>OPERATING_SYSTEM</tt> variable is parsed into its major identifier,
+in <tt>_OS_</tt> and its full identifier <tt>_OS|FULL_</tt>. This allows us
+to create conditions for the machine architecture based on <tt>_OS_</tt>
+as well as for the specific installation and auxiliary packages on the
+platform based on <tt>_OS|FULL_</tt>.
+
+If the <tt>OPERATING_SYSTEM</tt> variable is not set correctly, then the
+build will abort. We chose this as opposed to some default behavior because
+we figured that if <tt>OPERATING_SYSTEM</tt> was left unset or there was
+some error in its value, that building with the default definitions, instead
+of being alerted to the problem harshly, would waste a lot of time.
+If the user really wants all default behavior, a value of
+<tt>OPERATING_SYSTEM</tt> exists for that.
+
+<pre>
+_CC_:=
+ifdef WITH_GNU
+ _CC_:=-gcc
+endif
+
+_OPT_:=
+ifeq (${WITH_OPT},debug)
+ _OPT_ :=-debug
+endif
+ifeq (${WITH_OPT},profile)
+ _OPT_ :=-prof
+endif
+
+_THR_:=
+ifdef WITH_THREADS
+ _THR_:=-threaded
+endif
+</pre>
+
+If <tt>_CC_</tt> is set to '-gcc'
+then GNU compilers will be used, and if it is empty then native
+compilers will be used.
+There are three <tt>_OPT_</tt> modes: 'debug', 'profile',
+and '' (normal). There is also a <tt>_THR_</tt> variable which determines
+if the applications are to be compiled with threading. A user wishing
+to build with profiling and threading enabled would do something like
+
+<pre>
+$ gmake WITH_OPT=profile WITH_THREADS=1
+</pre>
+
+to turn these options on. One could also set these variables in the
+environment.
+
+<pre>
+# allow additional tag for install directories
+ifdef INSTALL_TAG
+ INSTALL/:=${MAKEFILE/}${_OS_}${_CC_}${_OPT_}${_THR_}-${INSTALL_TAG}/
+else
+ INSTALL/:=${MAKEFILE/}${_OS_}${_CC_}${_OPT_}${_THR_}/
+endif
+</pre>
+
+The <tt>INSTALL/</tt> directory is set. It is based on the location
+of the <tt>Makefile</tt> and the given tags.
+If the user has defined the variable <tt>INSTALL_TAG</tt> then this
+will be added to the <tt>INSTALL/</tt> directory.
+
+<pre>
+CCDEP :=gcc
+CXXDEP :=g++
+CDEPFLAGS :=-MM -MG
+CXXDEPFLAGS :=-MM -MG
+</pre>
+
+The GNU compilers have much more sophisticated dependency producers than
+the native compilers, so we will use them for all architectures. In
+theory this could cause bugs due to the mismatch between depends and
+build compilers. In practice, it does not.
+
+<pre>
+-CC:=
+-CXX:=
+CC:=gcc
+CXX:=g++
+CFLAGS:=-O2 -g
+CXXFLAGS:=-O2 -g
+CFLAGS_COMPILE:=
+CXXFLAGS_COMPILE:=
+CLDFLAGS:=
+CXXLDFLAGS:=
+CLIBS:=-lm
+CXXLIBS:=-lm
+SHLIB_FLAGS:=-shared
+C_TMP_COMPILE:=
+CXX_TMP_COMPILE:=
+</pre>
+
+Each block begins with the declaration of the variables to be defined
+in that block, set to their default values.
+The <tt>CFLAGS,CXXFLAGS</tt>
+variables are those compile flags which are needed by both the
+dependency check and by the actual compile such as
+include paths. The <tt>CFLAGS_COMPILE</tt>
+and <tt>CXXFLAGS_COMPILE</tt> flags are those which are only needed by
+the actual compile, not by the dependency checker, like debugging
+and profiling flags. The <tt>CLDFLAGS,CXXLDFLAGS</tt> are the flags
+for the compiler when functioning as a loader and are placed on
+the loader command line ahead of the object files. The
+<tt>CLIBS,CXXLIBS</tt> are placed on the loader command line after the
+object files. The role of each of these flags is made clear from the
+<tt>Make.rules</tt> file, where they are used.
+
+The <tt>-VAR</tt> variables prefix all action lines of a given category.
+They are designed to allow the build to ignore errors in those actions.
+This was motivated primarily by the fact that some systems did not have
+some key packages installed like LaTeX.
+
+<pre>
+ifeq (${_OS|FULL_},SunOS|francisco's)
+ -LATEX :=-
+endif
+</pre>
+
+This causes a <tt>-</tt> to appear before any one of the actions in the
+LaTeX section of the <tt>Make.rules</tt> file. That <tt>-</tt> will cause
+the build to ignore any errors in the execution of those actions.
+
+The <tt>Make.compilers</tt> file is not just about defining the compilers
+and interpreters on the system. It is also used to make available certain
+architecture dependent package locations.
+
+<pre>
+CFLAGS_LAPACK :=-DFTN_UNDERSCORE -DFTN_LOWERCASE
+CLDFLAGS_LAPACK :=-L/usr/local/lib
+CLIBS_LAPACK :=-llapack -lblas -lm
+ifeq (${_OS_},TRU64)
+# this seems to work for both gcc and non-gcc
+ CLDFLAGS_LAPACK :=
+ CLIBS_LAPACK :=-ldxml
+endif
+ifeq (${_OS_},AIX)
+ CFLAGS_LAPACK :=-DFTN_LOWERCASE
+ CLDFLAGS_LAPACK :=-L/usr/local/ir/lib
+ CLIBS_LAPACK :=-llapack -lessl -lxlf90
+endif
+# sometimes we deploy on Solaris with CDX
+ifeq (${_OS|FULL_},SunOS|foster-city)
+ CLDFLAGS_LAPACK :=-L/home/ross/local/lib
+ CLIBS_LAPACK :=-llapack -lblas -lF77
+endif
+# sometimes we deploy on Solaris on Francisco's machines
+ifeq (${_OS|FULL_},SunOS|francisco's)
+ CLIBS_LAPACK :=-llapack -lblas -lF77
+endif
+CXXFLAGS_LAPACK :=${CFLAGS_LAPACK}
+CXXLDFLAGS_LAPACK :=${CLDFLAGS_LAPACK}
+CXXLIBS_LAPACK :=${CLIBS_LAPACK}
+</pre>
+
+Some modules use LAPACK. Although these variables do not get used
+in any of the rules of the build system, we define them in
+<tt>Make.compilers</tt> so that they can be used in the various
+<tt>Make.include</tt> files which need LAPACK. This is one case
+where the <tt>_OS|FULL_</tt> is useful, since different platforms
+install LAPACK in all sorts of ways.
+
+<pre>
+PYTHON :=$(shell which python)
+PYTHON_FLAGS :=
+CFLAGS_PY :=-I/usr/local/include/python
+ifeq (${_OS_}${_CC_},TRU64)
+ CFLAGS_PY :=-I/usr/local/ir/Python-2.2.2 -I/usr/local/ir/Python-2.2.2/Include
+ PYTHON :=/usr/local/ir/bin/python
+endif
+ifeq (${_OS_}${_CC_},AIX)
+ CFLAGS_PY :=-I/usr/local/include/python2.2
+ PYTHON :=/usr/local/bin/python
+endif
+ifeq (${_OS_},cygwin)
+ CFLAGS_PY :=-I/usr/include/python2.3
+endif
+CXXFLAGS_PY :=${CFLAGS_PY}
+</pre>
+
+Python and its paths for the known architectures are determined here. On
+unknown architectures we guess where python is based on the user's path.
+
+== Acknowledgements ==
+
+The guilty parties who gave me advice are Nathan Edwards, Dan Fasulo,
+Bjarni Halldorsson, and Clark Mobarry.
+
+== Author ==
+
+Ross Lippert, ripper at ..., 17 Oct 2003.
diff --git a/README.compiling b/README.compiling
new file mode 100644
index 0000000..f8acebb
--- /dev/null
+++ b/README.compiling
@@ -0,0 +1,104 @@
+This guide tells how to compile and install the software.
+
+----------------------------------------
+Quick Instructions:
+
+% gmake install
+
+There! That wasn't tough at all, was it!?
+
+The software is compiled in place, and installed into a directory named
+after the OS/architecture, for example, Linux-amd64.
+
+ESTmapper and A2Amapper are NOT installed by this process; see section 3
+below.
+
+----------------------------------------
+Detailed Instructions:
+
+0) Required Software
+1) Configuration
+2) Compilation
+3) Installation
+4) Other build targets
+
+----------
+0) Required software
+
+Project kmer requires two additional software packages be installed:
+python and gmake.
+
+0.1) Python.
+
+Python (http://www.python.org/) is a freely available programming
+language. It is frequently installed by many OS installations.
+
+Version 2.4+ is recommended.
+Version 2.3 has seen limited testing and seems to work.
+Version 2.2 might work, but is unsupported.
+
+Python is only needed by ATAC/A2Amapper. If python is not installed,
+ATAC/A2Amapper will not be built.
+
+0.2) gmake.
+
+The GNU make program (gmake) is used to build the software. The BSD
+make will not work.
+
+Version 3.81 is strongly recommended.
+Version 3.80 works, but needs to be patched. See build/patches/README.
+
+----------
+1) Configuration
+
+This is optional. It allows compilation for debugging and profiling.
+
+% sh configure.sh [debug | profile]
+
+Supplying 'debug' as an argument will build debuggable executables.
+
+Supplying 'profile' as an argument will build profiling executables.
+
+Not all architectures support profiling.
+
+If configure.sh reports that your architecture is unsupported, you'll
+have to port...or force it to use, say, linux with "configure.sh
+linux".
+
+If configure.sh reports that python cannot be found, you likely need
+to install python, version 2.3 or 2.4. If you have python installed
+in an unusual location, edit the script.
+
+----------
+2) Compilation
+
+% gmake
+
+gmake v 3.81 or higher is REQUIRED.
+
+If this crashes or returns
+ gmake: *** No rule to make target `.all', needed by `all'. Stop.
+then you unfortunately need to update your gmake to version 3.81.
+
+----------
+3) Installation
+
+'gmake install' will copy all the executables into an OS/architecture
+specific 'bin' directory, for example, into FreeBSD-amd64/bin or
+Linux-i686/bin. ESTmapper and ATAC/A2Amapper are NOT installed by this
+process.
+
+To install ESTmapper, 'cd ESTmapper && sh install.sh location'. This
+will copy the ESTmapper binaries to 'location/bin' and 'location/lib'.
+
+To install ATAC/A2Amapper, 'cd atac-driver && sh install.sh location'.
+This will copy the ATAC/A2Amapper binaries to 'location/bin' and
+'location/lib'.
+
+----------
+4) Other build targets
+
+'gmake clean' will remove the object files, leaving the binaries.
+
+'gmake real-clean' will remove all traces of a build, leaving you
+with (hopefully) a virgin copy of the software.
diff --git a/README.leaff b/README.leaff
new file mode 100644
index 0000000..4ecc814
--- /dev/null
+++ b/README.leaff
@@ -0,0 +1,205 @@
+LEAFF, leaff - sequence library utilities and applications
+
+Described in the publication:
+
+B. Walenz, L. Florea (2010) Sim4db and leaff: Utilities for fast batch
+spliced alignment and sequence indexing, submitted.
+
+
+Copyright (C) 2002, and GNU GPL, PE Corporation (NY) through the Celera Genomics Group
+Copyright (C) 2003-2004, and GNU GPL, Applied Biosystems
+Copyright (C) 2004-2010, and GNU GPL, Brian Walenz
+
+Includes portions copyright from:
+
+kmer - Copyright (C) 2004-2010, and GNU GPL, by Brian Walenz
+
+=======================================================================
+
+Content:
+I. What is leaff?
+II. Guide to using leaff
+III. Examples
+IV. Terms of use
+V. Support
+
+I. What is leaff?
+
+LEAFF (Let's Extract Anything From Fasta) is a utility program for
+working with multi-fasta files. In addition to providing random access
+to the base level, it includes several analysis functions.
+
+II. Guide to using leaff
+
+leaff [-f fasta-file] [options]
+
+SOURCE FILES
+ -f file: use sequence in 'file' (-F is also allowed for historical reasons)
+ -A file: read actions from 'file'
+
+SOURCE FILE EXAMINATION
+ -d: print the number of sequences in the fasta
+ -i name: print an index, labelling the source 'name'
+
+OUTPUT OPTIONS
+ -6 <#>: insert a newline every 60 letters
+ (if the next arg is a number, newlines are inserted every
+ n letters, e.g., -6 80. Disable line breaks with -6 0,
+ or just don't use -6!)
+ -e beg end: Print only the bases from position 'beg' to position 'end'
+ (space based, relative to the FORWARD sequence!) If
+ beg == end, then the entire sequence is printed. It is an
+ error to specify beg > end, or beg > len, or end > len.
+ -ends n Print n bases from each end of the sequence. One input
+ sequence generates two output sequences, with '_5' or '_3'
+ appended to the ID. If 2n >= length of the sequence, the
+ sequence itself is printed, no ends are extracted (they
+ overlap).
+ -C: complement the sequences
+ -H: DON'T print the defline
+ -h: Use the next word as the defline ("-H -H" will reset to the
+ original defline)
+ -R: reverse the sequences
+ -u: uppercase all bases
+
+SEQUENCE SELECTION
+ -G n s l: print n randomly generated sequences, 0 < s <= length <= l
+ -L s l: print all sequences such that s <= length < l
+ -N l h: print all sequences such that l <= % N composition < h
+ (NOTE 0.0 <= l < h < 100.0)
+ (NOTE that you cannot print sequences with 100% N
+ This is a useful bug).
+ -q file: print sequences from the seqid list in 'file'
+ -r num: print 'num' randomly picked sequences
+ -s seqid: print the single sequence 'seqid'
+ -S f l: print all the sequences from ID 'f' to 'l' (inclusive)
+ -W: print all sequences (do the whole file)
+
+LONGER HELP
+ -help analysis
+ -help examples
+
+ANALYSIS FUNCTIONS
+ --findduplicates a.fasta
+ Reports sequences that are present more than once. Output
+ is a list of pairs of deflines, separated by a newline.
+
+ --mapduplicates a.fasta b.fasta
+ Builds a map of IIDs from a.fasta and b.fasta that have
+ identical sequences. Format is "IIDa <-> IIDb"
+
+ --md5 a.fasta:
+ Don't print the sequence, but print the md5 checksum
+ (of the entire sequence) followed by the entire defline.
+
+ --partition prefix [ n[gmk]bp | n ] a.fasta
+ --partitionmap [ n[gmk]bp | n ] a.fasta
+ Partition the sequences into roughly equal size pieces of
+ size nbp, nkbp, nmbp or ngbp; or into n roughly equal sized
+ partitions. Sequences larger than the partition size are
+ in a partition by themselves. --partitionmap writes a
+ description of the partition to stdout; --partition creates
+ a fasta file 'prefix-###.fasta' for each partition.
+ Example: -F some.fasta --partition parts 130mbp
+ -F some.fasta --partition parts 16
+
+ --segment prefix n a.fasta
+ Splits the sequences into n files, prefix-###.fasta.
+ Sequences are not reordered; the first n sequences are in
+ the first file, the next n in the second file, etc.
+
+ --gccontent a.fasta
+ Reports the GC content over a sliding window of
+ 3, 5, 11, 51, 101, 201, 501, 1001, 2001 bp.
+
+ --testindex a.fasta
+ Test the index of 'file'. If index is up-to-date, leaff
+ exits successfully, else, leaff exits with code 1. If an
+ index file is supplied, that one is tested, otherwise, the
+ default index file name is used.
+
+ --dumpblocks a.fasta
+ Generates a list of the blocks of N and non-N. Output
+ format is 'base seq# beg end len'. 'N 84 483 485 2' means
+ that a block of 2 N's starts at space-based position 483
+ in sequence ordinal 84. A '.' is the end of sequence
+ marker.
+
+ --errors L N C P a.fasta
+ For every sequence in the input file, generate new
+ sequences including simulated sequencing errors.
+ L -- length of the new sequence. If zero, the length
+ of the original sequence will be used.
+ N -- number of subsequences to generate. If L=0, all
+ subsequences will be the same, and you should use
+ C instead.
+ C -- number of copies to generate. Each of the N
+ subsequences will have C copies, each with different
+ errors.
+ P -- probability of an error.
+
+ HINT: to simulate ESTs from genes, use L=500, N=10, C=10
+ -- make C=10 sequencer runs of N=10 EST sequences
+ of length 500bp each.
+ to simulate mRNA from genes, use L=0, N=10, C=10
+ to simulate reads from genomes, use L=800, N=10, C=1
+ -- of course, N= should be increased to give the
+ appropriate depth of coverage
+
+ --stats a.fasta
+ Reports size statistics; number, N50, sum, largest.
+
+ --seqstore out.seqStore
+ Converts the input file (-f) to a seqStore file (for instance,
+ for use with the Celera assembler or sim4db).
+
+
+NOTES:
+1. Please note that options are ORDER DEPENDENT. Sequences are printed
+ whenever a SEQUENCE SELECTION option occurs on the command line. OUTPUT
+ OPTIONS are not reset when a sequence is printed.
+2. SEQUENCES are numbered starting at ZERO, not one!
+
+
+III. Examples
+
+1. Print the first 10 bases of the fourth sequence in file 'genes':
+ leaff -f genes -e 0 10 -s 3
+
+2. Print the first 10 bases of the fourth and fifth sequences:
+ leaff -f genes -e 0 10 -s 3 -s 4
+
+3. Print the fourth and fifth sequences reverse complemented, and the sixth
+ sequence forward. The second set of -R -C toggle off reverse-complement:
+ leaff -f genes -R -C -s 3 -s 4 -R -C -s 5
+
+4. Convert file 'genes' to a seqStore 'genes.seqStore'.
+ leaff -f genes --seqstore genes.seqStore
+
+
+IV. Terms of use
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received (LICENSE.txt) a copy of the GNU General
+Public License along with this program; if not, you can obtain one from
+http://www.gnu.org/licenses/gpl.txt or by writing to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+V. Support
+
+Brian Walenz (brianwalenz at users.sourceforge.net). Please check the
+parent project's Sourceforge page at http://kmer.sourceforge.net for
+details and updates.
+
+
+Last updated: Jan 19, 2011
diff --git a/README.sim4db b/README.sim4db
new file mode 100644
index 0000000..4bb4b96
--- /dev/null
+++ b/README.sim4db
@@ -0,0 +1,312 @@
+sim4db - batch spliced alignment of cDNA (EST, mRNA) sequences to a
+ target genome, of the same or a related species
+
+Described in the publication:
+
+B. Walenz, L. Florea (2010) Sim4db and leaff: Utilities for fast batch
+spliced alignment and sequence indexing, submitted.
+
+
+Copyright (C) 2002, and GNU GPL, PE Corporation (NY) through the Celera Genomics Group
+Copyright (C) 2003-2004, and GNU GPL, Applied Biosystems
+Copyright (C) 2004-2009, and GNU GPL, Brian Walenz
+Copyright (C) 2010, and GNU GPL, Brian Walenz, Liliana Florea
+
+Includes portions copyright from:
+
+kmer - Copyright (C) 2005-2010, and GNU GPL, by Brian Walenz
+sim4 - Copyright (C) 1998-2003, and GNU GPL, by Liliana Florea
+sim4cc - Copyright (C) 2009-2010, and GNU GPL, by Liliana Florea and Leming Zhou
+GeneSplicer- Copyright (C) 2001-2009, and GNU GPL, by Mihaela Pertea
+Glimmer - Copyright (C) 1998-2009, and GNU GPL, by Arthur Delcher
+
+=======================================================================
+
+Content:
+I. What is sim4db?
+II. Command line usage
+III. Input/Output
+IV. Affiliated tools
+V. Terms of use
+VI. Support
+
+I. What is sim4db?
+
+Sim4db performs fast batch alignment of large cDNA (EST, mRNA) sequence
+sets to a set of eukaryotic genomic regions. It uses the sim4 and sim4cc
+algorithms to determine the alignments, but incorporates a fast sequence
+indexing and retrieval mechanism, implemented in the sister package
+'leaff', to speedily process large volumes of sequences.
+
+While sim4db produces alignments in the same way as sim4 or sim4cc, it
+has additional features to make it more amenable for use with whole-genome
+annotation pipelines. A script file can be used to group pairings between
+cDNAs and their corresponding genomic regions, to be aligned as one run
+and using the same set of parameters. Sim4db also optionally reports more
+than one alignment for the same cDNA within a genomic region, as long
+as they meet user-defined criteria such as minimum length, percentage
+sequence identity or coverage. This feature is instrumental in finding
+all alignments of a gene family at one locus. Lastly, the output is
+presented either as custom sim4db alignments or as GFF3 gene features.
+
+II. Command line usage
+
+A simple command line invocation:
+
+sim4db -genomic g.fasta -cdna c.fasta -script script -output o.sim4db
+
+where:
+ - 'c.fasta' and 'g.fasta' are the multi-fasta cDNA and genome sequence files
+ - 'script' is a script file indicating individual alignments to be computed
+ - output in sim4db format will be sent to the file 'o.sim4db' ('-' for standard output)
+
+
+A more complex invocation:
+
+sim4db -genomic g.fasta -cdna c.fasta -output o.sim4db [options]
+
+Salient options:
+ -cdna use these cDNA sequences (multi-fasta file)
+ -genomic use these genomic sequences (multi-fasta file)
+ -script use this script file
+ -pairwise sequentially align pairs of sequences
+
+ If neither the '-script' nor the '-pairwise' option
+ is specified, sim4db performs all-against-all
+ alignments between pairs of cDNA and genomic sequences.
+
+ -output write output to this file
+ -gff3 report output in GFF3 format
+ -interspecies use sim4cc for inter-species alignments (default sim4)
+
+Filter options:
+ -mincoverage iteratively find all exon models with the specified
+ minimum PERCENT COVERAGE
+ -minidentity iteratively find all exon models with the specified
+ minimum PERCENT EXON IDENTITY
+ -minlength iteratively find all exon models with the specified
+ minimum ABSOLUTE COVERAGE (number of bp matched)
+ (default 0)
+ -alwaysreport always report <number> exon models, even if they
+ are below the quality thresholds
+
+ If no mincoverage or minidentity or minlength is given, only
+ the best exon model is returned. This is the DEFAULT operation.
+
+ You will probably want to specify ALL THREE of mincoverage,
+ minidentity and minlength! Don't assume the default values
+ are what you want!
+
+ You will DEFINITELY want to specify at least one of mincoverage,
+ minidentity and minlength with alwaysreport! If you don't,
+ mincoverage will be set to 90 and minidentity to 95 -- to reduce
+ the number of spurious matches when a good match is found.
+
+Auxiliary options:
+ -nodeflines don't include the defline in the sim4db output
+ -alignments print alignments
+
+ -polytails DON'T mask poly-A and poly-T tails
+ -cut trim marginal exons if A/T % > x (poly-AT tails)
+
+ -noncanonical don't force canonical splice sites
+ -splicemodel use the following splice model: 0 - original sim4;
+ 1 - GeneSplicer; 2 - Glimmer; options 1 and 2 are
+ only available with '-interspecies'.
+ Default for sim4 is 0, and for sim4cc is 1.
+
+ -forcestrand Force the strand prediction to always be
+ one of 'forward' or 'reverse'
+
+Execution options:
+ -threads Use n threads.
+ -touch create this file when the program finishes execution
+
+Debugging options:
+ -v print status to stderr while running
+ -V print script lines (stderr) as they are being processed
+
+Developer options:
+ -Z set the spaced seed pattern
+ -H set the relink weight factor (H=1000 recommended for mRNAs)
+ -K set the first MSP threshold
+ -C set the second MSP threshold
+ -Ma set the limit of the number of MSPs allowed
+ -Mp same, as percentage of bases in cDNA
+ NOTE: If used, both -Ma and -Mp must be specified!
+
+
+III. Input/Output
+
+For a typical run, sim4db takes as input two multi-fasta files containing
+the cDNAs and the genome, respectively, and optionally a script describing
+a set of pairings among the sequences. Alignments are determined using
+the program sim4 (default) for same-species comparisons, or sim4cc when
+the '-interspecies' option is set. The output is reported in the compact
+sim4db format (default), or in GFF3 format with the '-gff3' option.
+Utilities for filtering, merging, sorting and processing polishes in
+these formats, and for converting between the two formats (lossy),
+are included with the package and described in section IV below.
+
+A. The input script file format
+
+[-f|-r] -e cDNAidx -D GENidx GENlo GENhi
+where:
+cDNAidx internal index of the cDNA in the input cDNA fasta file
+ (0..#cDNAseqs-1)
+GENidx internal index of the genomic sequence in the input genome
+ file (0..#GENseqs-1)
+-f use the cDNA sequence as is
+-r use the reverse complement of the cDNA sequence
+GENlo, GENhi begin and end coordinates of the target genomic region;
+ coordinates are 0-based
+
+
+Example:
+-f -e 61728 -D 0 2370482 2375224
+-r -e 61730 -D 0 6723331 6757701
+-r -e 61734 -D 1 8428517 8432981
+-f -e 61736 -D 3 4600260 4637694
+etc.
+
+For best performance, the script should be sorted by the genomic
+sequence index.
+
+
+B. The sim4db output format
+
+sim4begin
+cDNAidx[cDNAlen-pA-pT] GENidx[GENoff-GENlen] <M-N-P-O-S>
+edef=cDNA defline
+ddef=genomic defline
+cDNAbgn1-cDNAend1 (GENbgn1-GENend1) <M-N-P> intronOri
+cDNAbgn2-cDNAend2 (GENbgn2-GENend2) <M-N-P> intronOri
+...
+cDNAbgnn-cDNAendn (GENbgnn-GENendn) <M-N-P> intronOri
+cDNA alignment sequence for exon #1
+genomic alignment sequence for exon #1
+cDNA alignment sequence for exon #2
+genomic alignment sequence for exon #2
+...
+cDNA alignment sequence for exon #n
+genomic alignment sequence for exon #n
+sim4end
+
+where:
+cDNAidx internal index of the cDNA in the input cDNA fasta file
+cDNAlen length of the cDNA sequence
+pA, pT lengths of the polyA and polyT tails detected and masked
+GENidx internal index of the genomic sequence in the genome fasta file
+GENoff offset to the beginning of the genomic region containing the signal
+GENlen length of the genomic region containing the signal
+M number of nucleotide matches in the alignment
+N number of matching N's in the alignment
+P percent sequence identity of the alignment
+O match orientation:
+
+ * forward - the cDNA sequence aligns to the genomic sequence directly
+ * complement - the reverse complement of the cDNA sequence matches
+ the genomic sequence; this is the equivalent of the
+ sim4 '(complement)' output line
+
+S strand predicted based on the splice signals and alignment quality:
+
+ * forward - predicted forward strand
+ * reverse - predicted reverse strand
+ * unknown - strand unknown (because of low alignment quality,
+ single exon match, or weak splice signals)
+
+cDNAbgni start position of exon i in the cDNA sequence
+cDNAendi end position of exon i in the cDNA sequence
+GENbgni start position of exon i in the genomic sequence (interval GENlo-GENhi)
+GENendi end position of exon i in the genomic sequence (interval GENlo-GENhi)
+M number of nucleotide matches in the alignment
+N number of matching N's in the alignment
+P percent sequence identity of the alignment
+intronOri predicted orientation of the intron:
+
+ * -> forward (i.e., GT-AG-like splice signals)
+ * <- reverse (i.e., CT-AC-like splice signals)
+ * -- ambiguous
+ * == gap (unaligned portion) in the cDNA sequence
+
+Exon coordinates are nucleotide based, starting from 1. Genomic
+coordinates are always in the original sequence, while the cDNA
+coordinates will refer to positions in the reverse complement of the
+sequence if the match orientation is indicated as 'complement'.
+
+Lowercase letters in the alignment lines indicate positions with
+matching nucleotides, '-' indicates a gap in the corresponding sequence,
+and uppercase letters mark either substitutions, or gaps in the other
+sequence. Alignments are OPTIONAL.
+
+Example:
+
+sim4begin
+61728[685-0-21] 0[2370482-4742] <651-0-97-forward-reverse>
+edef=gb|CA807305 D. melanogaster cDNA 3' similar to CT12127, mRNA sequence
+ddef=arm_2L
+22-337 (2372455-2372770) <313-0-99> <-
+338-584 (2372830-2373076) <238-0-95> <-
+585-685 (2373134-2373234) <100-0-99>
+gtaaaaaTttctgtttatta...gggcgaccagaagtcaatcag
+gtaaaaaGttctgtttatta...gggcgaccagaagtcaatcag
+ggtaacttgtccttGggtgc...ccacaccgGctccca-ttcgcgtAtc
+ggtaacttgtccttTggtgc...ccacaccgCctcccaGttcgcgtTtc
+tgcaagcggtcgacatgagg...cttaaAgcgctggta
+tgcaagcggtcgacatgagg...cttaaCgcgctggta
+sim4end
+
+C. The GFF3 output format
+
+The same example as before:
+
+0:arm_2L sim4db mRNA 2372455 2373234 97 - . ID=sim4db10;Name=61728:gb|CA807305;Target=61728:gb|CA807305 22 685 +;pA=0;pT=21;genRegion=2370482-2375224
+0:arm_2L sim4db exon 2372455 2372770 99 - . Parent=sim4db10;Target=61728:gb|CA807305 22 337 +;Gap=M316;nMatches=313;intron=<-
+0:arm_2L sim4db exon 2372830 2373076 95 - . Parent=sim4db10;Target=61728:gb|CA807305 338 584 +;Gap=M74 D1 M2 I1 M160 D1 M10;nMatches=238;intron=<-
+0:arm_2L sim4db exon 2373134 2373234 99 - . Parent=sim4db10;Target=61728:gb|CA807305 585 685 +;Gap=M101;nMatches=100
+
+(Columns are tab-separated.)
+
+IV. Affiliated tools
+
+The 'sim4dbutils' package contains a range of utilities to work with
+sim4db-generated alignment files, of particular note being:
+
+convertPolishes - convert between the two formats. With GFF3->sim4db
+ conversion, alignments will be lost.
+filterPolishes - filter alignments based on minimum percentage sequence
+ identity, coverage and length.
+mergePolishes - merge alignments from multiple files (also concatenates
+ the cDNA fasta files)
+sortPolishes - sort alignments by cDNA or genomic sequence index,
+ using a limited amount of memory if needed.
+
+
+V. Terms of use
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received (LICENSE.txt) a copy of the GNU General
+Public License along with this program; if not, you can obtain one from
+http://www.gnu.org/licenses/gpl.txt or by writing to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+VI. Support
+
+Brian Walenz (high-throughput; brianwalenz at users.sourceforge.net) and
+Liliana Florea (sim4/sim4cc; florea at users.sourceforge.net). Please
+check the parent project's Sourceforge page at http://kmer.sourceforge.net
+for details and updates.
+
+
+Last updated: Jan 19, 2011
diff --git a/atac-driver/Make.include b/atac-driver/Make.include
new file mode 100644
index 0000000..0a44e5d
--- /dev/null
+++ b/atac-driver/Make.include
@@ -0,0 +1,17 @@
+# -*- makefile -*-
+# Build description for atac-driver: includes each component directory, then lists this directory's Perl scripts.
+$(eval $(call Include,$/libatac/))
+# NOTE(review): Include is not defined in this file; presumably it loads the named directory's Make.include -- confirm.
+$(eval $(call Include,$/alignOverlap/))
+$(eval $(call Include,$/gapShifter/))
+$(eval $(call Include,$/lengthFilter/))
+$(eval $(call Include,$/matchExtender/))
+$(eval $(call Include,$/mismatchCounter/))
+$(eval $(call Include,$/statsGenerator/))
+$(eval $(call Include,$/uniqueFilter/))
+$(eval $(call Include,$/clumpMaker/))
+$(eval $(call Include,$/chainer/))
+$(eval $(call Include,$/chimera/))
+# Perl scripts that belong to this directory.
+$/.PERL_EXES := $/atac.pl \
+ $/makeplot.pl
diff --git a/atac-driver/alignOverlap/Make.include b/atac-driver/alignOverlap/Make.include
new file mode 100644
index 0000000..d7dd548
--- /dev/null
+++ b/atac-driver/alignOverlap/Make.include
@@ -0,0 +1,35 @@
+# -*- makefile -*-
+# Resolved paths of the libraries used here, and of this directory itself.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+THIS/ :=$(realpath $/)/
+
+$/.CXX_SRCS := $/overlap.C $/overlap-sort.C $/overlap-printAnno.C $/overlap-find.C
+$/.CXX_EXES := $/overlap
+
+$/.CLEAN :=$/*.o $/*~ $/core $/overlap-process1.C $/overlap-process2.C
+# The overlap executable; overlap-process1.o and overlap-process2.o are built from the generated sources below.
+$/overlap: $/overlap.o \
+ $/overlap-find.o \
+ $/overlap-matchTree.o \
+ $/overlap-printAnno.o \
+ $/overlap-sort.o \
+ $/overlap-process1.o \
+ $/overlap-process2.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+
+# overlap-process1.C and overlap-process2.C are hard links (ln -f) to overlap-process.C.  Symlinks were
+# tried and got either the source or the destination wrong; hard links work but are less obvious to a reader.
+# Each copy is compiled with its own INDEX and NAME, and with POS1/POS2/LEN2 bound to opposite sides.
+$/overlap-process1.o: CXXFLAGS+=-DINDEX=1 -DNAME=process1 -DPOS1=pos1 -DPOS2=pos2 -DLEN2=len2
+$/overlap-process1.C: $/overlap-process.C
+ ln -f ${THIS/}overlap-process.C ${THIS/}overlap-process1.C
+
+$/overlap-process2.o: CXXFLAGS+=-DINDEX=2 -DNAME=process2 -DPOS1=pos2 -DPOS2=pos1 -DLEN2=len1
+$/overlap-process2.C: $/overlap-process.C
+ ln -f ${THIS/}overlap-process.C ${THIS/}overlap-process2.C
+# Header search paths added for the .d and .o targets of this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/alignOverlap/findDifferentScaffold.pl b/atac-driver/alignOverlap/findDifferentScaffold.pl
new file mode 100644
index 0000000..0b25c8e
--- /dev/null
+++ b/atac-driver/alignOverlap/findDifferentScaffold.pl
@@ -0,0 +1,129 @@
+#!/usr/bin/perl
+
+use strict;
+
+# Examines an atac mapping, counts the number of times a scaffold is
+# mapped to wildly different places -- wildly being more than a few
+# bp away on the same chromosome (mind the gap, please!) or (gasp!) a
+# different chromosome.
+#
+# Assumes that the Aannotation primary axis is chromosomes.
+#
+# Change the leading character class ([1YN!]) of the first m// below to restrict
+# to specific types of regions, e.g., N's. Useful choices here are:
+# . - all regions
+# U - unmapped (will do nothing)
+# 1 - has only one mapping
+# Y - just those they agree on
+# N - disagree, but on the same destination
+# ! - disagree, and on different destinations
+# ? - inconsistent mapping (OK, this one isn't useful)
+#
+# 1Y - the regions that have no disagreement
+# 1YN! - all consistent regions
+
+if (scalar(@ARGV) != 2) {    # exactly two arguments are required
+  print STDERR "usage: $0 some.Aannotation outprefix\n";
+  exit(1);
+}
+
+my $filename = shift @ARGV;     # the annotation file to read
+my $outprefix = shift @ARGV;    # prefix of the two output files
+
+# scafA is really scaffolds-from-map1, and scafB is scaffolds-from-map2.
+# *_to_chr records the first chromosome seen for a scaffold; *_to_chr_mismatch collects the conflicting ones.
+my (%scafA_to_chr, %scafA_to_chr_mismatch);
+my (%scafB_to_chr, %scafB_to_chr_mismatch);
+
+open(F, "< $filename") or die "Failed to open '$filename' for reading: $!\n";
+while (<F>) {
+  chomp;
+  # Only lines of type 1, Y, N or ! are used.  Captures: the reference span, then the two mapped spans.
+  if (m/^[1YN!]\s+(\d+):(\d+)-(\d+)\[\s*\d+\].*\s(\d+):\s*(\d+)-\s*(\d+)\).*\s(\d+):\s*(\d+)-\s*(\d+)\)/) {
+    my ($id1, $b1, $e1) = ($1, $2, $3);
+    my ($id2a, $b2a, $e2a) = ($4, $5, $6);
+    my ($id2b, $b2b, $e2b) = ($7, $8, $9);
+
+    # If we have a mapping from method A or method B, save the
+    # chromosome that the scaffold mapped to. If we've already
+    # mapped this scaffold to some other chromosome, call it a
+    # mismatch.
+    # The saved value is "id\0line"; the != tests below compare numerically, so only the leading id counts.
+    if (($id2a > 0) && ($e2a > 0)) {
+      if (defined($scafA_to_chr{$id2a})) {
+        if ($scafA_to_chr{$id2a} != $id1) {
+          $scafA_to_chr_mismatch{$id2a} = $scafA_to_chr{$id2a} if (! defined($scafA_to_chr_mismatch{$id2a}));
+          $scafA_to_chr_mismatch{$id2a} .= "\1$id1\0$_";
+        }
+      } else {
+        $scafA_to_chr{$id2a} = "$id1\0$_";
+      }
+    }
+
+    if (($id2b > 0) && ($e2b > 0)) {
+      if (defined($scafB_to_chr{$id2b})) {
+        if ($scafB_to_chr{$id2b} != $id1) {
+          $scafB_to_chr_mismatch{$id2b} = $scafB_to_chr{$id2b} if (! defined($scafB_to_chr_mismatch{$id2b}));
+          $scafB_to_chr_mismatch{$id2b} .= "\1$id1\0$_";
+        }
+      } else {
+        $scafB_to_chr{$id2b} = "$id1\0$_";
+      }
+    }
+  }
+}
+close(F);
+
+# Count the scaffolds that are keys of both *_mismatch hashes.
+#
+my %merge;
+my $both = 0;    # start at zero so the report prints "0" instead of an empty value
+foreach my $f (keys %scafA_to_chr_mismatch) {
+  $merge{$f}++;
+}
+foreach my $f (keys %scafB_to_chr_mismatch) {
+  $merge{$f}++;
+}
+foreach my $f (keys %merge) {
+  if (defined($scafA_to_chr_mismatch{$f}) && defined($scafB_to_chr_mismatch{$f})) {
+    $both++;
+  }
+}
+
+print "num scafA: ", scalar(keys %scafA_to_chr_mismatch), "\n";
+print "num scafB: ", scalar(keys %scafB_to_chr_mismatch), "\n";
+print "num both: ", $both, "\n";
+
+# Run through the input again and write out every match that involves a
+# scaffold found above to map to more than one chromosome.  The keys of
+# the *_mismatch hashes are the iids of those scaffolds, so each line's
+# scaffold iids are simply looked up in them.  Output is piped through
+# sort into one file per map.
+# NOTE(review): each hash is tested against BOTH $id2a and $id2b -- confirm that is intended.
+open(A, "| sort -k6n -k7n > $outprefix.scaffold-consistency.map1dups") or die "Failed to start sort for map1dups: $!\n";
+open(B, "| sort -k11n -k12n > $outprefix.scaffold-consistency.map2dups") or die "Failed to start sort for map2dups: $!\n";
+my $matchesA = 0;
+my $matchesB = 0;
+open(F, "< $filename") or die "Failed to open '$filename' for reading: $!\n";
+while (<F>) {
+  if (m/^.\s+(\d+):(\d+)-(\d+)\[\s*\d+\].*\s(\d+):\s*(\d+)-\s*(\d+)\).*\s(\d+):\s*(\d+)-\s*(\d+)\)/) {
+    my ($id1, $b1, $e1) = ($1, $2, $3);
+    my ($id2a, $b2a, $e2a) = ($4, $5, $6);
+    my ($id2b, $b2b, $e2b) = ($7, $8, $9);
+
+    if (defined($scafA_to_chr_mismatch{$id2a}) || defined($scafA_to_chr_mismatch{$id2b})) {
+      print A $_;
+      $matchesA++;
+    }
+
+    if (defined($scafB_to_chr_mismatch{$id2a}) || defined($scafB_to_chr_mismatch{$id2b})) {
+      print B $_;
+      $matchesB++;
+    }
+  }
+}
+close(A) or warn "sort for map1dups did not finish cleanly (status $?)\n";    # close waits for the piped sort
+close(B) or warn "sort for map2dups did not finish cleanly (status $?)\n";
+
+print STDERR "matches for map1: $matchesA\n";
+print STDERR "matches for map2: $matchesB\n";
diff --git a/atac-driver/alignOverlap/overlap-annoList.H b/atac-driver/alignOverlap/overlap-annoList.H
new file mode 100644
index 0000000..0ce2bdf
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-annoList.H
@@ -0,0 +1,66 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_ANNOLIST_H
+#define OVERLAP_ANNOLIST_H
+
+#include "overlap.H"
+
+
+// List of the annotation. Used for classifying each piece of the
+// annotation, e.g., U followed by 1 followed by U means that
+// somebody really did map something uniquely, where Y followed by 1
+// is probably just an extension.
+//
+// This only works if assemblyA is the reference!
+//
+class annoList {
+public:
+  char    type;                  // Annotation type character of this piece
+  uint32  iid1, pos1, len1;      // The position on the reference axis
+  uint32  iid2a, pos2a, len2a;   // The position on mapping 1
+  uint32  iid2b, pos2b, len2b;   // The position on mapping 2
+
+  // Fills in this entry.  type_, iid1_, pos1_ and len1_ are stored as
+  // given; match1 and match2 are stored as iid2a and iid2b.  m1 and m2
+  // may be null, in which case the corresponding position and length
+  // are stored as zero; otherwise they are taken from pos2 and len2.
+  void add(char type_,
+           uint32 iid1_, uint32 pos1_, uint32 len1_,
+           uint32 match1, atacMatch *m1,
+           uint32 match2, atacMatch *m2) {
+
+    // Reference axis
+    type = type_;
+    iid1 = iid1_;
+    pos1 = pos1_;
+    len1 = len1_;
+
+    // Mapping 1
+    iid2a = match1;
+    pos2a = (m1) ? m1->pos2 : 0;
+    len2a = (m1) ? m1->len2 : 0;
+
+    // Mapping 2
+    iid2b = match2;
+    pos2b = (m2) ? m2->pos2 : 0;
+    len2b = (m2) ? m2->len2 : 0;
+  }
+};
+
+#endif // OVERLAP_ANNOLIST_H
diff --git a/atac-driver/alignOverlap/overlap-find.C b/atac-driver/alignOverlap/overlap-find.C
new file mode 100644
index 0000000..5fce0a3
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-find.C
@@ -0,0 +1,259 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+// Totals the mapped lengths of runs of '1' entries that end at a 'U' entry, and prints the totals to stderr.
+void
+findIsolatedUnique(annoList *AL, uint32 ALlen) {
+  bool only1 = true;         // false once an entry other than '1' or 'U' is seen; reset at each 'U'
+  uint32 sumA = 0, tA=0;     // grand total, and current-run total, of len2a
+  uint32 sumB = 0, tB=0;     // grand total, and current-run total, of len2b
+  // NOTE(review): the scan starts at index 1, so AL[0] is never examined -- confirm that is intended.
+  for (uint32 i=1; i<ALlen; i++) {
+    if (AL[i].type == 'U') {
+      if (only1 && (!tA || !tB)) {    // run held only 1's, and at least one of the two totals is zero
+        sumA += tA;
+        sumB += tB;
+      }
+      tA = 0;
+      tB = 0;
+      only1 = true;
+    } else if (AL[i].type != '1') {
+      only1 = false;
+    } else {
+      tA += AL[i].len2a;
+      tB += AL[i].len2b;
+    }
+  }
+  // Literals and uint32FMT are separated by spaces: C++11 reads "..."uint32FMT as a literal suffix.
+  fprintf(stderr, "isolated Unique: map1: " uint32FMT " map2: " uint32FMT "\n", sumA, sumB);
+}
+
+
+
+uint32 encodeType(uint32 type) {
+ uint32 t = 9;
+ if (type == 'U') t = 0;
+ else if (type == '1') t = 1;
+ else if (type == 'Y') t = 2;
+ else if (type == 'N') t = 3;
+ else if (type == '?') t = 4;
+ else if (type == '!') t = 5;
+ if (t == 9)
+ fprintf(stderr, "got invalid type; "uint32FMT" -- %c\n", type, (char)type), exit(1);
+ return(t);
+}
+
+// For every '1' entry, measures the run of '1' entries that starts there and
+// shares its iid (iid2a when nonzero, otherwise iid2b), then tallies the run by
+// the types and iids of the entries on either side.  Summaries go to stderr.
+void
+findExtended(annoList *AL, uint32 ALlen) {
+
+  class stat_s {
+  public:
+    uint32 count;     // number of runs added
+    uint32 len;       // total length of those runs
+
+    stat_s() {
+      count = 0;
+      len = 0;
+    };
+
+    stat_s &operator+=(uint32 x) {     // add one run of length x
+      count++;
+      len += x;
+      return(*this);
+    };
+    stat_s &operator+=(stat_s &x) {    // merge another tally into this one
+      count += x.count;
+      len += x.len;
+      return(*this);
+    };
+
+    void print(char const *msg) {
+      fprintf(stderr, "%s: " uint32FMT " len:" uint32FMT "\n", msg, count, len);
+    };
+  };
+
+  stat_s count1[6][6][2][2][2];     // indexed [tbeg][tend][mbeg][mend][moth], runs with a nonzero iid2a
+  stat_s count2[6][6][2][2][2];     // same indexing, runs with a zero iid2a
+
+  // Scan forward from each '1' until the type stops being '1', the iid
+  // changes, or the list ends.
+  //
+  // beg is the entry before the run, end is the entry after it.  A run
+  // that touches either end of the list lacks one of those neighbours
+  // and is skipped.
+  for (uint32 i=0; i<ALlen; i++) {
+    if (AL[i].type == '1') {
+      uint32 beg = i-1;     // wraps when i == 0; not used until after the guard below
+      uint32 gap = i;
+      uint32 end = i;
+      uint32 len = 0;
+
+      if (AL[gap].iid2a) {
+        while ((end < ALlen) && (AL[gap].iid2a == AL[end].iid2a) && (AL[end].type == '1')) {
+          len += AL[end].len1;
+          end++;
+        }
+      } else {
+        while ((end < ALlen) && (AL[gap].iid2b == AL[end].iid2b) && (AL[end].type == '1')) {
+          len += AL[end].len1;
+          end++;
+        }
+      }
+
+      // Without this guard, beg (i-1 at i == 0) or end (== ALlen) would
+      // index outside AL[0..ALlen-1].
+      if ((i == 0) || (end >= ALlen))
+        continue;
+
+      uint32 tbeg = encodeType(AL[beg].type);
+      uint32 tend = encodeType(AL[end].type);
+
+      bool mbeg, mend, moth;     // iid matches: before-vs-run, after-vs-run, before-vs-after on the other mapping
+      if (AL[gap].iid2a) {
+        mbeg = (AL[beg].iid2a == AL[gap].iid2a);
+        mend = (AL[end].iid2a == AL[gap].iid2a);
+        moth = (AL[beg].iid2b == AL[end].iid2b);
+        count1[tbeg][tend][mbeg][mend][moth] += len;
+      } else {
+        mbeg = (AL[beg].iid2b == AL[gap].iid2b);
+        mend = (AL[end].iid2b == AL[gap].iid2b);
+        moth = (AL[beg].iid2a == AL[end].iid2a);
+        count2[tbeg][tend][mbeg][mend][moth] += len;
+      }
+    }
+  }
+
+
+  char label[6] = {'U', '1', 'Y', 'N', '?', '!'};     // same order as the indices returned by encodeType()
+
+#define TYPE_U 0
+#define TYPE_1 1
+#define TYPE_Y 2
+#define TYPE_N 3
+#define TYPE_Q 4
+#define TYPE_E 5
+
+  // If the other iid is the same, then these are all interesting cases
+  // Maybe not.
+#if 0
+  stat_s oth_cnst_1;
+  stat_s oth_cnst_2;
+  stat_s oth_diff_1;
+  stat_s oth_diff_2;
+
+  for (uint32 i=0; i<6; i++)
+    for (uint32 j=0; j<6; j++)
+      for (uint32 k=0; k<2; k++)
+        for (uint32 l=0; l<2; l++) {
+          oth_cnst_1 += count1[i][j][k][l][1];
+          oth_cnst_2 += count2[i][j][k][l][1];
+          oth_diff_1 += count1[i][j][k][l][0];
+          oth_diff_2 += count2[i][j][k][l][0];
+        }
+  fprintf(stderr, "count1 other iid constant: " uint32FMT " len:" uint32FMT "\n", oth_cnst_1.count, oth_cnst_1.len);
+  fprintf(stderr, "count2 other iid constant: " uint32FMT " len:" uint32FMT "\n", oth_cnst_2.count, oth_cnst_2.len);
+  fprintf(stderr, "count1 other iid different: " uint32FMT " len:" uint32FMT "\n", oth_diff_1.count, oth_diff_1.len);
+  fprintf(stderr, "count2 other iid different: " uint32FMT " len:" uint32FMT "\n", oth_diff_2.count, oth_diff_2.len);
+#endif
+
+#if 0
+  for (uint32 tbeg=0; tbeg<6; tbeg++)
+    for (uint32 tend=0; tend<6; tend++)
+      for (uint32 mbeg=0; mbeg<2; mbeg++)
+        for (uint32 mend=0; mend<2; mend++)
+          for (uint32 moth=0; moth<2; moth++) {
+          }
+#endif
+
+
+  // Look for things with an extra piece in the middle -- mbeg and mend
+  //
+  stat_s extraMid1[7];     // [0..5] by type when tbeg == tend, [6] is the total over all types
+  stat_s extraMid2[7];
+
+  for (uint32 tbeg=0; tbeg<6; tbeg++)
+    for (uint32 tend=0; tend<6; tend++)
+      for (uint32 mbeg=0; mbeg<2; mbeg++)
+        for (uint32 mend=0; mend<2; mend++)
+          for (uint32 moth=0; moth<2; moth++) {
+            if (mbeg && mend) {
+              extraMid1[6] += count1[tbeg][tend][mbeg][mend][moth];
+              extraMid2[6] += count2[tbeg][tend][mbeg][mend][moth];
+
+              if (tbeg == tend) {
+                extraMid1[tbeg] += count1[tbeg][tend][mbeg][mend][moth];
+                extraMid2[tbeg] += count2[tbeg][tend][mbeg][mend][moth];
+              }
+            }
+          }
+
+  extraMid1[6].print("extra middle 1");
+  extraMid2[6].print("extra middle 2");
+  for (uint32 i=0; i<6; i++) {
+    char l[64];
+    sprintf(l, "extra middle 1 %c", label[i]);
+    extraMid1[i].print(l);
+    sprintf(l, "extra middle 2 %c", label[i]);
+    extraMid2[i].print(l);
+  }
+
+  // Look for true extensions
+  //
+  stat_s extY1;
+  stat_s extN1;
+  stat_s extY2;
+  stat_s extN2;
+
+  for (uint32 tbeg=0; tbeg<6; tbeg++)
+    for (uint32 tend=0; tend<6; tend++)
+      for (uint32 mbeg=0; mbeg<2; mbeg++)
+        for (uint32 mend=0; mend<2; mend++)
+          for (uint32 moth=0; moth<2; moth++) {
+            if (mbeg && !mend) {
+              if (tbeg == TYPE_Y) {
+                extY1 += count1[tbeg][tend][mbeg][mend][moth];
+              } else {
+                extN1 += count1[tbeg][tend][mbeg][mend][moth];
+              }
+            }
+            if (!mbeg && mend) {
+              if (tend == TYPE_Y) {
+                extY2 += count2[tbeg][tend][mbeg][mend][moth];
+              } else {
+                extN2 += count2[tbeg][tend][mbeg][mend][moth];
+              }
+            }
+          }
+  extY1.print("extension Y 1");
+  extN1.print("extension N 1");
+  extY2.print("extension Y 2");
+  extN2.print("extension N 2");
+
+
+  fprintf(stderr, "----------------------------------------\n");
+  fprintf(stderr, "BEGIN IID SAME, END IID SAME, OTHER IID SAME\n");
+  fprintf(stderr, "----------------------------------------\n");
+  fprintf(stderr, "END IID\n");
+  fprintf(stderr, "----------------------------------------\n");
+  fprintf(stderr, "OTHER IID\n");
+}
diff --git a/atac-driver/alignOverlap/overlap-matchTree.C b/atac-driver/alignOverlap/overlap-matchTree.C
new file mode 100644
index 0000000..3c3f660
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-matchTree.C
@@ -0,0 +1,69 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+matchTree::matchTree(atacMatchList *L, uint32 side) {
+
+  // Builds the kazlib dictionary _tree from every match in L.  The
+  // ordering function is sortMatches1, or sortMatches2 when side == 1.
+  //
+  // The matches are gathered into an array of pointers, the array is
+  // sorted with qsort(), and the pointers are then loaded in that order.
+  //
+  // Per the original author's notes, kazlib was modified to be qsort()
+  // compatible: it passes the comparison function a pointer to the key it
+  // was given.  The keys here are pointers to atacMatch, so the function
+  // receives a pointer to a pointer -- exactly what qsort() passes when
+  // sorting the pointer array -- and one function serves both uses.
+  //
+  // Exits with status 1 if a tree node cannot be allocated.
+
+  atacMatch **matchPointers = new atacMatch * [L->numberOfMatches()];
+  for (uint32 i=0; i<L->numberOfMatches(); i++)
+    matchPointers[i] = L->getMatch(i);
+
+  // Choose a comparison function based on the side we want
+
+  int (*sortMatches)(const void *, const void *) = sortMatches1;
+  if (side == 1)
+    sortMatches = sortMatches2;
+
+  // Sort
+
+  qsort(matchPointers, L->numberOfMatches(), sizeof(atacMatch *), sortMatches);
+
+  // Create the tree (dict_create is given the number of matches), allowing duplicate keys
+
+  _tree = dict_create(L->numberOfMatches(), sortMatches);
+  dict_allow_dupes(_tree);
+  dict_load_begin(&_load, _tree);
+
+  for (uint32 i=0; i<L->numberOfMatches(); i++) {
+    dnode_t *node = (dnode_t *)malloc(sizeof(dnode_t));
+    if (!node)
+      fprintf(stderr, "matchTree: failed to allocate a tree node\n"), exit(1);
+    dnode_init(node, 0L);
+    dict_load_next(&_load, node, matchPointers[i]);
+  }
+
+  dict_load_end(&_load);
+
+  // The pointer array was only needed for sorting and loading
+  delete [] matchPointers;
+}
diff --git a/atac-driver/alignOverlap/overlap-matchTree.H b/atac-driver/alignOverlap/overlap-matchTree.H
new file mode 100644
index 0000000..5450743
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-matchTree.H
@@ -0,0 +1,38 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_MATCHTREE_H
+#define OVERLAP_MATCHTREE_H
+
+#include "overlap.H"
+
+// Constructs a search tree from an atacMatchList
+class matchTree {
+public:
+  matchTree(atacMatchList *L, uint32 side);
+  ~matchTree() {
+    dict_free_nodes(_tree);   // release every node that was loaded into the tree
+    dict_destroy(_tree);      // release the dict_t from dict_create(); in stock kazlib dict_free() only frees nodes
+  };
+
+  dict_t *_tree;
+  dict_load_t _load;
+};
+
+
+#endif // OVERLAP_MATCHTREE_H
diff --git a/atac-driver/alignOverlap/overlap-printAnno.C b/atac-driver/alignOverlap/overlap-printAnno.C
new file mode 100644
index 0000000..64c6753
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-printAnno.C
@@ -0,0 +1,95 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+// Print the "<uid> (<iid>: <sta>-<end>) " column pair for one match
+// of an annotation line.  A null match prints zero placeholders.
+//
+// On axis 1 the coordinates printed are those of the span projected
+// onto the second sequence of the match (running backwards when
+// fwd2 is zero); on any other axis they are the span's own offset
+// range on the first sequence of the match.
+//
+static
+void
+printAnnoMatch(FILE *F, uint32 axis, span_t *span, uint32 len, atacMatch *m) {
+
+  if (m == 0L) {
+    fprintf(F, uint32FMTW(07)" ", uint32ZERO);
+    fprintf(F, "("uint32FMTW(8)": "uint32FMTW(9)"-"uint32FMTW(9)") ", uint32ZERO, uint32ZERO, uint32ZERO);
+    return;
+  }
+
+  fprintf(F, "%s ", m->matchuid);
+
+  uint32 off = span->_beg - m->pos1;
+
+  if (axis != 1) {
+    fprintf(F, "("uint32FMTW(8)": "uint32FMTW(9)"-"uint32FMTW(9)") ", m->iid1, m->pos1 + off, m->pos1 + off + len);
+    return;
+  }
+
+  uint32 sta = 0;
+  uint32 end = 0;
+
+  if (m->fwd2 == 0) {
+    sta = m->pos2 + m->len2 - off;
+    end = m->pos2 + m->len2 - off - len;
+  } else {
+    sta = m->pos2 + off;
+    end = m->pos2 + off + len;
+  }
+
+  fprintf(F, "("uint32FMTW(8)": "uint32FMTW(9)"-"uint32FMTW(9)") ", m->iid2, sta, end);
+}
+
+
+// Write one annotation line for 'span' to F and, when axis is 1,
+// also append the same information to AL (advancing ALlen).
+//
+//   label          -- single character classification of the span
+//   axis           -- 1 or 2; selects which coordinates are printed
+//   match1, m1     -- colored match id and match from the first mapping
+//   match2, m2     -- colored match id and match from the second mapping
+//
+// A caller holding only one match passes it as match1/m1; if its
+// color bits are set it is moved into the match2/m2 slot here.
+//
+void
+printAnno(FILE *F, annoList *AL, uint32 &ALlen,
+          char label,
+          uint32 axis,
+          span_t *span,
+          uint32 match1, atacMatch *m1,
+          uint32 match2, atacMatch *m2) {
+
+  bool  onlyFirstGiven = (match2 == uint32ZERO);
+
+  if ((match1 >> COLORSHIFT) && onlyFirstGiven) {
+    match2 = match1;
+    m2     = m1;
+    match1 = 0;
+    m1     = 0;
+  }
+
+  uint32 len = span->_end - span->_beg;
+
+  // Only the first axis is remembered for later processing.
+  if (axis == 1) {
+    AL[ALlen].add(label, span->_iid, span->_beg, len,
+                  match1 & COLORMASK, m1,
+                  match2 & COLORMASK, m2);
+    ALlen++;
+  }
+
+  fprintf(F, "%c "uint32FMTW(4)":"uint32FMTW(09)"-"uint32FMTW(09)"["uint32FMTW(6)"] ",
+          label,
+          span->_iid, span->_beg, span->_end, len);
+
+  printAnnoMatch(F, axis, span, len, m1);
+  printAnnoMatch(F, axis, span, len, m2);
+
+  fprintf(F, "\n");
+}
diff --git a/atac-driver/alignOverlap/overlap-process.C b/atac-driver/alignOverlap/overlap-process.C
new file mode 100644
index 0000000..bc67f3e
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-process.C
@@ -0,0 +1,105 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+// I really wanted this to be parameterized with two macros, but the
+// preprocessor merges, then replaces:
+// #define INDEXA 1
+// #define INDEXB 2
+// #define NODE node ## INDEXA
+// results in 'nodeINDEXA' not 'node1'
+
+// Visit every span of S in tree order, classify it by the matches it
+// carries, add its length to the corresponding counter in 'stats',
+// and emit one annotation line through printAnno().
+//
+//   no matches                         -> 'U' unmapped
+//   one match                          -> '1' unique to mapping 1 or 2
+//   two matches, one from each mapping -> 'Y' same, 'N' different,
+//                                         '!' different iid2
+//   anything else                      -> '?' inconsistent
+//
+// NAME, INDEX, POS1, POS2 and LEN2 are not defined in this file;
+// presumably the build supplies them so this body can be compiled
+// once per axis -- confirm against the build rules.  ALmax is not
+// used here.
+//
+void
+NAME(FILE           *outfile,
+     spanTree       *S,
+     atacMatchList  *M1,
+     atacMatchList  *M2,
+     overlapStats   &stats,
+     annoList       *AL,
+     uint32         &ALlen,
+     uint32         &ALmax) {
+
+  for (dnode_t *node = dict_first(S->_tree); node; node = dict_next(S->_tree, node)) {
+    span_t  *span     = (span_t *)dnode_getkey(node);
+    uint32   spanLen  = span->_end - span->_beg;
+    uint32   nMatches = span->_matchesLen;
+
+    if (nMatches == 0) {
+      stats.unmapped += spanLen;
+      printAnno(outfile, AL, ALlen, 'U', INDEX, span);
+      continue;
+    }
+
+    if (nMatches == 1) {
+      uint32      match = span->_matches[0];
+      atacMatch  *m     = 0L;
+
+      if ((match >> COLORSHIFT) == 0) {
+        m = M1->getMatch(match & COLORMASK);
+        stats.map1unique += spanLen;
+      } else {
+        m = M2->getMatch(match & COLORMASK);
+        stats.map2unique += spanLen;
+      }
+
+      printAnno(outfile, AL, ALlen, '1', INDEX, span, match, m);
+      continue;
+    }
+
+    // Exactly two matches of different color is the only remaining
+    // case we can compare; everything else is inconsistent.
+    bool  onePerMapping = ((nMatches == 2) &&
+                           ((span->_matches[0] >> COLORSHIFT) != (span->_matches[1] >> COLORSHIFT)));
+
+    if (onePerMapping == false) {
+      stats.inconsistent += spanLen;
+      printAnno(outfile, AL, ALlen, '?', INDEX, span);
+      continue;
+    }
+
+    // Put the uncolored match first.
+    uint32  first  = (span->_matches[0] >> COLORSHIFT) ? 1 : 0;
+    uint32  match1 = span->_matches[first];
+    uint32  match2 = span->_matches[1 - first];
+
+    atacMatch *m1 = M1->getMatch(match1 & COLORMASK);
+    atacMatch *m2 = M2->getMatch(match2 & COLORMASK);
+
+    if (m1->iid2 != m2->iid2) {
+      // Wildly different matches!  Mapped to different scaffolds!
+      stats.wilddiff += spanLen;
+      printAnno(outfile, AL, ALlen, '!', INDEX, span, match1, m1, match2, m2);
+      continue;
+    }
+
+    uint32 off1  = span->_beg - m1->POS1;
+    uint32 off2  = span->_beg - m2->POS1;
+
+    uint32 pos1l = m1->POS2 + off1;
+    uint32 pos2l = m2->POS2 + off2;
+
+    uint32 pos1r = m1->POS2 + m1->LEN2 - off1;
+    uint32 pos2r = m2->POS2 + m2->LEN2 - off2;
+
+    if ((pos1l != pos2l) && (pos1r != pos2r)) {
+      stats.different += spanLen;
+      printAnno(outfile, AL, ALlen, 'N', INDEX, span, match1, m1, match2, m2);
+    } else {
+      stats.same += spanLen;
+      printAnno(outfile, AL, ALlen, 'Y', INDEX, span, match1, m1, match2, m2);
+    }
+  }
+}
+
diff --git a/atac-driver/alignOverlap/overlap-sort.C b/atac-driver/alignOverlap/overlap-sort.C
new file mode 100644
index 0000000..9e0190e
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-sort.C
@@ -0,0 +1,65 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+// Comparator for atacMatch pointers, keyed on the first sequence.
+//
+// Both arguments point at a pointer to an atacMatch.  Ordering is by
+// ascending iid1, then ascending pos1, then descending len1, then
+// descending fwd1; returns -1, 0 or 1.
+//
+int
+sortMatches1(const void *a, const void *b) {
+  const atacMatch *lhs = *((const atacMatch * const *)a);
+  const atacMatch *rhs = *((const atacMatch * const *)b);
+
+  if (lhs->iid1 != rhs->iid1)  return((lhs->iid1 < rhs->iid1) ? -1 : 1);
+  if (lhs->pos1 != rhs->pos1)  return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+  if (lhs->len1 != rhs->len1)  return((lhs->len1 > rhs->len1) ? -1 : 1);
+  if (lhs->fwd1 != rhs->fwd1)  return((lhs->fwd1 > rhs->fwd1) ? -1 : 1);
+
+  return(0);
+}
+
+// Comparator for atacMatch pointers, keyed on the second sequence.
+//
+// Both arguments point at a pointer to an atacMatch.  Ordering is by
+// ascending iid2, then ascending pos2, then descending len2, then
+// descending fwd2; returns -1, 0 or 1.
+//
+int
+sortMatches2(const void *a, const void *b) {
+  const atacMatch *lhs = *((const atacMatch * const *)a);
+  const atacMatch *rhs = *((const atacMatch * const *)b);
+
+  if (lhs->iid2 != rhs->iid2)  return((lhs->iid2 < rhs->iid2) ? -1 : 1);
+  if (lhs->pos2 != rhs->pos2)  return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+  if (lhs->len2 != rhs->len2)  return((lhs->len2 > rhs->len2) ? -1 : 1);
+  if (lhs->fwd2 != rhs->fwd2)  return((lhs->fwd2 > rhs->fwd2) ? -1 : 1);
+
+  return(0);
+}
+
+
+
+// Comparator for spans: ascending _iid, then ascending _beg.
+// Returns -1, 0 or 1.
+//
+// NOTE(review): each argument is dereferenced as a pointer to a
+// span_t pointer.  Confirm that every caller (including the dict
+// that is created with this comparator) passes its keys that way.
+//
+int
+spanCompare(const void *a, const void *b) {
+  const span_t *lhs = *((const span_t * const *)a);
+  const span_t *rhs = *((const span_t * const *)b);
+
+  if (lhs->_iid != rhs->_iid)  return((lhs->_iid < rhs->_iid) ? -1 : 1);
+  if (lhs->_beg != rhs->_beg)  return((lhs->_beg < rhs->_beg) ? -1 : 1);
+
+  return(0);
+}
diff --git a/atac-driver/alignOverlap/overlap-span.H b/atac-driver/alignOverlap/overlap-span.H
new file mode 100644
index 0000000..de241b6
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-span.H
@@ -0,0 +1,94 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_SPAN_H
+#define OVERLAP_SPAN_H
+
+#include "overlap.H"
+
+#define COLORSHIFT 24
+#define COLORMASK 0x00ffffff
+
+// An interval from _beg to _end on sequence _iid, together with the
+// list of matches that cover it.
+//
+// Each entry of _matches packs a match id in the low COLORSHIFT bits
+// and a color (which mapping the match came from) in the bits above.
+//
+class span_t {
+public:
+  uint32  _iid;
+  uint32  _beg;
+  uint32  _end;
+  uint32  _matchesLen;   // entries of _matches in use
+  uint32  _matchesMax;   // entries of _matches allocated
+  uint32 *_matches;
+
+  span_t(uint32 iid, uint32 beg, uint32 end) {
+    _iid        = iid;
+    _beg        = beg;
+    _end        = end;
+    _matchesLen = 0;
+    _matchesMax = 0;
+    _matches    = 0L;
+  };
+
+  ~span_t() {
+    delete [] _matches;
+  };
+
+  // The top X bits of the _matches is for storing the color.  This
+  // does cut down the number of matches we can store.  Human-Human
+  // is ~1 million matches.
+
+  // Append match 'matchiid' with the given color, growing the array
+  // as needed.  Exits if matchiid does not fit below COLORSHIFT bits.
+  //
+  void addMatch(uint32 matchiid, uint32 color) {
+    if (_matchesLen >= _matchesMax) {
+      if (_matchesMax == 0)
+        _matchesMax = 2;
+      _matchesMax *= 2;
+
+      uint32 *X = new uint32 [_matchesMax];
+
+      // _matches is null until the first allocation; memcpy() must
+      // not be handed a null pointer, even for zero bytes.
+      if (_matchesLen > 0)
+        memcpy(X, _matches, sizeof(uint32) * _matchesLen);
+
+      delete [] _matches;
+      _matches = X;
+    }
+
+    if (matchiid >> COLORSHIFT)
+      fprintf(stderr, "ERROR!  span_t::addMatch()-- match id too big, decrease the color space.\n"), exit(1);
+
+    _matches[_matchesLen++] = (color << COLORSHIFT) | (matchiid);
+  };
+
+  // Split this span at position, return two new spans
+  //
+  // l covers _beg..position and r covers position.._end; both get
+  // their own copy of this span's matches.  This span is unchanged
+  // and the caller owns l and r.  Exits if position is outside the
+  // span.
+  //
+  void split(uint32 position, span_t* &l, span_t* &r) {
+
+    if ((position < _beg) || (_end < position)) {
+      fprintf(stderr, "span_t::split()-- _beg="uint32FMT" _end="uint32FMT" position="uint32FMT"?\n", _beg, _end, position);
+      exit(1);
+    }
+
+    l = new span_t(_iid, _beg, position);
+    r = new span_t(_iid, position, _end);
+
+    copyMatchesTo(l);
+    copyMatchesTo(r);
+  };
+
+private:
+  // Give 'dst' (a freshly constructed span) a private copy of our
+  // match list with the same capacity.
+  //
+  void copyMatchesTo(span_t *dst) const {
+    dst->_matchesLen = _matchesLen;
+    dst->_matchesMax = _matchesMax;
+    dst->_matches    = new uint32 [_matchesMax];
+
+    if (_matchesLen > 0)
+      memcpy(dst->_matches, _matches, sizeof(uint32) * _matchesLen);
+  };
+};
+
+#endif // OVERLAP_SPAN_H
diff --git a/atac-driver/alignOverlap/overlap-spanTree.H b/atac-driver/alignOverlap/overlap-spanTree.H
new file mode 100644
index 0000000..01c18b2
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-spanTree.H
@@ -0,0 +1,125 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_SPANTREE_H
+#define OVERLAP_SPANTREE_H
+
+#include "overlap.H"
+
+// A dict of span_t keys, ordered by spanCompare.  Spans are added
+// whole with addNewSpan() and then cut into smaller spans by
+// addMatch(), so that each resulting span records the matches that
+// cover it.
+//
+// NOTE(review): _load is declared but not used by any method below.
+//
+class spanTree {
+public:
+ // Create an empty tree.
+ spanTree() {
+ _tree = dict_create(DICTCOUNT_T_MAX, spanCompare);
+ };
+ // Release the tree through the dict library.
+ // NOTE(review): the span_t keys allocated with new in this class are
+ // not deleted here; confirm whether the dict library releases keys.
+ ~spanTree() {
+ dict_free_nodes(_tree);
+ dict_free(_tree);
+ };
+
+ // Insert one span covering 0..len of sequence iid.
+ void addNewSpan(uint32 iid, uint32 len) {
+ span_t *span = new span_t(iid, 0, len);
+ dict_alloc_insert(_tree, span, 0L);
+ };
+
+ // Number of spans currently in the tree.
+ uint32 size(void) {
+ return((uint32)dict_count(_tree));
+ };
+
+ // Record 'match' (tagged with 'color') on every span it covers,
+ // splitting the first and last covered spans so that span
+ // boundaries line up with the ends of the match.
+ //
+ // side == 0 uses the match's iid1/pos1/len1; any other value uses
+ // iid2/pos2/len2.
+ //
+ void addMatch(atacMatch *match, uint32 side, uint32 color) {
+
+ // Query the tree for the first match before this position.
+ // We're guaranteed to find one before, since the tree was
+ // initialized with a span for the whole sequence.
+ //
+ span_t *span = 0L;
+ uint32 beg = 0;
+ uint32 end = 0;
+
+ // Build a temporary span that serves only as the search key.
+ if (side == 0) {
+ span = new span_t(match->iid1, match->pos1, match->pos1 + match->len1);
+ beg = match->pos1;
+ end = match->pos1 + match->len1;
+ } else {
+ span = new span_t(match->iid2, match->pos2, match->pos2 + match->len2);
+ beg = match->pos2;
+ end = match->pos2 + match->len2;
+ }
+
+ // NOTE(review): the node returned by dict_upper_bound() is used
+ // without a null check; this relies on the guarantee stated above.
+ dnode_t *node = dict_upper_bound(_tree, span);
+ delete span;
+ span = (span_t *)dnode_getkey(node);
+
+ // We need to split the span pointed to by node, iterate through
+ // all the spans, and split the last one.
+
+ if (span->_beg != beg) {
+ span_t *l = 0L;
+ span_t *r = 0L;
+ span->split(beg, l, r);
+
+ // Kill this node, insert the new ones
+
+ dict_delete(_tree, node);
+ dnode_destroy(node);
+ dict_alloc_insert(_tree, l, 0L);
+ dict_alloc_insert(_tree, r, 0L);
+
+ // The right half starts at beg, so it is the first span the
+ // match covers.
+ delete span;
+ span = r;
+
+ // Argh! Now find the node we just inserted...
+
+ node = dict_lookup(_tree, r);
+ }
+
+ // Until we hit the last span, add the match to the span
+
+ // NOTE(review): dict_next() is not checked for null, and nothing
+ // here checks that the next span has the same _iid; both rely on
+ // the match lying inside spans already present for its sequence.
+ while (span->_end < end) {
+ span->addMatch(match->matchiid, color);
+ node = dict_next(_tree, node);
+ span = (span_t *)dnode_getkey(node);
+ }
+
+ // We're at the last span, so split it like the beginning
+
+ if (span->_end != end) {
+ span_t *l = 0L;
+ span_t *r = 0L;
+ span->split(end, l, r);
+
+ // Kill this node, insert the new ones
+
+ dict_delete(_tree, node);
+ dnode_destroy(node);
+ dict_alloc_insert(_tree, l, 0L);
+ dict_alloc_insert(_tree, r, 0L);
+
+ // The left half ends at end, so it is the last span the match
+ // covers.
+ delete span;
+ span = l;
+ }
+
+ // Finally, add the match to the last span
+
+ span->addMatch(match->matchiid, color);
+ };
+
+ dict_t *_tree;
+ dict_load_t _load;
+};
+
+#endif // OVERLAP_SPANTREE_H
diff --git a/atac-driver/alignOverlap/overlap-stats.H b/atac-driver/alignOverlap/overlap-stats.H
new file mode 100644
index 0000000..5926d87
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap-stats.H
@@ -0,0 +1,96 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_STATS_H
+#define OVERLAP_STATS_H
+
+#include "overlap.H"
+
+// Statistics and Histograms
+//
+// Index 1 is the assembly, 2 is the mapping. Stats count the
+// number of bases covered, histograms are of the block sizes.
+//
+// A histogram of lengths with 'max' buckets, plus a running sum of
+// every length added.  Lengths that do not fit in a bucket of their
+// own are counted in bucket 0.
+//
+class histogram {
+public:
+  histogram(uint32 max=65536) {
+    histMax = max;
+    hist    = new uint32 [histMax];
+    sum     = 0;
+
+    for (uint32 i=0; i<histMax; i++)
+      hist[i] = 0;
+  };
+  ~histogram() {
+    delete [] hist;
+  };
+
+  // Add 'length' to the sum and count it in its bucket.  Valid
+  // buckets are 0 .. histMax-1, so a length of histMax or more goes
+  // to the overflow bucket 0.
+  //
+  void update(uint32 length) {
+    sum += length;
+    hist[ (length >= histMax) ? 0 : length ]++;
+  };
+
+  void operator+=(uint32 length) {
+    update(length);
+  };
+
+  // Sum of all lengths added so far.
+  uint32 getSum(void) {
+    return(sum);
+  };
+
+  // Write one "<bucket> <count>" line per bucket to the file
+  // "<prefix>.<label>".  Exits if the file cannot be opened.
+  //
+  void writeHistogram(char const *prefix, char const *label) {
+    char   filename[1024];
+
+    snprintf(filename, sizeof(filename), "%s.%s", prefix, label);
+
+    errno = 0;
+    FILE *out = fopen(filename, "w");
+    if (out == 0L)
+      fprintf(stderr, "Failed to open '%s': %s\n", filename, strerror(errno)), exit(1);
+
+    for (uint32 i=0; i<histMax; i++)
+      fprintf(out, uint32FMT" "uint32FMT"\n", i, hist[i]);
+
+    fclose(out);
+  };
+
+private:
+  uint32  *hist;
+  uint32   histMax;
+  uint32   sum;
+};
+
+// One histogram per span classification.
+//
+class overlapStats {
+public:
+  histogram  unmapped;
+  histogram  map1unique;
+  histogram  map2unique;
+  histogram  different;
+  histogram  wilddiff;
+  histogram  same;
+  histogram  inconsistent;
+
+  // Write every histogram to "<prefix>.<name>", where <name> is the
+  // name of the member.
+  //
+  void writeHistogram(char const *prefix) {
+    histogram   *hists[7]  = { &unmapped, &map1unique, &map2unique, &different, &wilddiff, &same, &inconsistent };
+    char const  *labels[7] = { "unmapped", "map1unique", "map2unique", "different", "wilddiff", "same", "inconsistent" };
+
+    for (uint32 i=0; i<7; i++)
+      hists[i]->writeHistogram(prefix, labels[i]);
+  };
+};
+
+
+#endif // OVERLAP_STATS_H
diff --git a/atac-driver/alignOverlap/overlap.C b/atac-driver/alignOverlap/overlap.C
new file mode 100644
index 0000000..6b5e244
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap.C
@@ -0,0 +1,181 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "overlap.H"
+
+
+// Compare two atac mappings.
+//
+//   usage: <program> <matches-1> <matches-2> <out-prefix>
+//
+// Writes <out-prefix>.map1annotation and <out-prefix>.map2annotation,
+// one histogram file per classification and axis, a gnuplot script
+// <out-prefix>.histogram.gnuplot, and then runs gnuplot on it.  A
+// summary of the bases in each classification goes to stderr.
+//
+// Exits with status 1 on a usage error or if an output file cannot
+// be opened.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc != 4) {
+    fprintf(stderr, "usage: %s <matches-1> <matches-2> <out-prefix>\n", argv[0]);
+    exit(1);
+  }
+
+  atacFile       *AF1 = new atacFile(argv[1]);
+  atacFile       *AF2 = new atacFile(argv[2]);
+
+  atacMatchList  *M1  = AF1->matches();
+  atacMatchList  *M2  = AF2->matches();
+
+  char           *OP  = argv[3];
+
+
+  //  We want to annotate the two assemblies with:
+  //    a) mapped by both, the same
+  //    b) mapped by both, differently
+  //    c) mapped by the first, unmapped by the second
+  //    d) mapped by the second, unmapped by the first
+  //    e) unmapped by both
+  //
+  //  If unmapped, we could further annotate with the reason it was
+  //  unmapped -- not found, or found multiple times.
+  //
+  //  Our annotation datastructure is a tree of spans.  Each span is a
+  //  sequence, and an interval on that sequence.  We assume that the
+  //  tree contains the spans for the whole sequence, that is, that we
+  //  never need to increase a span, just split.
+  //
+  spanTree  *S1 = new spanTree();
+  spanTree  *S2 = new spanTree();
+
+  //  Initialize the tree of spans by inserting a single span for each
+  //  sequence in the file.
+  //
+  for (uint32 i=0; i<AF1->fastaA()->getNumberOfSequences(); i++)
+    S1->addNewSpan(i, AF1->fastaA()->getSequenceLength(i));
+  for (uint32 i=0; i<AF1->fastaB()->getNumberOfSequences(); i++)
+    S2->addNewSpan(i, AF1->fastaB()->getSequenceLength(i));
+
+  //  Add every match to the spanTrees.
+
+  for (uint32 i=0; i<M1->numberOfMatches(); i++) {
+    S1->addMatch(M1->getMatch(i), 0, 0);
+    S2->addMatch(M1->getMatch(i), 1, 0);
+  }
+  for (uint32 i=0; i<M2->numberOfMatches(); i++) {
+    S1->addMatch(M2->getMatch(i), 0, 1);
+    S2->addMatch(M2->getMatch(i), 1, 1);
+  }
+
+  //  Dump each spanTree: For each span, we need to check that
+  //    it has matches?
+  //    only one match, or only matches from one mapping?
+  //    matches from both mappings?  need to check that
+  //    the span in the other tree also has the same matches
+  //
+  //  Doesn't handle weird stuff like this span (on sequence 1)
+  //  mapping onto seq2 correctly, but the span in seq2 having an
+  //  extra match to somewhere else in seq1.
+  //
+  //  we want to find the single span in the other spanTree that
+  //  corresponds to this span.  once we do that, we can verify that
+  //  all the matches are the same.
+  //
+  //  because we are gapless matches, we can, for each match,
+  //  compute the exact location this span should occur on the other
+  //  sequence.  then, do a lookup() to get that span, or just
+  //  verify that everybody is the same location.
+
+  char   outname[1024];
+  FILE  *outfile;
+
+  //  Annotate the first axis.  The annotation list gets one slot per
+  //  span in S1.
+
+  overlapStats  statsA;
+  uint32        ALmax = (uint32)dict_count(S1->_tree);
+  uint32        ALlen = 0;
+  annoList     *AL    = new annoList [ ALmax ];
+
+  snprintf(outname, sizeof(outname), "%s.map1annotation", OP);
+  errno = 0;
+  outfile = fopen(outname, "w");
+  if (outfile == 0L)
+    fprintf(stderr, "Failed to open '%s': %s\n", outname, strerror(errno)), exit(1);
+  process1(outfile, S1, M1, M2, statsA, AL, ALlen, ALmax);
+  fclose(outfile);
+
+  //  Annotate the second axis.  Size its list from S2, the tree that
+  //  is actually walked.
+
+  overlapStats  statsB;
+  uint32        BLmax = (uint32)dict_count(S2->_tree);
+  uint32        BLlen = 0;
+  annoList     *BL    = new annoList [ BLmax ];
+
+  snprintf(outname, sizeof(outname), "%s.map2annotation", OP);
+  errno = 0;
+  outfile = fopen(outname, "w");
+  if (outfile == 0L)
+    fprintf(stderr, "Failed to open '%s': %s\n", outname, strerror(errno)), exit(1);
+  process2(outfile, S2, M1, M2, statsB, BL, BLlen, BLmax);
+  fclose(outfile);
+
+  fprintf(stderr, "unmapped: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.unmapped.getSum(), statsB.unmapped.getSum());
+  fprintf(stderr, "unique mapping 1: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.map1unique.getSum(), statsB.map1unique.getSum());
+  fprintf(stderr, "unique mapping 2: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.map2unique.getSum(), statsB.map2unique.getSum());
+  fprintf(stderr, "different: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.different.getSum(), statsB.different.getSum());
+  fprintf(stderr, "wild diff: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.wilddiff.getSum(), statsB.wilddiff.getSum());
+  fprintf(stderr, "same: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.same.getSum(), statsB.same.getSum());
+  fprintf(stderr, "inconsistent: A:"uint32FMTW(10)" B:"uint32FMTW(10)"\n", statsA.inconsistent.getSum(), statsB.inconsistent.getSum());
+
+  //  Dump the histograms for each of the labelings
+  //
+  snprintf(outname, sizeof(outname), "%s.asm1histogram", OP);
+  statsA.writeHistogram(outname);
+  snprintf(outname, sizeof(outname), "%s.asm2histogram", OP);
+  statsB.writeHistogram(outname);
+
+  //  Draw some pretty pictures
+  //
+  snprintf(outname, sizeof(outname), "%s.histogram.gnuplot", OP);
+  errno = 0;
+  outfile = fopen(outname, "w");
+  if (outfile == 0L)
+    fprintf(stderr, "failed to open '%s': %s\n", outname, strerror(errno)), exit(1);
+  fprintf(outfile, "set terminal postscript color\n");
+  fprintf(outfile, "set output \"%s.unmapped.histogram.ps\"\n", OP);
+  fprintf(outfile, "set ylabel \"number of regions\"\n");
+  fprintf(outfile, "set xlabel \"length of region\"\n");
+  fprintf(outfile, "plot [0:10000][0:400] \\\n");
+  fprintf(outfile, " \"%s.asm1histogram.unmapped\" using 2 title \"assembly 1 unmapped\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.unmapped\" using 2 title \"assembly 2 unmapped\" with lines\n", OP);
+  fprintf(outfile, "set output \"%s.same.histogram.ps\"\n", OP);
+  fprintf(outfile, "plot [0:20000][0:2000] \\\n");
+  fprintf(outfile, " \"%s.asm1histogram.same\" using 2 title \"assembly 1 same\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.same\" using 2 title \"assembly 2 same\" with lines\n", OP);
+  fprintf(outfile, "set output \"%s.histogram.ps\"\n", OP);
+  fprintf(outfile, "plot [0:2000][0:100] \\\n");
+  fprintf(outfile, " \"%s.asm1histogram.different\" using 2 title \"assembly 1 different\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.different\" using 2 title \"assembly 2 different\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm1histogram.wilddiff\" using 2 title \"assembly 1 wildly diff\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.wilddiff\" using 2 title \"assembly 2 wildly diff\" with lines\n", OP);
+  fprintf(outfile, "set output \"%s.unique.histogram.ps\"\n", OP);
+  fprintf(outfile, "plot [0:2000][0:100] \\\n");
+  fprintf(outfile, " \"%s.asm1histogram.map1unique\" using 2 title \"map 1, assembly 1 unique\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm1histogram.map2unique\" using 2 title \"map 2, assembly 1 unique\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.map1unique\" using 2 title \"map 1, assembly 2 unique\" with lines, \\\n", OP);
+  fprintf(outfile, " \"%s.asm2histogram.map2unique\" using 2 title \"map 2, assembly 2 unique\" with lines\n", OP);
+  fclose(outfile);
+
+  snprintf(outname, sizeof(outname), "gnuplot < %s.histogram.gnuplot", OP);
+  if (system(outname))
+    fprintf(stderr, "Failed to '%s'\n", outname);
+
+#if 0
+  findIsolatedUnique(AL, ALlen);
+  findExtended(AL, ALlen);
+#endif
+
+  //  Deleting the spanTrees takes a long time, so we don't bother with any cleanup.
+  return(0);
+}
diff --git a/atac-driver/alignOverlap/overlap.H b/atac-driver/alignOverlap/overlap.H
new file mode 100644
index 0000000..c378eb1
--- /dev/null
+++ b/atac-driver/alignOverlap/overlap.H
@@ -0,0 +1,82 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef OVERLAP_H
+#define OVERLAP_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "bio++.H"
+#include "util++.H"
+#include "atac.H"
+
+// Kaz Kylheku <kaz at ashi.footprints.net> library.
+#include "kazlib/dict.h"
+#include "kazlib/except.h"
+#include "kazlib/hash.h"
+#include "kazlib/list.h"
+#include "kazlib/sfx.h"
+
+int sortMatches1(const void *a, const void *b);
+int sortMatches2(const void *a, const void *b);
+int spanCompare(const void *a, const void *b);
+
+#include "overlap-span.H"
+#include "overlap-matchTree.H"
+#include "overlap-spanTree.H"
+#include "overlap-annoList.H"
+#include "overlap-stats.H"
+
+// Annotate the spans of S against the two match lists M1 and M2,
+// writing to outfile and accumulating into stats.  main() calls this
+// one with the span tree of the first axis.
+//
+// NOTE(review): no function named process1 is defined literally in
+// the sources shown; presumably it is produced from the NAME()
+// template in overlap-process.C -- confirm against the build rules.
+void
+process1(FILE *outfile,
+ spanTree *S,
+ atacMatchList *M1,
+ atacMatchList *M2,
+ overlapStats &stats,
+ annoList *AL,
+ uint32 &ALlen,
+ uint32 &ALmax);
+
+// Same interface as process1(); main() calls this one with the span
+// tree of the second axis.
+void
+process2(FILE *outfile,
+ spanTree *S,
+ atacMatchList *M1,
+ atacMatchList *M2,
+ overlapStats &stats,
+ annoList *AL,
+ uint32 &ALlen,
+ uint32 &ALmax);
+
+// Print one annotation line for span to F.  The match arguments
+// default to "no match", so callers may pass zero, one or two
+// (colored match id, match) pairs.
+void
+printAnno(FILE *F, annoList *AL, uint32 &ALlen,
+ char label,
+ uint32 axis,
+ span_t *span,
+ uint32 match1=uint32ZERO, atacMatch *m1=0L,
+ uint32 match2=uint32ZERO, atacMatch *m2=0L);
+
+// Post-processing passes over an annotation list.  The only calls to
+// these in overlap.C are inside an #if 0 block.
+void
+findIsolatedUnique(annoList *AL, uint32 ALlen);
+
+void
+findExtended(annoList *AL, uint32 ALlen);
+
+
+#endif // OVERLAP_H
diff --git a/atac-driver/alignOverlap/summarizeDisagree.pl b/atac-driver/alignOverlap/summarizeDisagree.pl
new file mode 100644
index 0000000..c66d603
--- /dev/null
+++ b/atac-driver/alignOverlap/summarizeDisagree.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+use strict;
+
+#  Computes the number and cumulative length of regions where two atac
+#  mappings disagree.
+#
+#  Reports when the region maps to:
+#    small -- same sequence, close together
+#    large -- same scaffold, not close together
+#    major -- different scaffold
+#
+#  Automagically generates a plot
+#
+#  usage: summarizeDisagree.pl some.atac outprefix
+#
+#  Reads annotation lines labelled 'N' or '!' from the input, writes
+#  outprefix.dat and outprefix.gnuplot, and runs gnuplot on the latter.
+#
+
+if (scalar(@ARGV) != 2) {
+    print STDERR "usage: $0 some.atac outprefix\n";
+    exit(1);
+}
+
+my $filename   = shift @ARGV;
+my $outprefix  = shift @ARGV;
+my $smallLimit = 400;
+
+my @smallH;        #  number of regions, indexed by distance
+my @smallHLen;     #  bp in those regions, indexed by distance
+my $large    = 0;
+my $largeLen = 0;
+my $major    = 0;
+my $majorLen = 0;
+
+open(F, "< $filename") or die "Failed to open $filename.\n";
+while (<F>) {
+    #  label iid:beg-end[len] ... (iid2a: beg-end) ... (iid2b: beg-end)
+    if (m/^[N!]\s+(\d+):(\d+)-(\d+)\[\s*\d+\].*\s(\d+):\s*(\d+)-\s*(\d+)\).*\s(\d+):\s*(\d+)-\s*(\d+)\)/) {
+        my ($id1,  $b1,  $e1)  = ($1, $2, $3);
+        my ($id2a, $b2a, $e2a) = ($4, $5, $6);
+        my ($id2b, $b2b, $e2b) = ($7, $8, $9);
+
+        my $regionLen = $e1 - $b1;
+
+        if ($id2a != $id2b) {
+            $major++;
+            $majorLen += $regionLen;
+            next;
+        }
+
+        #  Distance between the two mapped begin positions.
+        my $diff = ($b2a > $b2b) ? ($b2a - $b2b) : ($b2b - $b2a);
+
+        if ($diff < $smallLimit) {
+            $smallH[$diff]++;
+            $smallHLen[$diff] += $regionLen;
+        } else {
+            $large++;
+            $largeLen += $regionLen;
+        }
+    }
+}
+close(F);
+
+#  output is
+#
+#  distance away
+#  number of regions
+#  number of bp in those regions
+#  cumulative number of regions
+#  cumulative number of bp in those regions
+#
+#  The cumulative bp column is divided by 10 so it fits the plot; the
+#  summary on STDERR reports the unscaled total.
+#
+#  NOTE(review): the loop starts at 1, so regions at distance 0 are
+#  neither written nor summed -- confirm that is intended.
+#
+
+my $sumH     = 0;
+my $sumHLen  = 0;    #  scaled by 1/10, for the plot
+my $smallLen = 0;    #  unscaled, for the report
+
+open(F, "> $outprefix.dat") or die "Failed to open $outprefix.dat for writing.\n";
+for (my $i=1; $i<$smallLimit; $i++) {
+    next if (!defined($smallH[$i]));
+
+    $sumH     += $smallH[$i];
+    $sumHLen  += $smallHLen[$i] / 10;
+    $smallLen += $smallHLen[$i];
+
+    print F "$i $smallH[$i] $smallHLen[$i] $sumH $sumHLen\n";
+}
+close(F);
+
+print STDERR "at most $smallLimit bp away: $sumH regions $smallLen bp\n";
+print STDERR "at least $smallLimit bp away: $large regions $largeLen bp\n";
+print STDERR "different sequence: $major regions $majorLen bp\n";
+
+open(F, "> $outprefix.gnuplot") or die "Failed to open $outprefix.gnuplot for writing.\n";
+print F "set terminal postscript color\n";
+print F "set output \"$outprefix.ps\"\n";
+print F "set xlabel \"bp Difference in Match Location\"\n";
+print F "set ylabel \"\"\n";
+print F "plot [][0:300000] \"$outprefix.dat\" using 2 with lines title \"Number of Regions\", \\\n";
+print F " \"$outprefix.dat\" using 3 with lines title \"bp in Regions\", \\\n";
+print F " \"$outprefix.dat\" using 4 with lines title \"Cumulative Number of Regions\", \\\n";
+print F " \"$outprefix.dat\" using 5 with lines title \"Cumulative bp in Regions / 10\"\n";
+print F "plot [0:100][0:300000] \"$outprefix.dat\" using 2 with lines title \"Number of Regions\", \\\n";
+print F " \"$outprefix.dat\" using 3 with lines title \"bp in Regions\", \\\n";
+print F " \"$outprefix.dat\" using 4 with lines title \"Cumulative Number of Regions\", \\\n";
+print F " \"$outprefix.dat\" using 5 with lines title \"Cumulative bp in Regions / 10\"\n";
+close(F);
+
+system("gnuplot < $outprefix.gnuplot");
+
+
diff --git a/atac-driver/atac.pl b/atac-driver/atac.pl
new file mode 100755
index 0000000..8368b01
--- /dev/null
+++ b/atac-driver/atac.pl
@@ -0,0 +1,912 @@
+#!/usr/bin/env perl
+#
+# This file is part of A2Amapper.
+# Copyright (c) 2005-2009 J. Craig Venter Institute
+# Author: Brian Walenz
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received (LICENSE.txt) a copy of the GNU General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+use FindBin;
+
+# Assembly IDs and sequence files.  The IDs come from the command line
+# (parseArgs); a missing sequence file is looked up by findSources.
+my $id1 = undef;
+my $seq1 = undef;
+my $id2 = undef;
+my $seq2 = undef;
+
+my $ATACdir = undef;
+my $GENOMEdir = "default"; # Location of genome assemblies
+my $MERYLdir = "default"; # Location of genome mercount databases
+
+# Executables are looked for next to this script, libraries in ../lib.
+my $BINdir = "$FindBin::Bin";
+my $LIBdir = "$FindBin::Bin/../lib";
+
+my $mersize = 20; # the mer size
+my $minfill = 20; # the minimum fill for a reported match.
+my $merlimit = 1; # unique mers only
+my $maxgap = 0; # the maximum substitution gap
+
+# annotates the resulting atac file with parameters
+# for cross species, also sets match extender options
+my $crossSpecies = 0;
+
+my $matchExtenderOpts = "";
+
+my $filtername = "$LIBdir/filter-heavychains.so";
+my $filteropts = "-S 100 -J 100000";
+
+my $numSegments = 1; # More than one not supported in seatac; search for -use too
+my $numThreads = 4;
+
+my $merylThreads = 2;
+
+# When set (by -merylonly), buildMask exits the script once the mask is built.
+my $merylOnly = 0;
+
+# Check that we have everything we need to run
+#
+my $leaff = "$BINdir/leaff";
+my $meryl = "$BINdir/meryl";
+my $existDB = "$BINdir/existDB";
+my $seatac = "$BINdir/seatac";
+my $chainer = "$BINdir/AtacDriver.py";
+my $correctgaps = "$BINdir/correctGaps";
+my $statsgenerator = "$BINdir/statsGenerator";
+my $makeplot = "perl $BINdir/makeplot.pl";
+
+# NOTE(review): these checks run before parseArgs(), so the default
+# $filtername is tested here even when -filtername supplies another path.
+die "Can't run $leaff\n" if (! -x $leaff);
+die "Can't run $meryl\n" if (! -x $meryl);
+die "Can't run $existDB\n" if (! -x $existDB);
+die "Can't run $seatac\n" if (! -x $seatac);
+die "Can't find $chainer\n" if (! -e $chainer);
+die "Can't find $filtername\n" if (! -e $filtername);
+die "Can't run $correctgaps\n" if (! -x $correctgaps);
+die "Can't run $statsgenerator\n" if (! -x $statsgenerator);
+
+
+# Main begins here!
+#
+# We used to use a long descriptive name for the matches, which
+# encoded some parameters, but since we never really change those
+# parameters, we stop encoding.
+#
+# It used to be "${id1}vs${id2}.k$mersize.u$merlimit.f$minfill.g$maxgap"
+
+parseArgs();
+findSources();
+
+# Base name of every output file; the subroutines below read it directly.
+my $matches = "${id1}vs${id2}";
+
+# If the final atac file already exists, nothing below is run.
+if (! -e "$ATACdir/$matches.atac") {
+ my $mercount1 = countMers($id1, $mersize, $merlimit);
+ my $mercount2 = countMers($id2, $mersize, $merlimit);
+
+ buildMask($mercount1, $mercount2);
+
+ my @segmentIDs = findHits();
+
+ extendMatches(@segmentIDs);
+
+ makeChains();
+ closeGaps();
+ makeClumps();
+ generateStatistics();
+ rewriteUIDs("$ATACdir/$matches.atac");
+
+ print STDERR "\n";
+ print STDERR "Finished! Output is:\n";
+ print STDERR " matches and runs -- $ATACdir/$matches.atac\n";
+ print STDERR " clumps -- $ATACdir/$matches.*clump*.atac\n";
+}
+
+# Subroutines below!
+
+
+# Print the command line help to STDERR and exit with status 1.
+# Never returns.
+#
+sub usage {
+    print STDERR "usage: $0 -dir AvsB -id1 A -seq1 A.fasta -id2 B -seq2 B.fasta -meryldir M [opts]\n";
+    print STDERR "\n";
+    print STDERR "ATAC will compute and place results in the run-directory.\n";
+    print STDERR "The meryl directory is used to store assembly-specific\n";
+    print STDERR "intermediate files. Internally, atac uses an ID to refer to\n";
+    print STDERR "an assembly; if the same ID/seq pair is used across multiple\n";
+    print STDERR "runs, the assembly-specific intermediate files can be reused.\n";
+    print STDERR "\n";
+    print STDERR "A * indicates a required argument.\n";
+    print STDERR "\n";
+    print STDERR "* -dir run-directory -- path to the RESULTS directory\n";
+    print STDERR "* -meryldir path -- path to the MERYL directory\n";
+    print STDERR " -genomedir path -- path to the GENOMES directory\n";
+    print STDERR "\n";
+    print STDERR "* -id1 id1 -- ID of the A assembly\n";
+    print STDERR "* -seq1 seq1.fasta -- sequence of the A assembly\n";
+    print STDERR "* -id2 id2 -- ID of the B assembly\n";
+    print STDERR "* -seq2 seq2.fasta -- sequence of the B assembly\n";
+    print STDERR "\n";
+    print STDERR "NOTE: A hash table will be built for id1. For space and\n";
+    print STDERR " performance, this should usually be the smaller assembly.\n";
+    print STDERR "\n";
+    print STDERR "NOTE: It is generally assumed that id1 is the REFERENCE assembly.\n";
+    print STDERR "\n";
+    print STDERR " -numsegments s -- number of segments to do the search in\n";
+    print STDERR " (doubling segments halves memory usage)\n";
+    print STDERR " -numsegments NOT SUPPORTED; DO NOT USE\n";
+    print STDERR "\n";
+    print STDERR " -numthreads t -- number of threads to use per search\n";
+    print STDERR " (slight increase in memory usage)\n";
+    print STDERR "\n";
+    print STDERR " -merylonly -- only run the meryl components\n";
+    print STDERR " -merylthreads t -- number of threads to use for meryl\n";
+    print STDERR "\n";
+    print STDERR " -samespecies -- use magic values for same species\n";
+    print STDERR " -crossspecies -- use guesses for different species\n";
+    print STDERR "\n";
+    print STDERR " -segmentid x -- only run segment with id x\n";
+    print STDERR " (don't use)\n";
+    exit(1);
+}
+
+# Parse @ARGV into the file-level option variables, make the directory
+# and sequence paths absolute, and create the run directories.
+#
+# Dies on an unknown option, on -numsegments other than 1, and when a
+# directory is undefined (for example when -dir was not given).  Calls
+# usage(), which exits, when either assembly ID is missing.
+#
+sub parseArgs {
+
+    while (scalar(@ARGV) > 0) {
+        my $arg = shift @ARGV;
+
+        if ($arg eq "-dir") {
+            $ATACdir = shift @ARGV;
+        } elsif ($arg eq "-id1") {
+            $id1 = shift @ARGV;
+        } elsif ($arg eq "-seq1") {
+            $seq1 = shift @ARGV;
+        } elsif ($arg eq "-id2") {
+            $id2 = shift @ARGV;
+        } elsif ($arg eq "-seq2") {
+            $seq2 = shift @ARGV;
+        } elsif ($arg eq "-genomedir") {
+            $GENOMEdir = shift @ARGV;
+        } elsif ($arg eq "-meryldir") {
+            $MERYLdir = shift @ARGV;
+        } elsif ($arg eq "-numsegments") {
+            $numSegments = shift @ARGV;
+        } elsif ($arg eq "-numthreads") {
+            $numThreads = shift @ARGV;
+        } elsif ($arg eq "-merylonly") {
+            $merylOnly = 1;
+        } elsif ($arg eq "-merylthreads") {
+            $merylThreads = shift @ARGV;
+        } elsif ($arg eq "-samespecies") {
+            $mersize = 20; # the mer size
+            $merlimit = 1; # unique mers only
+            $minfill = 20; # the minimum fill for a reported match.
+            $maxgap = 0; # the maximum substitution gap
+        } elsif ($arg eq "-samespecies9") {
+            $mersize = 20; # the mer size
+            $merlimit = 9; # mostly unique mers only
+            $minfill = 20; # the minimum fill for a reported match.
+            $maxgap = 0; # the maximum substitution gap
+        } elsif ($arg eq "-crossspecies20") {
+            $mersize = 20; # the mer size
+            $merlimit = 9; # mostly unique mers only
+            $minfill = 20; # the minimum fill for a reported match.
+            $maxgap = 0; # the maximum substitution gap
+            $crossSpecies = 1; # extra parameters in the atac file
+        } elsif ($arg eq "-crossspecies") {
+            $mersize = 18; # the mer size
+            $merlimit = 9; # mostly unique mers only
+            $minfill = 18; # the minimum fill for a reported match.
+            $maxgap = 0; # the maximum substitution gap
+            $crossSpecies = 1; # extra parameters in the atac file
+        } elsif ($arg eq "-filtername") {
+            $filtername = shift @ARGV;
+        } elsif ($arg eq "-filteropts") {
+            $filteropts = shift @ARGV;
+        } elsif ($arg eq "-mersize") {
+            # Changing the mer size also resets the minimum fill to match.
+            $mersize = shift @ARGV;
+            $minfill = $mersize;
+        } elsif ($arg eq "-merlimit") {
+            $merlimit = shift @ARGV;
+        } elsif ($arg eq "-justtestingifitworks") {
+            exit(0);
+        } else {
+            die "unknown option $arg\n";
+        }
+    }
+
+    # Search for -use too.
+    die "-numsegments NOT SUPPORTED.\n" if ($numSegments != 1);
+
+    if (!defined($id1) ||
+        !defined($id2)) {
+        usage();
+    }
+
+    # These checks must come before the paths are made absolute: an
+    # undefined value interpolates as the empty string, which would turn
+    # a missing -dir into "$pwd/" and hide the error.
+    #
+    die "Unset GENOMEdir?\n" if (! defined($GENOMEdir));
+    die "Unset MERYLdir?\n" if (! defined($MERYLdir));
+    die "Unset ATACdir?\n" if (! defined($ATACdir));
+
+    my $pwd = `pwd`;
+    $pwd =~ s/^\s+//;
+    $pwd =~ s/\s+$//;
+
+    $GENOMEdir = "$pwd/$GENOMEdir" if ($GENOMEdir !~ m!^/!);
+    $MERYLdir = "$pwd/$MERYLdir" if ($MERYLdir !~ m!^/!);
+    $ATACdir = "$pwd/$ATACdir" if ($ATACdir !~ m!^/!);
+
+    # The genome directory is only needed when a sequence file has to be
+    # looked up by its ID.
+    if (!defined($seq1) || (!defined($seq2))) {
+        die "Can't find the GENOMEdir '$GENOMEdir'\n" if (! -d $GENOMEdir);
+    }
+    if (defined($seq1)) {
+        $seq1 = "$pwd/$seq1" if ($seq1 !~ m!^/!);
+    }
+    if (defined($seq2)) {
+        $seq2 = "$pwd/$seq2" if ($seq2 !~ m!^/!);
+    }
+
+    system("mkdir $ATACdir") if (! -d "$ATACdir");
+    system("mkdir $ATACdir/work") if (! -d "$ATACdir/work");
+    system("mkdir $ATACdir/stats") if (! -d "$ATACdir/stats");
+    system("mkdir $MERYLdir") if (! -d "$MERYLdir");
+}
+
+
+# Read the nickname files, resolve each assembly ID to a sequence file,
+# and set up symlinks to the data sources in the meryl directory.
+#
+# Sets $seq1/$seq2 from the alias table when they were not given on the
+# command line.  Dies when an ID is missing, when an ID given together
+# with a sequence file is already an alias, when an ID cannot be
+# resolved, or when the resolved file does not exist.
+#
+sub findSources {
+    my %GENOMEaliases;
+
+    # Read all the *.atai files in the genome directory, save only
+    # those nicknames that have actual files associated with them.
+    # This lets us have multiple collections of assemblies, and also
+    # lets us move the directory around (e.g., for running on a
+    # laptop).
+    #
+    # A missing genome directory is not an error here; parseArgs has
+    # already required it whenever a sequence file was not supplied.
+    #
+    if (-d $GENOMEdir) {
+        open(A, "ls $GENOMEdir |") or die "Can't list '$GENOMEdir'\n";
+        while (my $entry = <A>) {
+            chomp $entry;
+            next if ($entry !~ m/\.atai$/);
+
+            my $ataifile = "$GENOMEdir/$entry";
+            open(F, "< $ataifile") or die "Can't open '$ataifile'\n";
+            while (my $line = <F>) {
+                chomp $line;
+
+                if ($line =~ m/^!\s*format\s+atac\s+(.*)$/) {
+                    print STDERR "Found format $1\n";
+                } elsif ($line =~ m/^S\s+(\S+)\s+(\S+)$/) {
+                    my ($alias, $file) = ($1, $2);
+                    if (-e $file) {
+                        $GENOMEaliases{$alias} = $file;
+                    } else {
+                        print STDERR "WARNING: File '$file' not found for alias '$alias'.\n";
+                    }
+                }
+                # Any other line is ignored.
+            }
+            close(F);
+        }
+        close(A);
+    }
+
+    # If the user gave both an id and a sequence, make sure that
+    # the id is distinct.
+    #
+    die "No id1 supplied!\n" if (!defined($id1));
+    die "No id2 supplied!\n" if (!defined($id2));
+
+    die "id1 = '$id1' is already used by sequence '$GENOMEaliases{$id1}'\n" if (defined($GENOMEaliases{$id1}) && defined($seq1));
+    die "id2 = '$id2' is already used by sequence '$GENOMEaliases{$id2}'\n" if (defined($GENOMEaliases{$id2}) && defined($seq2));
+
+    $seq1 = $GENOMEaliases{$id1} if (!defined($seq1));
+    $seq2 = $GENOMEaliases{$id2} if (!defined($seq2));
+
+    die "Unknown alias $id1.\n" if (!defined($seq1));
+    die "Unknown alias $id2.\n" if (!defined($seq2));
+
+    die "File '$seq1' doesn't exist for alias $id1.\n" if (! -e $seq1);
+    die "File '$seq2' doesn't exist for alias $id2.\n" if (! -e $seq2);
+
+    # Later steps refer to the sequences as $MERYLdir/<id>.fasta.
+    system("ln -s $seq1 $MERYLdir/$id1.fasta") if (! -e "$MERYLdir/$id1.fasta");
+    system("ln -s $seq2 $MERYLdir/$id2.fasta") if (! -e "$MERYLdir/$id2.fasta");
+
+    # Link an existing index file too, when there is one.
+    system("ln -s ${seq1}idx $MERYLdir/$id1.fastaidx") if (! -e "$MERYLdir/$id1.fastaidx") && (-e "${seq1}idx");
+    system("ln -s ${seq2}idx $MERYLdir/$id2.fastaidx") if (! -e "$MERYLdir/$id2.fastaidx") && (-e "${seq2}idx");
+}
+
+
+# Make sure both meryl databases exist for one input: the full mer
+# count, and the subset of mers occurring at most $merlimit times.
+# Either one is built only if its .mcdat file is missing.
+#
+# Returns the name (without directory) of the limited database.
+#
+sub countMers {
+    my ($id, $mersize, $merlimit) = @_;
+
+    my $counted = "$MERYLdir/$id.ms$mersize";
+    my $limited = "$counted.le$merlimit";
+
+    # Using "-H 32" is needed if the two sequences aren't about the
+    # same order of magnitude in size. This value is appropriate for
+    # sequences that are genome size.
+
+    unless (-e "$counted.mcdat") {
+        my $cmd = join(" ",
+                       "$meryl -B -C",
+                       "-threads $merylThreads",
+                       "-m $mersize",
+                       "-s $MERYLdir/$id.fasta",
+                       "-o $counted") . " ";
+        if (runCommand($cmd)) {
+            # Do not leave a partial database behind.
+            unlink "$counted.mcidx";
+            unlink "$counted.mcdat";
+            die "Failed to count mers in $id\n";
+        }
+    }
+
+    unless (-e "$limited.mcdat") {
+        my $cmd = join(" ",
+                       "$meryl -v",
+                       "-M lessthanorequal $merlimit",
+                       "-s $counted",
+                       "-o $limited") . " ";
+        if (runCommand($cmd)) {
+            unlink "$limited.mcidx";
+            unlink "$limited.mcdat";
+            die "Failed to count mers lessthanorequal $merlimit in $id\n";
+        }
+    }
+
+    return "$id.ms$mersize.le$merlimit";
+}
+
+
+# Ask meryl how many mers a database holds.  The count is taken from
+# the last "Found N mers" line of the output; 0 if no such line appears.
+#
+sub numberOfMers ($) {
+    my ($database) = @_;
+    my $found = 0;
+
+    open(F, "$meryl -Dc -s $database |");
+    while (my $line = <F>) {
+        if ($line =~ m/Found\s(\d+)\smers/) {
+            $found = $1;
+        }
+    }
+    close(F);
+
+    print STDERR "$database has $found mers.\n";
+    return($found);
+}
+
+
+
+
+# Build the mer mask used by the search: either an 'include' list (mers
+# in common to both inputs) or an 'exclude' list (mers of id1 that are
+# not in common), whichever holds fewer mers.
+#
+# Arguments are the two database names returned by countMers.  Touches
+# work/$matches.mask.done on success.  When -merylonly was given the
+# script exits here, whether the mask was just built or already existed.
+#
+sub buildMask ($$) {
+    my $mercount1 = shift @_;
+    my $mercount2 = shift @_;
+
+    if (-e "$ATACdir/work/$matches.mask.done") {
+        # A finished mask must still end a -merylonly run; returning
+        # would let the caller go on to the search.
+        exit(0) if ($merylOnly == 1);
+        return;
+    }
+
+    my $minFile="min.$mercount1.$mercount2";
+
+    # $mercount1 and $mercount2 are the mers we want to use for
+    # searching. Obviously, only in-common mers can be found, we
+    # make a file of those mers here.
+
+    if (! -e "$ATACdir/work/$minFile.mcdat") {
+        print STDERR "Finding the min count between $mercount1 and $mercount2.\n";
+
+        my $cmd;
+        $cmd = "$meryl ";
+        $cmd .= "-M min ";
+        $cmd .= "-s $MERYLdir/$mercount1 ";
+        $cmd .= "-s $MERYLdir/$mercount2 ";
+        $cmd .= "-o $ATACdir/work/$minFile ";
+
+        if (runCommand($cmd)) {
+            unlink "$ATACdir/work/$minFile.mcidx";
+            unlink "$ATACdir/work/$minFile.mcdat";
+            die "Failed to find the min count between $mercount1 and $mercount2\n";
+        }
+    }
+
+    die "Failed to make the mask?\n" if (! -e "$ATACdir/work/$minFile.mcdat");
+
+    # From that list of in-common mers (in-common and below some
+    # count) we want to make a list of the mers that can be used in
+    # the search table. We can either make a positive (use these
+    # mers) or negative (don't use these mers) list, we just want to
+    # pick the smaller of the two.
+    #
+    #
+    # The positive 'include' list is just the 'min' mers found above.
+    #
+    # The negative 'exclude' list is the min mers, removed from the mers in id1.
+    #
+    # The file sizes are reported only as a rough estimate.
+    #
+    my $includeSize = (-s "$ATACdir/work/$minFile.mcdat");
+    my $excludeSize = (-s "$MERYLdir/$id1.ms$mersize.mcdat") - (-s "$ATACdir/work/$minFile.mcdat");
+
+    print STDERR "includeSize is proportional to $includeSize.\n";
+    print STDERR "excludeSize is proportional to $excludeSize.\n";
+
+    # But this sometimes breaks (if the mcidx files are different sizes), so we now
+    # pay the cost of actually counting the number of mers.
+    #
+    $includeSize = numberOfMers("$ATACdir/work/$minFile");
+    $excludeSize = numberOfMers("$MERYLdir/$id1.ms$mersize") - $includeSize;
+
+    print STDERR "includeSize is exactly $includeSize mers.\n";
+    print STDERR "excludeSize is exactly $excludeSize mers.\n";
+
+    if ($includeSize < $excludeSize) {
+        # The in-common mers are the include list as they stand.
+        rename "$ATACdir/work/$minFile.mcidx", "$ATACdir/work/$matches.include.mcidx";
+        rename "$ATACdir/work/$minFile.mcdat", "$ATACdir/work/$matches.include.mcdat";
+    } else {
+        if (! -e "$ATACdir/work/$matches.exclude.mcdat") {
+            print STDERR "Finding 'exclude' mers!\n";
+
+            # Our use of xor here is really just a subtraction. We
+            # want to report those mers that are only in the first
+            # file, not in the second. All mers in the second file
+            # should be in the first file, by construction.
+
+            my $cmd;
+            $cmd = "$meryl ";
+            $cmd .= "-M xor ";
+            $cmd .= "-s $MERYLdir/$id1.ms$mersize ";
+            $cmd .= "-s $ATACdir/work/$minFile ";
+            $cmd .= "-o $ATACdir/work/$matches.exclude ";
+
+            if (runCommand($cmd)) {
+                unlink "$ATACdir/work/$matches.exclude.mcidx";
+                unlink "$ATACdir/work/$matches.exclude.mcdat";
+                die "Failed to make exclude mers!\n";
+            }
+        }
+
+        if (-e "$ATACdir/work/$matches.exclude.mcdat") {
+            # The in-common mers are no longer needed.
+            unlink "$ATACdir/work/$minFile.mcdat";
+            unlink "$ATACdir/work/$minFile.mcidx";
+        } else {
+            die "Failed to find exclude mers?\n";
+        }
+    }
+
+    # Success!
+    #
+    open(F, "> $ATACdir/work/$matches.mask.done");
+    close(F);
+
+    exit(0) if ($merylOnly == 1);
+}
+
+
+
+# Partition the first assembly with leaff, write one segment file per
+# partition, then run seatac for every segment that has no .matches
+# file yet.  Returns the list of segment IDs ("000", "001", ...).
+#
+# Dies if a partition line cannot be parsed, if no segments are
+# reported, or if a search fails (its output is then renamed to
+# .matches.crash).
+#
+sub findHits {
+    my @segmentIDs;
+    my $nextID = "000";   # kept as a string so that ++ gives "001", "002", ...
+
+    open(F, "$leaff --partitionmap $numSegments $MERYLdir/$id1.fasta |");
+    $numSegments = <F>;   # the first line is stored as the segment count
+    while (my $line = <F>) {
+        # The first field of each line is not used; the remaining
+        # fields look like "number(number)".
+        my (undef, @pieces) = split '\s+', $line;
+
+        my $segments = "";
+        foreach my $piece (@pieces) {
+            die "Error parsing segment: $piece\n" if ($piece !~ m/(\d+)\(\d+\)/);
+            $segments .= "$1\n";
+        }
+
+        open(S, "> $ATACdir/work/$matches-segment-$nextID");
+        print S $segments;
+        close(S);
+
+        push @segmentIDs, $nextID;
+        $nextID++;
+    }
+    close(F);
+
+    die "No segments found?\n" if (scalar(@segmentIDs) == 0);
+
+    # For large runs, while developing, we found it very useful
+    # to build the tables first, save them to disk, then do the
+    # compute. This is also mandatory if one wants to segment
+    # the other assembly to reduce the time each piece runs.
+    #
+    # However, doing so adds a lot of complexity to this script,
+    # and isn't terribly useful anymore.
+    #
+    foreach my $segmentID (@segmentIDs) {
+        my $segment = "$ATACdir/work/$matches-segment-$segmentID";
+
+        # A segment that already has output is not searched again.
+        next if (-e "$segment.matches");
+
+        my @args = ($seatac,
+                    "-verbose",
+                    "-mersize $mersize",
+                    "-minlength $minfill",
+                    "-maxgap $maxgap",
+                    "-numthreads $numThreads",
+                    "-table $MERYLdir/$id1.fasta",
+                    "-stream $MERYLdir/$id2.fasta");
+
+        # Use whichever mask buildMask produced.
+        push @args, "-only $ATACdir/work/$matches.include" if (-e "$ATACdir/work/$matches.include.mcdat");
+        push @args, "-mask $ATACdir/work/$matches.exclude" if (-e "$ATACdir/work/$matches.exclude.mcdat");
+
+        # Until we fix the -use support in seatac.
+        #push @args, "-use $segment";
+
+        push @args, "-output $segment.matches";
+        push @args, "-filtername $filtername" if (defined($filtername));
+        push @args, "-filteropts \"-1 $id1 -2 $id2 $filteropts\"";
+        push @args, "> $ATACdir/work/$matches-$segmentID.out 2>&1";
+
+        if (runCommand(join(" ", @args))) {
+            unlink "$segment.matches.crash";
+            rename "$segment.matches", "$segment.matches.crash";
+            die "Failed to run $matches-$segmentID\n";
+        }
+    }
+
+    return(@segmentIDs);
+}
+
+
+
+# Write the atac header file and run matchExtender over the raw matches
+# of every segment, producing work/$matches.matches.extended.
+#
+# Takes the list of segment IDs returned by findHits.  Does nothing if
+# the extended file already exists.  Dies if a segment has no .matches
+# file, if the header cannot be written, or if matchExtender fails (its
+# output is then renamed to .FAILED).
+#
+sub extendMatches (@) {
+    my @segmentIDs = @_;
+
+    return if (-e "$ATACdir/work/$matches.matches.extended");
+
+    # Check that each search finished.
+    #
+    foreach my $segmentID (@segmentIDs) {
+        if (! -e "$ATACdir/work/$matches-segment-$segmentID.matches") {
+            die "$ATACdir/work/$matches-segment-$segmentID.matches failed to complete.\n";
+        }
+    }
+
+    # These options correspond to the matchExtender* values written to
+    # the header below for cross-species runs.
+    if ($crossSpecies) {
+        $matchExtenderOpts = "-e 4 -b 5 -s 5 -i 0.70 -p 100 -d 25";
+    }
+
+    # Build the header file.
+    #
+    open(ATACFILE, "> $ATACdir/work/$matches.header") or die "Failed to open '$ATACdir/work/$matches.header' for writing\n";
+    print ATACFILE "!format atac 1.0\n";
+    print ATACFILE "#\n";
+    print ATACFILE "# Legend:\n";
+    print ATACFILE "#\n";
+    print ATACFILE "# Field 0: the row class\n";
+    print ATACFILE "# Field 1: the match type u=ungapped, x=exact, ....\n";
+    print ATACFILE "# Field 2: the match instance index\n";
+    print ATACFILE "# Field 3: the parent index\n";
+    print ATACFILE "# Field 4: the FASTA sequence id in the first assembly\n";
+    print ATACFILE "# Field 5: the offset from the start of the sequence for the match\n";
+    print ATACFILE "# Field 6: the length of the match in the first assembly\n";
+    print ATACFILE "# Field 7: the orientation of the match sequence in the first assembly.\n";
+    print ATACFILE "# Field 8: the FASTA sequence id for the second assembly\n";
+    print ATACFILE "# Field 9: the offset from the start of the sequence for the match\n";
+    print ATACFILE "# Field 10: the length of the match in the second assembly\n";
+    print ATACFILE "# Field 11: the orientation of the match sequence in the second assembly.\n";
+    print ATACFILE "#\n";
+    print ATACFILE "/assemblyId1=$id1\n";
+    print ATACFILE "/assemblyId2=$id2\n";
+    print ATACFILE "/assemblyFile1=$MERYLdir/$id1.fasta\n";
+    print ATACFILE "/assemblyFile2=$MERYLdir/$id2.fasta\n";
+
+    print ATACFILE "/rawMatchMerSize=$mersize\n";
+    print ATACFILE "/rawMatchMerMaxDegeneracy=$merlimit\n";
+    print ATACFILE "/rawMatchAllowedSubstutionBlockSize=$maxgap\n";
+    print ATACFILE "/rawMatchMinFillSize=$minfill\n";
+
+    # NOTE(review): these two values are fixed text; they match the
+    # default $filteropts but are not updated when -filteropts is given.
+    print ATACFILE "/heavyChainsOn=1\n";
+    print ATACFILE "/heavyMaxJump=100000\n";
+    print ATACFILE "/heavyMinFill=100\n";
+
+    print ATACFILE "/matchExtenderOn=1\n";
+
+    print ATACFILE "/uniqueFilterOn=1\n";
+    print ATACFILE "/fillIntraRunGapsOn=1\n";
+
+    if ($crossSpecies) {
+        # The non-default parameters for Mouse versus Rat.
+        print ATACFILE "/matchExtenderMinEndRunLen=4\n";
+        print ATACFILE "/matchExtenderMaxMMBlock=5\n";
+        print ATACFILE "/matchExtenderMinBlockSep=5\n";
+        print ATACFILE "/matchExtenderMinIdentity=0.7\n";
+        print ATACFILE "/matchExtenderMaxNbrSep=100\n";
+        print ATACFILE "/matchExtenderMaxNbrPathMM=25\n";
+        print ATACFILE "/globalMatchMinSize=20\n";
+        print ATACFILE "/fillIntraRunGapsErate=0.30\n";
+    }
+    close(ATACFILE);
+
+
+    # run matchExtender on the header followed by every segment's matches
+    #
+    my $cmd;
+    $cmd = "$BINdir/matchExtender $matchExtenderOpts ";
+    $cmd .= "$ATACdir/work/$matches.header ";
+    foreach my $segmentID (@segmentIDs) {
+        $cmd .= " $ATACdir/work/$matches-segment-$segmentID.matches";
+    }
+    $cmd .= " > $ATACdir/work/$matches.matches.extended";
+
+    if (runCommand($cmd)) {
+        rename "$ATACdir/work/$matches.matches.extended", "$ATACdir/work/$matches.matches.extended.FAILED";
+        die "Failed.\n";
+    }
+}
+
+
+
+# Run the python chainer on the extended matches, unless its output
+# (work/$matches.matches.extended.chained.atac) already exists.
+# Sets TMPDIR (if unset) and PYTHONPATH in the environment first.
+# Dies if the chainer fails.
+#
+sub makeChains {
+    my $extended = "$ATACdir/work/$matches.matches.extended";
+
+    return if (-e "$extended.chained.atac");
+
+    unless (defined($ENV{"TMPDIR"})) {
+        print STDERR "WARNING: TMPDIR not set, defaulting to '$ATACdir'.\n";
+        $ENV{"TMPDIR"} = $ATACdir;
+    }
+
+    # Path to the python shared-objects (in lib) and the python scripts.
+    #
+    $ENV{'PYTHONPATH'} = "$LIBdir";
+
+    my $failed = runCommand("python $chainer $extended");
+    if ($failed) {
+        # Show the search path to help diagnose the failure.
+        print STDERR "PYTHONPATH=$ENV{'PYTHONPATH'}\n";
+        die "Chainer failed.\n";
+    }
+
+}
+
+
+
+# Run correctGaps on the chained matches and publish the result as
+# $ATACdir/$matches.atac (hard link) and work/$matches.atac (symlink).
+# Does nothing if $ATACdir/$matches.atac already exists.  Dies if
+# correctGaps fails; its output is then renamed to .gapsclosed.FAILED.
+#
+sub closeGaps {
+    my $final = "$ATACdir/$matches.atac";
+
+    return if (-e $final);
+
+    my $chained = "$ATACdir/work/$matches.matches.extended.chained";
+    my $closed = "$chained.gapsclosed";
+
+    my $cmd = "$correctgaps "
+            . " -m $chained.atac "
+            . " -l $closed.log"
+            . " > $closed.atac";
+
+    if (runCommand($cmd)) {
+        rename "$closed.atac", "$closed.FAILED";
+        die "Failed to close gaps!\n";
+    }
+
+    unless (-e "$ATACdir/work/$matches.atac") {
+        system("ln -s $closed.atac $ATACdir/work/$matches.atac");
+    }
+    unless (-e $final) {
+        system("ln $closed.atac $final");
+    }
+}
+
+
+
+# Run clumpMaker twice on $ATACdir/$matches.atac, once with each
+# assembly as the reference, then rewrite each result with UIDs.
+#
+# A clump file that already exists is not rebuilt.  Dies if clumpMaker
+# fails; the partial output is then renamed to <file>.FAILED.
+#
+sub makeClumps {
+    my $clumpCost = 5000;
+
+    # Each pass is (clumpMaker reference flag, assembly ID used in the
+    # output file name).
+    foreach my $pass ([1, $id1], [2, $id2]) {
+        my ($ref, $rid) = @$pass;
+        my $clumps = "$ATACdir/$matches.ref=$rid.clumpCost=$clumpCost.atac";
+
+        if (! -e $clumps) {
+            my $cmd;
+            $cmd = "cd $ATACdir ";
+            $cmd .= "&& ";
+            $cmd .= "$BINdir/clumpMaker ";
+            $cmd .= " -c $clumpCost ";
+            $cmd .= " -$ref ";
+            $cmd .= " -f $ATACdir/$matches.atac ";
+            $cmd .= "> $clumps";
+            if (runCommand($cmd)) {
+                # Move the partial file aside so that a rerun does not
+                # mistake it for a finished one.
+                rename $clumps, "$clumps.FAILED";
+                die "Failed to make clumps!\n";
+            }
+        }
+        rewriteUIDs($clumps);
+    }
+}
+
+
+
+# Produce the statistics file and the two dot plots (matches and runs)
+# under $ATACdir/stats.  Each output is skipped if it already exists.
+# Dies when a command fails, after moving aside or removing its output.
+#
+sub generateStatistics {
+    my $atac = "$ATACdir/$matches.atac";
+    my $prefix = "$ATACdir/stats/$matches";
+
+    unless (-e "$prefix.stats") {
+        my $cmd = join(" ",
+                       $statsgenerator,
+                       "-a $atac",
+                       "-p $prefix",
+                       "-g A",
+                       "> $prefix.stats");
+        if (runCommand($cmd)) {
+            rename "$prefix.stats", "$prefix.stats.FAILED";
+            die "Failed to ganerate statistics.\n";
+        }
+    }
+
+    # Each plot is (makeplot mode letter, word used in the file name).
+    foreach my $plot (["u", "matches"], ["r", "runs"]) {
+        my ($mode, $kind) = @$plot;
+        my $png = "$prefix.$kind.png";
+
+        next if (-e $png);
+
+        if (runCommand("$makeplot $mode $atac $png")) {
+            unlink $png;
+            unlink "$prefix.$kind.ps";
+            die "Failed to ganerate dot plots.\n";
+        }
+    }
+}
+
+
+
+# Reads an atac file with atac-format IDs, writes an atac file with
+# UIDs (the first word in the defline) to "<infile>.uids".
+#
+# The sequence files and assembly tags are taken from the
+# /assemblyFile and /assemblyId lines of the input.  An atac-format ID
+# is "tag:N", where N counts the deflines of the sequence file from 0.
+# Only lines starting with 'M' are changed; their fields are rejoined
+# with single spaces.  Does nothing if the output already exists.
+#
+# Dies if a file cannot be opened, if the input lacks any of the four
+# header lines, or if a match refers to an unknown ID.
+#
+sub rewriteUIDs ($) {
+    my $infile = shift @_;
+    my $otfile = "$infile.uids";
+
+    return if (-e $otfile);
+
+    my ($seqA, $tagA, $seqB, $tagB);
+
+    # Scan the header, stopping as soon as all four values are known.
+    # Reaching end of file must end the loop as well, or a file with a
+    # missing tag would never finish.
+    open(F, "< $infile") or die "Failed to open '$infile'\n";
+    while (<F>) {
+        $seqA = $1 if (m/^\/assemblyFile1=(.*)$/);
+        $tagA = $1 if (m/^\/assemblyId1=(.*)$/);
+        $seqB = $1 if (m/^\/assemblyFile2=(.*)$/);
+        $tagB = $1 if (m/^\/assemblyId2=(.*)$/);
+
+        last if (defined($seqA) && defined($tagA) && defined($seqB) && defined($tagB));
+    }
+    close(F);
+
+    if (!defined($seqA) || !defined($tagA) || !defined($seqB) || !defined($tagB)) {
+        die "Something fishy. Didn't find seqs or tags in '$infile'.\n";
+    }
+
+    # Map "tag:index" to the first word of each defline, per assembly.
+    my %uidA;
+    my %uidB;
+
+    foreach my $assembly ([$tagA, $seqA, \%uidA], [$tagB, $seqB, \%uidB]) {
+        my ($tag, $seq, $uids) = @$assembly;
+        my $iid = 0;
+
+        open(F, "< $seq") or die "Failed to open '$seq'\n";
+        while (<F>) {
+            if (m/^>(\S+)\s*.*$/) {
+                $uids->{"$tag:$iid"} = $1;
+                $iid++;
+            }
+        }
+        close(F);
+    }
+
+    open(F, "< $infile") or die "Failed to open '$infile'\n";
+    open(G, "> $otfile") or die "Failed to open '$otfile' for writing\n";
+    while (<F>) {
+        chomp $_;
+
+        if (m/^M/) {
+            my @v = split '\s+', $_;
+
+            die "Didn't find uidA for $v[4]\n" if (!defined($uidA{$v[4]}));
+            die "Didn't find uidB for $v[8]\n" if (!defined($uidB{$v[8]}));
+
+            $v[4] = $uidA{$v[4]};
+            $v[8] = $uidB{$v[8]};
+
+            $_ = join ' ', @v;
+        }
+
+        print G "$_\n";
+    }
+    close(G);
+    close(F);
+}
+
+
+
+
+
+
+
+
+# Echo a shell command to STDERR, run it, and report the outcome:
+# returns 1 if system() gave a non-zero status, 0 otherwise.  The
+# status itself is not decoded.
+#
+sub runCommand {
+    my ($command) = @_;
+
+    print STDERR "\n$command\n\n";
+
+    return(system($command) ? 1 : 0);
+}
diff --git a/atac-driver/chainer/Make.include b/atac-driver/chainer/Make.include
new file mode 100644
index 0000000..bf3f1e2
--- /dev/null
+++ b/atac-driver/chainer/Make.include
@@ -0,0 +1,56 @@
+# -*- makefile -*-
+
+# C++ sources, taken from the localalign/ and halign/ subdirectories.
+$/.CXX_SRCS := $/localalign/GF_ALN_dpaligner.C \
+ $/localalign/GF_ALN_local.C \
+ $/localalign/GF_ALN_overlap.C \
+ $/localalign/GF_ALN_loverlapper.C \
+ $/localalign/GF_ALN_pieceOlap.C \
+ $/localalign/localAlignerInterfacemodule.C \
+ $/halign/halign.C \
+ $/halign/halignmodule.C
+
+# Shared libraries; the objects that go into each are listed further down.
+$/.CXX_SHLIBS := $/localAlignerInterfacemodule.so \
+ $/halignmodule.so
+
+# Python files.  AtacDriver.py appears in both lists.
+$/.PY_EXES := $/python/AtacDriver.py
+$/.PY_LIBS := $/python/AtacDriver.py \
+ $/python/AtacFile.py \
+ $/python/DNA.py \
+ $/python/IdxStore.py \
+ $/python/MatchRecord.py \
+ $/python/MyFile.py \
+ $/python/PerfectRuns.py \
+ $/python/TrimMatchOverlaps.py \
+ $/python/UniqueFilter.py \
+ $/python/dedashMatches.py \
+ $/python/fillIntraRunGaps.py \
+ $/python/mkstats.py \
+ $/python/squeezeIntraRunGaps.py
+
+
+# File patterns listed for cleaning.
+$/.CLEAN := $/*.o $/*/*.o $/*.so $/python/*.pyc
+
+# Dependency generation doesn't know about CFLAGS_PYTHON, so this can appear as a dependency
+# if Python.h isn't in the standard include paths, and then 'No rule to make target 'Python.h',
+# needed by ...' appears. We fix by explicitly pointing to Python.h
+#
+# Unfortunately, Python.h remains out of date (as it would with .PHONY), so we build every time.
+#
+Python.h: ${PYTHON_H}
+
+# The two *module.o objects get explicit rules because they are
+# compiled with ${CFLAGS_PYTHON} in addition to the usual C++ flags.
+$/localalign/localAlignerInterfacemodule.o: $/localalign/localAlignerInterfacemodule.C
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} ${CFLAGS_PYTHON} -o $@ -c $<
+
+$/localAlignerInterfacemodule.so: $/localalign/GF_ALN_dpaligner.o \
+ $/localalign/GF_ALN_local.o \
+ $/localalign/GF_ALN_overlap.o \
+ $/localalign/GF_ALN_loverlapper.o \
+ $/localalign/GF_ALN_pieceOlap.o \
+ $/localalign/localAlignerInterfacemodule.o
+
+$/halign/halignmodule.o: $/halign/halignmodule.C
+ ${-CXX} ${CXX} ${CXXFLAGS} ${CXXFLAGS_COMPILE} ${CFLAGS_PYTHON} -o $@ -c $<
+
+$/halignmodule.so: $/halign/halign.o \
+ $/halign/halignmodule.o
+
+
diff --git a/atac-driver/chainer/halign/halign.C b/atac-driver/chainer/halign/halign.C
new file mode 100644
index 0000000..83aad94
--- /dev/null
+++ b/atac-driver/chainer/halign/halign.C
@@ -0,0 +1,556 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "halign.H"
+
+#if defined (__SVR4) && defined (__sun)
+// Solaris defines SS in sys/regset.h
+#undef SS
+#endif
+
+#define min(x,y) ((x)<=(y) ? (x):(y))
+#define max(x,y) ((x)>=(y) ? (x):(y))
+
+typedef enum { DEL, INS, SUB, MOV } OpType;
+
+#define START 1 // The position of the first character.
+#define GAP_OPEN 1 // These are costs.
+#define GAP_EXTEND 1
+#define MISMATCH 1
+
+// One node of a singly linked edit script: a run of 'num' operations of
+// kind 'op_type', followed by the node 'next' (NULL at the end of the list).
+class EditScript {
+public:
+  EditScript(OpType op, int nm, EditScript *nx)
+    : op_type(op), num(nm), next(nx) {
+  }
+
+  OpType      op_type;   // Kind of operation: SUB, MOV, INS, or DEL
+  int         num;       // How many times the operation is repeated
+  EditScript *next;      // Node that follows this one in the script
+};
+
+
+// Condense the edit script starting at head and store it, flattened, in
+// aln->scriptAsArray.
+//
+// Adjacent nodes with the same operation are merged first, e.g.
+// "DEL 5; DEL 3" becomes "DEL 8".  The array is then filled as
+//
+//   [0] = number of ints that follow
+//   [1] = op, [2] = num, [3] = op, [4] = num, ...
+//
+// Every node of the list is deleted along the way, so the caller must not
+// touch head afterwards.
+//
+// aln->scriptAsArray is replaced by a fresh malloc() block whenever the
+// current capacity (aln->scriptAsArrayMax) is not larger than the space
+// needed; otherwise the existing buffer is reused.
+//
+void
+convertScriptToAlignment(EditScript *head,
+                         H_Alignment_t *aln) {
+
+  // Upper bound on the array size: one slot for the leading count and two
+  // for the last node (the initial 3), plus two for every node that still
+  // has a successor when the loop below visits it.
+  int scriptLen = 3;
+
+  for (EditScript *tp = head; tp && tp->next; tp = tp->next) {
+    scriptLen += 2;
+
+    // Fold every immediately following node of the same kind into tp.
+    EditScript *dup;
+    while (((dup = tp->next) != NULL) &&
+           (tp->op_type == dup->op_type)) {
+      tp->num  += dup->num;
+      tp->next  = dup->next;
+
+      // The nodes are created with 'new' (see diff()), so they must be
+      // released with 'delete'; calling free() on them is undefined.
+      delete dup;
+    }
+  }
+
+  // Allocate space for the alignment
+  //
+  if (aln->scriptAsArrayMax <= scriptLen) {
+    free(aln->scriptAsArray);
+
+    aln->scriptAsArrayMax = scriptLen;
+    aln->scriptAsArray    = (int *)malloc(scriptLen * sizeof(int));
+
+    assert(aln->scriptAsArray != NULL);
+  }
+
+  aln->scriptAsArray[0] = 0;
+
+  // Convert, releasing each node as soon as it has been copied.
+  //
+  int arySize = 0;
+
+  for (EditScript *tp = head; tp != NULL; ) {
+    EditScript *done = tp;
+
+    aln->scriptAsArray[++arySize] = tp->op_type;
+    aln->scriptAsArray[++arySize] = tp->num;
+
+    tp = tp->next;
+    delete done;
+  }
+
+  aln->scriptAsArray[0] = arySize;
+}
+
+
+
+// Recursive divide-and-conquer global alignment of s1[0..len1) against
+// s2[0..len2).  The resulting edit script is returned as a linked list in
+// *head .. *tail, and the function returns the cost it computed for it.
+//
+// CC, DD, RR and SS are caller-supplied work arrays, each indexed 0..len2
+// (halignStart() carves them out of one 4*(len2+1) allocation).  They are
+// overwritten by every call, including the recursive ones.
+//
+// g, h and x are the gap-open, gap-extend and mismatch costs (halignStart()
+// passes GAP_OPEN, GAP_EXTEND and MISMATCH).  tb and te are used in place of
+// g for a deletion that touches the beginning / end of this subproblem; the
+// recursive calls pass 0 there when the boundary lies inside a deletion.
+//
+// free_start / free_end, when non-zero, make unaligned s2 characters at the
+// start / end of this subproblem cost nothing.
+//
+// NOTE(review): start_cgap / end_cgap look like the number of s1 characters
+// at the start / end that may be deleted without cost -- confirm.
+// halignStart() always passes 0 for both.
+//
+// NOTE(review): the overall shape (forward pass to the middle row, reverse
+// pass back to it, pick the best crossing column, recurse on both halves)
+// looks like a linear-space Myers/Miller-style alignment -- confirm against
+// the original source of this routine.
+//
+static
+int
+diff(const char *s1,
+ const char *s2,
+ int len1,
+ int len2,
+ int *CC,
+ int *DD,
+ int *RR,
+ int *SS,
+ int g,
+ int h,
+ int x,
+ int start_cgap,
+ int end_cgap,
+ int free_start,
+ int free_end,
+ int tb,
+ int te,
+ EditScript **head,
+ EditScript **tail) {
+
+ int i, j, s, t, c, e, tmp;
+ const char *a, *b;
+ int mincost, mintype, midi, midj;
+ EditScript *tmp_head = 0L;
+ EditScript *tmp_tail = 0L;
+
+ // Base cases: one or both sequences empty, or s1 is a single character.
+ // Each of them builds its script directly and returns.
+ if (len1==0 && len2==0) {
+
+ *head = *tail = NULL;
+ return 0;
+
+ } else if (len2 == 0) {
+ int tmpb, tmpe;
+
+ // Only s1 is left: delete all of it, charging whichever of the
+ // begin-side and end-side gap costs is cheaper.
+ *head = *tail = new EditScript(DEL,len1,NULL);
+ tmpb = (len1 <= start_cgap) ? 0 : tb+h*(len1-start_cgap);
+ tmpe = (len1 <= end_cgap) ? 0 : te+h*(len1-end_cgap);
+
+ return min(tmpb,tmpe);
+
+ } else if (len1 == 0) {
+
+ // Only s2 is left: insert all of it.
+ *head = *tail = new EditScript(INS,len2,NULL);
+ return ((free_start || free_end) ? 0 : g+(len2*h));
+
+ } else if (len1 == 1) {
+ int tmpcost;
+ char ch;
+
+ /* insert B, delete A; or delete A, insert B */
+ mincost = (start_cgap ? 0 : min(tb,te)+h) + ((free_start || free_end) ? 0 : g+len2*h);
+ mintype = 2; midj = (free_start ? len2 : 0);
+
+ /* ... or insert some B, substitute A, insert the rest of B */
+ for (j=0, ch=*s1; j<len2; j++) {
+ tmpcost = (ch==s2[j]) ? 0 : x;
+ if (!free_start && j) tmpcost += g+j*h;
+ if (!free_end && j+1<len2) tmpcost += g+(len2-j-1)*h;
+ if (tmpcost<mincost) { mincost = tmpcost; mintype = 1; midj = j; }
+ }
+ if (mintype==2) {
+ /* delete A */
+ if (free_start) {
+ *tail = new EditScript(DEL,1,NULL);
+ *head = new EditScript(INS,len2,*tail);
+ } else {
+ *tail = new EditScript(INS,len2,NULL);
+ *head = new EditScript(DEL,1,*tail);
+ }
+ } else { /* substitute A */
+ EditScript *aux;
+ *tail = (midj<len2-1) ? new EditScript(INS,len2-midj-1,NULL) : NULL;
+ aux = new EditScript((ch==s2[midj] ? MOV : SUB),1,*tail);
+ if (*tail==NULL) *tail = aux;
+ *head = (midj>0) ? new EditScript(INS,midj,aux) : aux;
+ }
+ return mincost;
+
+ } else {
+ int tmph, tmpg;
+
+ // General case: split s1 at its middle row.
+ midi = (int)(len1/2);
+
+ /* compute CC and DD in the forward phase */
+ tmph = free_start ? 0 : h;
+ tmpg = free_start ? 0 : g;
+ for (CC[0]=0, t=tmpg, j=1; j<=len2; j++) {
+ /* if free_start, allow gap-free ends in the genomic sequence */
+ CC[j] = DD[j] = t = t+tmph;
+ DD[j] += tmpg;
+ }
+
+ // Rows 1..midi.  s holds the previous row's value one column to the
+ // left (the diagonal predecessor), e the running cost of ending in a
+ // gap along the row, c the value just written to CC[j].
+ for (a=s1, i=1; i<=midi; i++, a++) {
+
+ s = CC[0];
+ CC[0] = c = t = max(i-start_cgap,0)*h + (i>start_cgap)*tb;
+
+ e = t + g;
+ for (b=s2, j=1; j<=len2; j++, b++) {
+
+ e = min(e, c+g) + h;
+
+ DD[j] = (j==len2 && i>=len1-end_cgap+1) ?
+ min(DD[j], CC[j]) : (min(DD[j]+(i==start_cgap+1)*g, CC[j]+g) + h);
+
+ c = min(DD[j], min(e, s+x*(*a!=*b)));
+ s = CC[j]; CC[j] = c;
+ }
+ }
+ DD[0] = CC[0];
+
+ /* compute RR and SS in the reverse phase */
+ tmph = free_end ? 0 : h;
+ tmpg = free_end ? 0 : g;
+ for (RR[len2]=0, t=tmpg, j=len2-1; j>=0; --j) {
+ /* if free_end, allow gap-free ends in the genomic sequence */
+ RR[j] = SS[j] = t = t+tmph;
+ SS[j] += tmpg;
+ }
+
+ // Mirror image of the forward loop, walking rows len1-1 down to midi.
+ for (a=s1+len1-1, i=len1-1; i>=midi; --i, --a) {
+
+ s = RR[len2];
+ RR[len2] = c = t = max((len1-end_cgap)-i,0)*h + (i<len1-end_cgap)*te;
+
+ e = t + g;
+ for (b=s2+len2-1, j=len2-1; j>=0; --j, --b) {
+
+ e = min(e, c+g) + h;
+
+ SS[j] = (j==0 && i<start_cgap) ?
+ min(SS[j], RR[j]) : (min(SS[j]+(i==len1-end_cgap-1)*g, RR[j]+g) + h);
+ c = min(SS[j], min(e, s+x*(*a!=*b)));
+ s = RR[j]; RR[j] = c;
+
+
+ }
+ }
+ SS[len2] = RR[len2];
+ }
+
+ // Only the general case reaches this point; every base case above has
+ // already returned.
+
+ /* find midj that minimizes the sum */
+ /* special cases: columns 0 and len2 */
+ // mintype 1: the halves meet at CC/RR.  mintype 2: they meet at DD/SS,
+ // where the gap-open cost was charged on both sides and is therefore
+ // subtracted once.
+ midj = 0;
+ if (CC[0]+RR[0]<=DD[0]+SS[0]-g*(midi>start_cgap)) {
+ mincost = CC[0]+RR[0];
+ mintype = 1;
+ } else {
+ mincost = DD[0]+SS[0]-g*(midi>start_cgap);
+ mintype = 2;
+ }
+
+ for (j=1; j<len2; j++) {
+ tmp = min(CC[j]+RR[j],DD[j]+SS[j]-g);
+ if (mincost > tmp) {
+ mincost = tmp;
+ midj = j;
+ mintype = (tmp == CC[j]+RR[j]) ? 1:2;
+ }
+ }
+
+ tmp = min(CC[len2]+RR[len2],DD[len2]+SS[len2]-g*(midi<len1-end_cgap));
+ if (mincost > tmp) {
+ mincost = tmp;
+ midj = len2;
+ mintype = (tmp==CC[len2]+RR[len2]) ? 1:2;
+ }
+ /* compute recursively in the two subregions */
+ // cost1 and cost2 below are assigned but never read; the value returned
+ // is the mincost computed above.
+ if (mintype==1) {
+ int cost1, cost2;
+
+ // Left part is s1[0..midi) x s2[0..midj), right part is the rest.
+ cost1 = diff(s1,
+ s2,
+ midi,
+ midj,
+ CC,
+ DD,
+ RR,
+ SS,
+ g,
+ h,
+ x,
+ min(start_cgap,midi),
+ max(end_cgap-len1+midi,0),
+ free_start,
+ 0,
+ tb,
+ g,
+ head,
+ &tmp_tail);
+ cost2 = diff(s1+midi,
+ s2+midj,
+ len1-midi,
+ len2-midj,
+ CC,
+ DD,
+ RR,
+ SS,
+ g,
+ h,
+ x,
+ max(0,start_cgap-midi),
+ min(end_cgap,len1-midi),
+ 0,
+ free_end,
+ g,
+ te,
+ &tmp_head,
+ tail);
+
+ // Splice the two scripts; the left one is empty when *head is NULL.
+ if (*head)
+ tmp_tail->next = tmp_head;
+ else
+ *head = tmp_head;
+
+ assert(NULL != *tail);
+
+ } else {
+ EditScript *aux;
+ int cost1, cost2;
+
+ // The split falls inside a deletion: s1[midi-1] and s1[midi] are
+ // covered by the explicit "DEL 2" node, the left part is s1[0..midi-1)
+ // and the right part starts at s1[midi+1].  The inner boundaries are
+ // passed a gap-open cost of 0.
+ cost1 = diff(s1,
+ s2,
+ midi-1,
+ midj,
+ CC,
+ DD,
+ RR,
+ SS,
+ g,
+ h,
+ x,
+ min(start_cgap,midi-1),
+ max(end_cgap-len1+midi-1,0),
+ free_start,
+ 0,
+ tb,
+ 0,
+ head,
+ &tmp_tail);
+
+ aux = new EditScript(DEL,2,NULL);
+
+ if (*head)
+ tmp_tail->next = aux;
+ else
+ tmp_tail = *head = aux;
+
+ cost2 = diff(s1+midi+1,
+ s2+midj,
+ len1-midi-1,
+ len2-midj,
+ CC,
+ DD,
+ RR,
+ SS,
+ g,
+ h,
+ x,
+ max(0,start_cgap-midi-1),
+ min(end_cgap,len1-midi-1),
+ 0,
+ free_end,
+ 0,
+ te,
+ &tmp_head,
+ tail);
+ aux->next = tmp_head;
+
+ // An empty right part leaves *tail NULL; the DEL node is then the tail.
+ if (*tail==NULL)
+ *tail = aux;
+ }
+
+ return mincost;
+}
+
+
+
+// Align s1 against s2 and store the result in *alignment, ready to be
+// walked with iterateUngappedAlignSharpEnds().
+//
+// If either string is empty the function returns immediately and leaves
+// *alignment exactly as it was.
+//
+// Otherwise the edit script computed by diff() is flattened into
+// alignment->scriptAsArray, the offsets are set to START (the script is
+// walked in one-based coordinates), len1/len2 receive the string lengths,
+// score the cost returned by diff(), and 'first' is set so that the next
+// iterator call starts from the beginning.
+//
+void
+halignStart(char *s1,
+            char *s2,
+            H_Alignment_t *alignment) {
+
+  if (!s1[0] || !s2[0])
+    return;
+
+  int lenA = strlen(s1);
+  int lenB = strlen(s2);
+
+  EditScript *scriptFirst = NULL;
+  EditScript *scriptLast  = NULL;
+
+  // One allocation holds the four work rows diff() needs, each lenB+1 long.
+  int *work = (int *)malloc(4 * (lenB+1) * sizeof(int));
+  assert(NULL != work);
+
+  int cost = diff(s1,
+                  s2,
+                  lenA,
+                  lenB,
+                  work,
+                  work+1*(lenB+1),
+                  work+2*(lenB+1),
+                  work+3*(lenB+1),
+                  GAP_OPEN,
+                  GAP_EXTEND,
+                  MISMATCH,
+                  0,             // start_cgap
+                  0,             // end_cgap
+                  0,             // free_start
+                  0,             // free_end
+                  GAP_OPEN,
+                  GAP_OPEN,
+                  &scriptFirst,
+                  &scriptLast);
+
+  free(work);
+
+  assert(NULL != scriptFirst);
+  assert(NULL != scriptLast);
+
+  scriptLast->next = NULL;
+
+  convertScriptToAlignment(scriptFirst, alignment);
+
+  // The sequences are addressed from zero here, but the script iterator
+  // counts from START.
+  alignment->offset1 = 0+START;
+  alignment->offset2 = 0+START;
+  alignment->len1    = lenA;
+  alignment->len2    = lenB;
+  alignment->score   = cost;
+  alignment->first   = 1;
+}
+
+
+// Step through the ungapped segments of an alignment produced by
+// halignStart(), one segment per call.
+//
+// Each call scans forward through aln->scriptAsArray.  INS and DEL runs,
+// and SUB runs met outside a segment, only advance the current position.
+// A MOV run starts a segment, which then absorbs every directly following
+// MOV or SUB run; the reported lengths stop at the end of the last MOV run,
+// so a segment always begins and ends on a match ("sharp ends").
+//
+// Outputs, valid only when 1 is returned:
+//   bgn1, bgn2  - start of the segment in sequence 1 / 2, with START
+//                 subtracted again
+//   len1, len2  - length of the segment in sequence 1 / 2
+//   nmatInSeg   - number of matching positions (sum of the MOV runs) in
+//                 the segment
+//
+// Returns 1 when a segment was found, 0 when the script is exhausted, when
+// aln is NULL, or when no alignment has been started yet.  nmatInSeg is
+// always reset to 0 on entry.
+//
+// The scan position lives in function-static variables that are reset
+// whenever aln->first is set, so only one alignment can be iterated at a
+// time.
+//
+// The match count used to be started with "nmatInSeg++", which counted the
+// first MOV run as a single match whatever its length, while the correctly
+// accumulated local total was discarded.  The first run now contributes its
+// full length.
+//
+int
+iterateUngappedAlignSharpEnds(H_Alignment_t *aln,
+                              int &bgn1,
+                              int &bgn2,
+                              int &len1,
+                              int &len2,
+                              int &nmatInSeg) {
+
+  static int *lastS, *endS;   // next op to read / last valid array slot
+  static int i, j;            // current position in sequence 1 / 2
+
+  nmatInSeg = 0;
+
+  if (aln == NULL)
+    return 0;  // not valid output
+
+  if (aln->first) {
+    aln->first = 0;
+    i = aln->offset1;
+    j = aln->offset2;
+
+    lastS = aln->scriptAsArray + 1;
+    endS  = aln->scriptAsArray + aln->scriptAsArray[0];
+  }
+
+  // Nothing has ever been started: there is no script to walk.
+  if (lastS == NULL || endS == NULL)
+    return 0;
+
+  // The script is a sequence of (op, count) pairs: lastS[0] is the op and
+  // lastS[1] its count.
+  while (lastS <= endS) {
+    switch (*lastS) {
+      case SUB:
+        i += lastS[1];
+        j += lastS[1];
+        lastS += 2;
+        break;
+
+      case MOV: {
+        const int b1 = i;
+        const int b2 = j;
+
+        nmatInSeg = lastS[1];
+        i += lastS[1];
+        j += lastS[1];
+
+        int l1 = i - b1;
+        int l2 = j - b2;
+
+        lastS += 2;
+
+        // Extend across mismatches, but only move the segment end when a
+        // further MOV run is reached.
+        while (lastS <= endS && (*lastS == SUB || *lastS == MOV)) {
+          const bool isMatch = (*lastS == MOV);
+
+          i += lastS[1];
+          j += lastS[1];
+
+          if (isMatch) {
+            nmatInSeg += lastS[1];
+            l1 = i - b1;
+            l2 = j - b2;
+          }
+
+          lastS += 2;
+        }
+
+        bgn1 = b1 - START;
+        bgn2 = b2 - START;
+        len1 = l1;
+        len2 = l2;
+
+        return 1;  // valid output
+      }
+
+      case INS:
+        j += lastS[1];
+        lastS += 2;
+        break;
+
+      case DEL:
+        i += lastS[1];
+        lastS += 2;
+        break;
+
+      default:
+        fprintf(stderr, "Unrecognized opcode in alignment.\n");
+        exit(1);
+    }
+  }
+
+  return 0;  // not valid output
+}
diff --git a/atac-driver/chainer/halign/halign.H b/atac-driver/chainer/halign/halign.H
new file mode 100644
index 0000000..c345728
--- /dev/null
+++ b/atac-driver/chainer/halign/halign.H
@@ -0,0 +1,54 @@
+/**************************************************************************
+ * This file is part of A2Amapper.
+ * Copyright (c) 2004 Applera Corporation
+ * Author: Clark Mobarry
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received (LICENSE.txt) a copy of the GNU General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**************************************************************************/
+
+#ifndef _ALIGN__HALIGN_H_
+#define _ALIGN__HALIGN_H_
+
+// CMM 2004 Feb 05: The variable "first" does not really belong in
+// the alignment object. We should put it into a separate iterator
+// over alignment struct/class. Currently, this is called by Python
+// so I would need to learn how to make C++ glue code for Python
+// objects. Currently I just make glue code for Python modules.
+
+// Result of halignStart(), plus the restart flag used by
+// iterateUngappedAlignSharpEnds().
+typedef struct H_Alignment_t {
+ int offset1; // Starting coordinate in sequence 1; halignStart() stores START (1) here
+ int offset2; // Starting coordinate in sequence 2; halignStart() stores START (1) here
+ int len1; // Length of sequence 1 (strlen of the first string)
+ int len2; // Length of sequence 2 (strlen of the second string)
+ int score; // Cost returned by the aligner for this alignment
+ int first; // Non-zero: the next iterator call restarts from the beginning
+ int scriptAsArrayMax; // Allocated size of scriptAsArray, in ints
+ int *scriptAsArray; // malloc()ed: [0] = number of ints that follow, then (op, count) pairs
+} H_Alignment_t;
+
+// Align string1 against string2 and store the result in *alignment.
+// Does nothing when either string is empty.  alignment->scriptAsArray
+// is (re)allocated with malloc() as needed, so a fresh H_Alignment_t
+// must be zero-initialized before its first use.
+void halignStart(char *string1,
+ char *string2,
+ H_Alignment_t *alignment);
+
+
+// Return the next ungapped segment of the alignment through the
+// reference arguments.  Returns 1 when the arguments were filled in
+// and 0 when there are no more segments (or aln is NULL).  The
+// iteration state is kept in static storage inside the function.
+int iterateUngappedAlignSharpEnds(H_Alignment_t *aln,
+ int &bgn1,
+ int &bgn2,
+ int &len1,
+ int &len2,
+ int &nmat);
+
+#endif
+
diff --git a/atac-driver/chainer/halign/halignDriver.C b/atac-driver/chainer/halign/halignDriver.C
new file mode 100644
index 0000000..ab60266
--- /dev/null
+++ b/atac-driver/chainer/halign/halignDriver.C
@@ -0,0 +1,58 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "halign.H"
+
+// Small stand-alone driver: align two fixed sequences and print every
+// ungapped segment of the alignment as "bgn1 bgn2 len1 len2".
+//
+// The previous version of this driver was written against an older
+// interface (a seven-argument halignStart(), Free_align() and
+// printUngappedAlignSharpEndsOnConsole()), none of which halign.H declares,
+// so it could not be compiled.  It now uses the interface halign.H
+// actually provides.
+//
+int main(int argc, char **argv) {
+  (void)argc;
+  (void)argv;
+
+  // Writable arrays, because halignStart() takes non-const char pointers.
+  char seq1[] = "ATCGTCCGGATGAAAATGTCTCGGGGGGGGGGGTCGGG";
+  char seq2[] = "ATCGTCTGGATGAAAAAGTCTCAAGGG";
+
+  int bgn1 = 0, bgn2 = 0, len1 = 0, len2 = 0, nmat = 0;
+
+  // All fields zero: no script buffer yet.  halignStart() allocates it.
+  H_Alignment_t aln = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  // Sequence coordinates are base-based, starting from 0.
+  halignStart(seq1, seq2, &aln);
+
+  while (iterateUngappedAlignSharpEnds(&aln, bgn1, bgn2, len1, len2, nmat))
+    printf("%d %d %d %d\n", bgn1, bgn2, len1, len2);
+
+  // The script buffer is obtained with malloc() inside halignStart().
+  free(aln.scriptAsArray);
+
+  return 0;
+}
diff --git a/atac-driver/chainer/halign/halignmodule.C b/atac-driver/chainer/halign/halignmodule.C
new file mode 100644
index 0000000..2015647
--- /dev/null
+++ b/atac-driver/chainer/halign/halignmodule.C
@@ -0,0 +1,60 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <Python.h>
+#include "halign.H"
+
+// The single alignment shared by halignStart() and halignDedash().  It starts
+// out all zero (no script buffer); each halignStart() call overwrites it.
+H_Alignment_t aln = { 0, 0, 0, 0, 0, 0, 0, 0};
+
+// Python: halign.halignStart(seq1, seq2)
+//
+// Parses two string arguments and aligns them into the module-level
+// alignment 'aln'.  Returns None, or NULL (leaving the exception set by
+// PyArg_ParseTuple) when the arguments are not two strings.
+static PyObject *
+spam_halignStart(PyObject *self, PyObject *args) {
+  char *first  = 0L;
+  char *second = 0L;
+
+  if (PyArg_ParseTuple(args, "ss", &first, &second)) {
+    halignStart(first, second, &aln);
+
+    Py_INCREF(Py_None);
+    return(Py_None);
+  }
+
+  return(NULL);
+}
+
+// Python: halign.halignDedash()
+//
+// Returns the next ungapped segment of the module-level alignment as the
+// tuple (bgn1, bgn2, len1, len2, nmat), or None once the iterator reports
+// that nothing is left.  The arguments are ignored.
+static PyObject *
+spam_halignDedash( PyObject *self, PyObject *args) {
+  int segBgn1 = 0;
+  int segBgn2 = 0;
+  int segLen1 = 0;
+  int segLen2 = 0;
+  int segMat  = 0;
+
+  if (!iterateUngappedAlignSharpEnds(&aln, segBgn1, segBgn2, segLen1, segLen2, segMat)) {
+    Py_INCREF(Py_None);
+    return(Py_None);
+  }
+
+  return(Py_BuildValue("(iiiii)", segBgn1, segBgn2, segLen1, segLen2, segMat));
+}
+
+// Method table of the 'halign' module: Python name, C function, calling
+// convention, and doc string.  The all-NULL entry terminates the table.
+static
+PyMethodDef
+registration_table[] = {
+ {"halignStart", spam_halignStart, METH_VARARGS, "initialize halign"},
+ {"halignDedash", spam_halignDedash, METH_VARARGS, "dedashed subalignment"},
+ {NULL, NULL, 0, NULL}
+};
+
+// Module initialization entry point; registers the methods above under the
+// module name "halign".  Declared extern "C" so the symbol name is not
+// mangled.
+extern "C"
+void inithalign(void) {
+ Py_InitModule("halign", registration_table);
+}
diff --git a/atac-driver/chainer/localalign/GF_ALN_dpaligner.C b/atac-driver/chainer/localalign/GF_ALN_dpaligner.C
new file mode 100644
index 0000000..905a218
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_dpaligner.C
@@ -0,0 +1,175 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/* Dynamic programming sequence comparison of two fragments. General
+ purpose utility that uses bit-vector d.p. for detection (see, "A Fast
+ Bit-Vector Algorithm for Approximate String Matching on Dynamic
+ Programming" J. ACM., to appear, by Gene Myers.) and the O(kn) greedy
+ algorithm for alignment delivery (see "An O(ND) Difference Algorithm
+ and Its Variations" Algorithmica 1 (1986), 251-266, by Gene Myers.)
+ Both papers can be downloaded from
+ "http://www.cs.arizona.edu/people/gene/vita.html"
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include "GF_ALN_local.H"
+
+/* O(kn) identity-based alignment algorithm. Find alignment between
+ a and b (of lengths alen and blen), that begins at finishing
+ boundary position *spnt. Return at *spnt the diagonal at which the
+ alignment starts.
+
+ diff sizes the wave storage: the static buffers are grown whenever
+ diff >= Wtop, and are kept between calls. The value returned is the
+ static TraceBuffer, a 0-terminated list of trace entries; it is
+ overwritten by the next call. NULL is returned if the buffers cannot
+ be grown.
+
+ During traceback a negative entry is written when the best
+ predecessor is the cell to the left (n-1) and a positive entry when
+ it is the cell to the right (n+1); nothing is written when the
+ predecessor is the cell directly below.
+
+ NOTE(review): a and b are read at indices i and j, and the scans
+ stop as soon as i <= 0 or j <= 0, so index 0 is never compared --
+ both strings appear to be addressed from 1. Confirm with the callers.
+
+ NOTE(review): the wave loop has no explicit bound on 'level';
+ presumably the caller guarantees that an alignment with at most
+ 'diff' differences exists, otherwise Wave could be overrun. Confirm. */
+
+int *AS_ALN_OKNAlign(const char *a, int alen, const char *b, int blen, int *spnt, int diff) {
+ int diag, wpos, level;
+ int fcell, infinity;
+
+ static int Wtop = -1;
+ static int *Wave;
+ static int *TraceBuffer;
+
+ if (diff >= Wtop) /* Space for diff wave? */
+ { int max, del, *newp;
+
+ max = (int)(1.2*diff) + 50;
+ del = (max+5)*(max+1);
+ //fprintf(stderr,"DP_COMPARE (AS_ALN_OKNAlign): reallocing %ld bytes\n",del*sizeof(int)+(max+1)*sizeof(int));
+ /* One block holds both arrays: del ints of waves, then max+1 ints of trace. */
+ newp = (int *) realloc(Wave,del*sizeof(int) + (max+1)*sizeof(int));
+ if (newp == NULL) return (NULL);
+ Wtop = max-1;
+ Wave = newp;
+ TraceBuffer = (int *) (Wave + del);
+ }
+
+ diag = (alen-blen) + (*spnt); /* Finish diagonal. */
+ infinity = blen+2;
+
+ /* Process 0-wave. */
+
+ { int i, j;
+
+ if (diff == 0) goto zeroscript;
+
+ if ((*spnt) < 0) /* (i,j) = initial boundary pt. */
+ j = blen;
+ else
+ j = blen - (*spnt);
+ i = diag + j;
+
+ /* Slide back along the finish diagonal while the characters agree. */
+ while (1)
+ { if (i <= 0 || j <= 0) goto zeroscript;
+ if (a[i] != b[j]) break;
+ i -= 1;
+ j -= 1;
+ }
+
+ /* Wave 0 is the single cell Wave[2], padded by two boundary cells per side. */
+ Wave[0] = Wave[1] = infinity;
+ Wave[2] = j;
+ Wave[3] = Wave[4] = infinity;
+ }
+
+ /* Compute waves 1 through d-1 do, each wave has
+ two boundary cells at each of its ends. */
+
+ { int m, n, k;
+
+ /* m is the write position in the wave being built, n the position of
+ the matching cell in the previous wave. */
+ m = 5;
+ n = 0;
+ for (level = 1; 1; level++)
+ { Wave[m++] = infinity;
+ Wave[m++] = infinity;
+ n += 1;
+ for (k = -level; k <= level; k++)
+ { int i, j;
+
+ /* Furthest point reachable from the three neighbouring cells of
+ the previous wave (smaller j is further). */
+ j = Wave[n] - 1;
+ if ((i = Wave[n-1]-1) < j)
+ j = i;
+ if ((i = Wave[n+1]) < j)
+ j = i;
+ i = (diag+k) + j;
+ while (1)
+ { if (i <= 0 || j <= 0)
+ { if (i <= 0)
+ *spnt = -j;
+ else
+ *spnt = i;
+ goto madeit;
+ }
+ if (a[i] != b[j]) break;
+ i -= 1;
+ j -= 1;
+ }
+ Wave[m++] = j;
+ n += 1;
+ }
+ Wave[m++] = infinity;
+ Wave[m++] = infinity;
+ n += 1;
+ }
+
+madeit:
+
+
+ /* Remember which cell, on which diagonal offset, reached the boundary. */
+ fcell = n;
+ wpos = k;
+ }
+
+ /* Trace back through wave structure and record
+ trace of the alignment traced. */
+
+ { int d, n, k, t;
+
+ t = 0;
+ n = fcell;
+ k = wpos;
+ for (d = level-1; d >= 0; d--)
+ { int i, j, m;
+
+ /* Repeat the three-way choice made in the forward pass to find
+ the predecessor cell m. */
+ j = Wave[m=n]-1;
+ if ((i = Wave[n-1]-1) < j)
+ { j = i; m = n-1; }
+ if ((i = Wave[n+1]) < j)
+ { j = i; m = n+1; }
+ if (m < n)
+ { TraceBuffer[t++] = - ((diag+k) + (j+1));
+ k -= 1;
+ }
+ else if (m > n)
+ { TraceBuffer[t++] = j+1;
+ k += 1;
+ }
+ /* Step to the same cell in wave d-1; wave d occupies 2*d+5 slots. */
+ n = m - (2*d+4);
+ }
+ TraceBuffer[t] = 0;
+ }
+
+ return (TraceBuffer);
+
+ /* If perfect match, you're done. */
+
+zeroscript:
+ TraceBuffer[0] = 0;
+ *spnt = diag;
+ return (TraceBuffer);
+}
diff --git a/atac-driver/chainer/localalign/GF_ALN_local.C b/atac-driver/chainer/localalign/GF_ALN_local.C
new file mode 100644
index 0000000..86ced17
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_local.C
@@ -0,0 +1,1189 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "GF_ALN_local.H"
+
+// Note, KMERLEN 5, MINMATCH 20, MAXERROR 2, KTHRESH 6 is reasonable
+// for performing fragment against fragment comparisons; it permits
+// relatively small segments to be found; but it will not give
+// acceptable run time for large comparisons such as a BAC against a
+// BAC etc. So, ...
+
+#define KMERLEN 6 // Must be >= 1
+#define MINMATCH 20 // (MINMATCH-KMERLEN) is the maximum jump distance.
+#define MAXERROR 2 // maximum slop in diagnols for chaining KMERLEN hits.
+
+// The minimum number kmer hits that constitutes an acceptable chain.
+#define KTHRESH (MINMATCH - (KMERLEN-1) - KMERLEN*MAXERROR)
+
+// Smaller / larger of two values.  Every argument is parenthesized so that
+// an argument containing a low-precedence operator (|, &, ?:, ...) is
+// evaluated as written.  The chosen argument is still evaluated twice, so
+// do not pass expressions with side effects.
+#define min(a,b) (((a)<(b))?(a):(b))
+#define max(a,b) (((a)>(b))?(a):(b))
+
+
+
+
+/* D.P. extension alignment scoring */
+
+/* N.B.: the larger MAXIGAP, the longer subpart of a trapezoid (potential
+ segment) can be missed; this occurs if (a) Align_Recursion starts
+ in the subpart, with a bad region on both sides of the subpart;
+ the forward pass extends across one of the low-quality regions
+ into a long high-quality region but then the reverse pass doesn't
+ get back across the low-quality region--or rather, it does, but the
+ best score doesn't.
+
+ Of course, the smaller MAXIGAP, the shorter the segments and the
+ less the chance of extending alignments across gaps between trapezoids.
+
+ Setting MAXIGAP small allows alignment extension to end relatively
+ early, so that a short high-quality region bounded by low-quality
+ regions can end up being its own segment. And, it also has the
+ effect that if a low-quality region is small enough to get through,
+ then it doesn't take that long a high-quality segment to increase
+ the score back up to a new maximum value.
+*/
+
+#define MAXIGAP_DEFAULT 3
+
+static int MAXIGAP=MAXIGAP_DEFAULT;
+
+
+/*amount to subtract from score for mismatch */
+//#define DIFFCOST 14
+// we can define it to 14 in order not to extend the alignment
+// only at a high level of stringency
+#define DIFFCOST 3
+
+/*amount to add to score for match*/
+#define SAMECOST 1
+
+static int diffcost=DIFFCOST;
+static int samecost=SAMECOST;
+
+/* Trapezoid merging padding */
+
+#define DPADDING 2
+#define BPADDING KMERLEN+2
+
+static int BLOCKCOST = DIFFCOST*MAXIGAP_DEFAULT;
+static int MATCHCOST = DIFFCOST+SAMECOST;
+
+
+/* Major data types */
+
+/* Hit Record: Description of an index-based seed match, i.e. a run of
+ k-mer hits chained along (nearly) one diagonal by Find_Hits */
+typedef struct {
+ int diagonal; /* Diagonal of hit */
+ int bstart; /* B position of start of hit */
+ int bfinish; /* B position of end of hit */
+} HitRecord;
+
+/* Trapezoid Record: Description of trapezoidal match zone */
+typedef struct _Trap_Tag {
+ struct _Trap_Tag *next; /* Organized in a list linked on this field */
+ int top, bot; /* B-coords of top and bottom of trapezoidal zone */
+ int lft, rgt; /* Left and right diagonals of trapezoidal zone */
+} Trapezoid;
+
+
+/*** UTILITY ROUTINES ***/
+
+/* Report an allocation failure on stderr, naming the structure that could
+   not be allocated, and terminate the program with exit status 1.  Never
+   returns. */
+static void OutOfMemory(char const * const where)
+{
+  fprintf(stderr,"COMPARE_LOCAL: Out of memory (%s)\n",where);
+  exit(1);
+}
+
+/* Reverse-complement the first len characters of seq in place.
+
+   a/c/g/t/n are complemented in either case, '-' is kept (so gapped
+   consensus sequences can be handled), and every other character becomes
+   '?'.  A len of zero or less leaves seq untouched.
+
+   The translation table is built on the first call and kept in static
+   storage.  Characters are converted to unsigned char before they are used
+   as table indices: plain char may be signed, in which case a byte of 0x80
+   or above would otherwise give a negative index. */
+static void Complement(char * const seq, int const len) {
+  static char WCinvert[256];
+  static int  Firstime = 1;
+
+  if (Firstime) {  /* Setup complementation array */
+    static const char from[] = "acgtnACGTN-";
+    static const char to[]   = "tgcanTGCAN-";
+    int i;
+
+    Firstime = 0;
+
+    for (i = 0; i < 256; i++)
+      WCinvert[i] = '?';
+
+    for (i = 0; from[i] != 0; i++)
+      WCinvert[(unsigned char)from[i]] = to[i];
+  }
+
+  if (len <= 0)
+    return;
+
+  /* Complement and reverse sequence: swap from both ends towards the
+     middle, complementing each character as it is moved. */
+
+  char *s = seq;
+  char *t = seq + (len-1);
+
+  while (s < t) {
+    const char c = *s;
+
+    *s++ = WCinvert[(unsigned char) *t];
+    *t-- = WCinvert[(unsigned char) c];
+  }
+
+  /* Odd length: the middle character is complemented where it stands. */
+  if (s == t)
+    *s = WCinvert[(unsigned char) *s];
+}
+
+
+/*** INDEX CONSTRUCTION AND APPLICATION TO FILTERING ***/
+
+/* Shared index and filter arrays used in this subsection */
+
+/* Per-diagonal state kept by Find_Hits while it chains k-mer hits. */
+typedef struct {
+ int minim; /* B position of the first k-mer hit of the current run */
+ int maxim; /* B position of the most recent k-mer hit of the run */
+ int count; /* Number of k-mer hits accumulated in the run */
+} DiagRecord;
+
+static int Kmask = -1;
+static int *Table = NULL; /* [0..Kmask+1] */
+static int *Tuples = NULL; /* [0..<Seqlen>-KMERLEN] */
+static int Map[128];
+
+static DiagRecord *DiagVec; /* [-(Alen-KMERLEN)..(Blen-KMERLEN) + MAXERROR] */
+
+
+/* Reverse complement sequences -- so we do not recompute them over and over */
+static char *BrevC=NULL;
+
+/* Build index table for sequence S of length Slen.
+
+ This is a counting sort of the KMERLEN-mers of S by their code c, which
+ packs two bits per base (bases are translated through Map). A negative
+ Map value marks a character that invalidates every k-mer containing it;
+ h remembers where the last such character was seen, and a k-mer starting
+ at i is only used when i >= h+KMERLEN.
+
+ Pass 1 counts the valid k-mers of each code in Table[c+1], and the
+ running sum turns the counts into start offsets. Pass 2 recomputes the
+ same codes and stores each valid k-mer's start position in Tuples,
+ advancing Table[c] as it goes. The final shift restores the offsets, so
+ that afterwards the start positions of the k-mers with code c are
+ Tuples[Table[c]] .. Tuples[Table[c+1]-1].
+
+ Table, Tuples, Kmask and Map are file-level variables that must have
+ been set up before this is called.
+
+ NOTE(review): Map has 128 entries and is indexed with a plain char
+ converted to int, so a byte outside 0..127 would index outside the
+ array -- presumably the input is known to be 7-bit; confirm. */
+
+static void TableBuild(char const * const S, int const Slen)
+{ int i, c;
+ int x, h;
+ /* s[i] is the last character of the k-mer that starts at S[i]. */
+ char const * const s = S+(KMERLEN-1);
+
+ for (c = 0; c <= Kmask; c++)
+ Table[c] = 0;
+
+ /* Pass 1: prime the code with the first KMERLEN-1 characters ... */
+ h = -KMERLEN;
+ c = 0;
+ for (i = 0; i < KMERLEN-1; i++)
+ { x = Map[(int) (S[i])];
+ if (x >= 0)
+ c = (c << 2) | x;
+ else
+ { c <<= 2; h = i-(KMERLEN-1); }
+ }
+ /* ... then roll it along S, counting each valid k-mer. */
+ for (i = 0; i <= Slen-KMERLEN; i++)
+ { x = Map[(int) (s[i])];
+ if (x >= 0)
+ c = ((c << 2) | x) & Kmask;
+ else
+ { c = (c << 2) & Kmask; h = i; }
+ if (i >= h+KMERLEN)
+ Table[c+1] += 1;
+ }
+
+ /* Counts -> start offsets: Table[c] becomes the number of k-mers with a smaller code. */
+ for (c = 2; c <= Kmask; c++)
+ Table[c] += Table[c-1];
+
+ /* Pass 2: same scan again, this time recording the positions. */
+ h = -KMERLEN;
+ c = 0;
+ for (i = 0; i < KMERLEN-1; i++)
+ { x = Map[(int) (S[i])];
+ if (x >= 0)
+ c = (c << 2) | x;
+ else
+ { c <<= 2; h = i-(KMERLEN-1); }
+ }
+ for (i = 0; i <= Slen-KMERLEN; i++)
+ { x = Map[(int) (s[i])];
+ if (x >= 0)
+ c = ((c << 2) | x) & Kmask;
+ else
+ { c = (c << 2) & Kmask; h = i; }
+ if (i >= h+KMERLEN)
+ Tuples[Table[c]++] = i;
+ }
+
+ /* Pass 2 left Table[c] at the end of bucket c; shift up by one to get the starts back. */
+ for (c = Kmask; c >= 0; c--)
+ Table[c+1] = Table[c];
+ Table[0] = 0;
+}
+
+/* Apply index to find filtered hits between sequences, returning pointer to
+ array of HitRecords of length in the integer pointed at by Hitlen */
+
+/* qsort() comparison for HitRecords: orders them by increasing bstart. */
+static int HSORT(const void *l, const void *r)
+{
+  const HitRecord *lhs = (const HitRecord *) l;
+  const HitRecord *rhs = (const HitRecord *) r;
+
+  return (lhs->bstart - rhs->bstart);
+}
+
+/* Scan the k-mers of B against the index held in Table/Tuples and chain
+ the hits per diagonal. A is not read here, only its length Alen is
+ used (to bound the diagonals); the indexed sequence is whatever
+ TableBuild was last run on.
+
+ The returned array is static storage owned by this function: it is
+ reused, and possibly reallocated, by the next call, and must not be
+ freed by the caller. The records are sorted by bstart and their number
+ is stored in *Hitlen. On allocation failure the program exits through
+ OutOfMemory. */
+static HitRecord *Find_Hits
+(char const * const A, int const Alen,
+ char const * const B, int const Blen, int * const Hitlen)
+{ static int HitMax = -1;
+ static HitRecord *HitList;
+ int hits, disconnect;
+
+ /* First call: create the hit list. */
+ if (HitMax < 0)
+ { HitMax = 10000;
+ HitList = (HitRecord *) malloc(sizeof(HitRecord)*HitMax);
+ if (HitList == NULL)
+ OutOfMemory("Hit list");
+ }
+
+ { int i, j, c;
+ int x, h;
+ /* b[i] is the last character of the k-mer that starts at B[i]. */
+ char const * const b = B + (KMERLEN-1);
+
+ /* Reset every diagonal record that can be touched below. */
+ for (j = -Alen; j <= Blen+MAXERROR; j++)
+ { DiagRecord *dp;
+ dp = DiagVec + j;
+ dp->count = dp->maxim = 0;
+ }
+
+ hits = 0;
+ /* A run is closed once its last hit lies more than this far behind. */
+ disconnect = MINMATCH - KMERLEN;
+ h = -KMERLEN;
+ c = 0;
+ for (i = 0; i < KMERLEN-1; i++)
+ { x = Map[(int) (B[i])];
+ if (x >= 0)
+ c = (c << 2) | x;
+ else
+ { c <<= 2; h = i-(KMERLEN-1); }
+ }
+
+ for (i = 0; i <= Blen-KMERLEN; i++)
+ { x = Map[(int) (b[i])];
+ if (x >= 0)
+ c = ((c << 2) | x) & Kmask;
+ else
+ { c = (c << 2) & Kmask; h = i; }
+ /* Valid k-mer: visit every indexed position with the same code. */
+ if (i >= h+KMERLEN)
+ for (j = Table[c]; j < Table[c+1]; j++)
+ { DiagRecord *dp;
+ int e, k;
+ k = i-Tuples[j];
+ dp = DiagVec + k;
+ /* The hit is credited to diagonals k .. k+MAXERROR. */
+ /* NOTE(review): a run flushed from diagonal k+e is still recorded
+ with diagonal k -- confirm this is intended. */
+ for (e = 0; e <= MAXERROR; e++)
+ { if (dp->maxim < i-disconnect)
+ { if (dp->count >= KTHRESH)
+ { HitRecord *hp;
+ if (hits >= HitMax)
+ { HitMax = (int)(1.2*hits) + 5000;
+ HitList = (HitRecord *) realloc(HitList,
+ sizeof(HitRecord)*HitMax);
+ if (HitList == NULL)
+ OutOfMemory("Hit list");
+ }
+ hp = HitList + hits;
+ hp->diagonal = k;
+ hp->bstart = dp->minim;
+ hp->bfinish = dp->maxim + KMERLEN;
+ hits += 1;
+ }
+ dp->count = 0;
+ }
+ if (dp->count == 0)
+ dp->minim = i;
+ dp->count += 1;
+ dp->maxim = i;
+ dp += 1;
+ }
+ }
+ }
+
+ /* Flush the runs that are still open at the end of B. */
+ for (j = -Alen; j <= Blen+MAXERROR; j++)
+ { DiagRecord *dp;
+ dp = DiagVec + j;
+ if (dp->count >= KTHRESH)
+ { HitRecord *hp;
+ if (hits >= HitMax)
+ { HitMax = (int)(1.2*hits) + 5000;
+ HitList = (HitRecord *)realloc(HitList,sizeof(HitRecord)*HitMax);
+ if (HitList == NULL)
+ OutOfMemory("Hit list");
+ }
+ hp = HitList + hits;
+ hp->diagonal = j;
+ hp->bstart = dp->minim;
+ hp->bfinish = dp->maxim + KMERLEN;
+ hits += 1;
+ }
+ }
+ }
+
+ qsort(HitList,hits,sizeof(HitRecord),HSORT);
+
+ *Hitlen = hits;
+ return (HitList);
+}
+
+
+/*** FORWARD AND REVERSE D.P. EXTENSION ROUTINES ***/
+/* Called at the mid-point of trapezoid -- mid X [lo,hi], the extension
+ is computed to an end point and the lowest and highest diagonals
+ are recorded. These are returned in a partially filled Local_Segment
+ record, that will be merged with that returned for extension in the
+ opposite direction.
+*/
+
+/* Forward extension. Starting from row mid with columns lo..hi scored
+ zero, rows mid, mid+1, ... of A are processed against B. A matching
+ pair adds MATCHCOST before diffcost is subtracted, every other step
+ just costs diffcost. Columns whose score falls more than BLOCKCOST
+ below the best score seen so far are trimmed from both ends of the
+ active range; the loop ends when the range is empty or A is used up.
+
+ The two score rows are laid over the memory of DiagVec, Blen+1 ints
+ apart, so DiagVec's contents are destroyed.
+
+ Returns a pointer to a static Local_Segment that is overwritten by the
+ next call; only aepos, bepos, ldiag, hdiag and score are set here.
+
+ NOTE(review): aepos receives the column index (mxj, an index into B)
+ and bepos the row index (mxi, an index into A) -- presumably the
+ caller passes the sequences swapped or swaps the fields back;
+ confirm. */
+Local_Segment *TraceForwardPath
+( char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ int const mid, int lo, int hi)
+{ static Local_Segment rez;
+ int *V;
+ int mxv, mxl, mxr, mxi, mxj;
+ int i, j;
+ int *Base1, *Base2;
+
+ Base1 = ((int *) DiagVec);
+ Base2 = Base1 + (Blen+1);
+
+ /* Set basis from (mid,lo) .. (mid,hi) */
+
+ V = Base1;
+ if (lo < 0) lo = 0;
+ if (hi > Blen) hi = Blen;
+
+ for (j = lo; j <= hi; j++)
+ V[j] = 0;
+ /* Let the start row run up to MAXIGAP columns further, at diffcost each. */
+ hi += MAXIGAP;
+ if (hi > Blen) hi = Blen;
+ for (; j <= hi; j++)
+ V[j] = V[j-1] - diffcost;
+ /* mxv is the best score so far, found at (mxi,mxj); mxl and mxr are the
+ lowest and highest value of (row - column) the active range has had. */
+ mxv = 0;
+ mxr = mid - lo;
+ mxl = mid - hi;
+ mxi = mid;
+ mxj = lo;
+
+ /* Advance to next row */
+
+ for (i = mid; lo <= hi && i < Alen; i++)
+ { int c, v;
+ int *W;
+
+ /* W is the row just finished, V the row being filled. */
+ W = V;
+ if (V == Base1)
+ V = Base2;
+ else
+ V = Base1;
+
+ v = W[lo];
+ c = V[lo] = v - diffcost;
+ for (j = lo+1; j <= hi; j++)
+ { int r, t;
+
+ /* t: cell to the left in this row, c: diagonal cell, v: cell above. */
+ t = c;
+ c = v;
+ v = W[j];
+ if (Map[(int)A[i]] == Map[(int)B[j-1]] && Map[(int) (A[i])] >= 0)
+ c += MATCHCOST;
+
+ r = c;
+ if (v > r) r = v;
+ if (t > r) r = t;
+
+ V[j] = c = r - diffcost;
+ if (c >= mxv)
+ { mxv = c;
+ mxi = i+1;
+ mxj = j;
+ //printf("reset mxv = %d at [%d,%d]\n",mxv,mxi,mxj);
+ }
+ }
+
+ /* One column past the old range: there is no cell above it, so only
+ the diagonal and left neighbours compete. */
+ if (j <= Blen)
+ { int r;
+
+ if (Map[(int)A[i]] == Map[(int)B[j-1]] && Map[(int) (A[i])] >= 0)
+ v += MATCHCOST;
+
+ r = v;
+ if (c > r) r = c;
+
+ V[j] = v = r - diffcost;
+ if (v > mxv)
+ { mxv = v;
+ mxi = i+1;
+ mxj = j;
+ //printf("reset mxv = %d at [%d,%d]\n",mxv,mxi,mxj);
+ }
+
+ /* Keep extending to the right until the score drops too far. */
+ for (j++; j <= Blen; j++)
+ { v -= diffcost;
+ if (v < mxv - BLOCKCOST) break;
+ V[j] = v;
+ }
+ }
+
+ hi = j-1;
+
+ /* Trim columns that have fallen more than BLOCKCOST below the best. */
+ while (lo <= hi && V[lo] < mxv - BLOCKCOST)
+ lo += 1;
+ while (lo <= hi && V[hi] < mxv - BLOCKCOST)
+ hi -= 1;
+
+ if ((i+1) - lo > mxr)
+ mxr = (i+1) - lo;
+ if ((i+1) - hi < mxl)
+ mxl = (i+1) - hi;
+ }
+
+ rez.aepos = mxj;
+ rez.bepos = mxi;
+ rez.ldiag = mxl;
+ rez.hdiag = mxr;
+ rez.score = mxv;
+ return (&rez);
+}
+
+/* TraceReversePath: mirror image of TraceForwardPath. Starting on row `top`,
+ columns [lo,hi] (clamped to [0,Blen]), it walks rows of A downwards
+ (i = top-1, top-2, ...) until the live column band is empty or row 0 has
+ been processed.
+
+ The drop-off threshold is `xfactor` until row `bot` has been processed
+ (or immediately, when top-1 <= bot), after which BLOCKCOST is used.
+
+ Only abpos, bbpos, ldiag, hdiag and score of the result are filled in.
+ abpos receives the best column and bbpos the best row. The result lives
+ in a static record, so it is overwritten by the next call.
+*/
+Local_Segment *TraceReversePath(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ int const top, int lo, int hi,
+ int const bot, int xfactor) {
+ static Local_Segment rez;
+ int *V;
+ int mxv, mxl, mxr, mxi, mxj;
+ int i, j;
+ int *Base1, *Base2;
+
+ /* Two row buffers of Blen+1 ints each, carved out of the DiagVec storage. */
+ Base1 = ((int *) DiagVec);
+ Base2 = Base1 + (Blen+1);
+
+ /* Set basis from (top,lo) .. (top,hi) */
+
+ V = Base1;
+ if (lo < 0) lo = 0;
+ if (hi > Blen) hi = Blen;
+
+ for (j = hi; j >= lo; j--)
+ V[j] = 0;
+ lo -= MAXIGAP; /* widen the start band leftwards by up to MAXIGAP columns */
+ if (lo < 0) lo = 0;
+ for (; j >= lo; j--)
+ V[j] = V[j+1] - diffcost;
+ mxv = 0;
+ mxr = top - lo; /* highest (row - column) diagonal touched so far */
+ mxl = top - hi; /* lowest (row - column) diagonal touched so far */
+ mxi = top;
+ mxj = lo;
+
+ /* Advance to next row */
+
+ if (top-1 <= bot) xfactor = BLOCKCOST;
+
+ for (i = top-1; lo <= hi && i >= 0; i--)
+ { int c, v;
+ int *W;
+
+ /* W is the previous row, V becomes the row being filled. */
+ W = V;
+ if (V == Base1)
+ V = Base2;
+ else
+ V = Base1;
+
+
+ v = W[hi];
+ c = V[hi] = v - diffcost;
+ for (j = hi-1; j >= lo; j--)
+ { int r, t;
+
+ t = c; /* right neighbour in the new row, V[j+1] */
+ c = v; /* diagonal neighbour in the old row, W[j+1] */
+ v = W[j]; /* same column in the old row */
+ if (Map[(int)A[i]] == Map[(int)B[j]] && Map[(int) (A[i])] >= 0)
+ c += MATCHCOST;
+
+ r = c;
+ if (v > r) r = v;
+ if (t > r) r = t;
+
+ V[j] = c = r - diffcost;
+ if (c >= mxv)
+ { mxv = c;
+ mxi = i;
+ mxj = j;
+ //printf("reset mxv = %d at [%d,%d]\n",mxv,mxi,mxj);
+ }
+ }
+
+ /* One column before the old band: only the diagonal and right neighbours exist. */
+ if (j >= 0)
+ { int r;
+
+ if (Map[(int)A[i]] == Map[(int)B[j]] && Map[(int) (A[i])] >= 0)
+ v += MATCHCOST;
+
+ r = v;
+ if (c > r) r = c;
+
+ V[j] = v = r - diffcost;
+ if (v > mxv)
+ { mxv = v;
+ mxi = i;
+ mxj = j;
+ //printf("reset mxv = %d at [%d,%d]\n",mxv,mxi,mxj);
+ }
+
+ /* Keep extending leftwards by pure gaps until the score drops too far. */
+ for (j--; j >= 0; j--)
+ { v -= diffcost;
+ if (v < mxv - xfactor) break;
+ V[j] = v;
+ }
+ }
+
+ lo = j+1;
+
+ /* Trim both ends of the band that are more than xfactor below the best. */
+ while (lo <= hi && V[lo] < mxv - xfactor)
+ lo += 1;
+ while (lo <= hi && V[hi] < mxv - xfactor)
+ hi -= 1;
+
+ if (i == bot) xfactor = BLOCKCOST;
+
+ if (i-lo > mxr)
+ mxr = i-lo;
+ if (i-hi < mxl)
+ mxl = i-hi;
+ }
+
+ rez.abpos = mxj;
+ rez.bbpos = mxi;
+ rez.ldiag = mxl;
+ rez.hdiag = mxr;
+ rez.score = mxv;
+ return (&rez);
+}
+
+
+/*** MERGING INDEX HITS INTO TRAPEZOIDAL ZONES ***/
+
+/* Build_Trapezoids: merges the Hitlen hit records in `list` into trapezoidal
+ zones, cuts those zones where B and then A contain runs of at least
+ MAXIGAP characters that Map[] rejects (code < 0), and returns the zones as
+ a list linked through `next`, with the number of zones in *Traplen.
+
+ `free` here is a function-local static free list of Trapezoid nodes (it
+ hides the C library free() inside this function). Before returning, the
+ returned list is itself chained back onto that free list, so its nodes
+ are reused by the next call and the result is only valid until then.
+
+ NOTE(review): the merge loop compares each hit's bstart against the `top`
+ of the open trapezoids, so it presumably expects `list` ordered by bstart
+ -- confirm against HSORT.
+*/
+static Trapezoid *Build_Trapezoids(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ HitRecord const * const list, int const Hitlen, int * const Traplen) {
+ static Trapezoid *free = NULL;
+
+ Trapezoid *traporder, *traplist, *tailend;
+ Trapezoid *b, *f, *t;
+ int i, inserted;
+ int trapcount, traparea;
+
+ trapcount = 0;
+ traparea = 0; /* accumulated below but never read in this function */
+ traporder = NULL;
+ traplist = NULL;
+
+ /* Phase 1: sweep the hits, keeping the open trapezoids in `traporder`
+ (walked left to right by diagonal) and retiring finished ones onto
+ `traplist`. f trails b as the previous open trapezoid. */
+ for (i = 0; i < Hitlen; i++) {
+ inserted = 0;
+ f = NULL;
+ for (b = traporder; b != NULL; b = t) {
+ t = b->next;
+ if (b->top < list[i].bstart - BPADDING) {
+ /* b ended too far before this hit: retire it. */
+ trapcount += 1;
+ traparea += (b->top - b->bot + 1) * (b->rgt - b->lft + 1);
+ if (f == NULL)
+ traporder = t;
+ else
+ f->next = t;
+ b->next = traplist;
+ traplist = b;
+ }
+ else if (list[i].diagonal > b->rgt + DPADDING)
+ f = b;
+ else if (list[i].diagonal >= b->lft - DPADDING) {
+ /* The hit falls within DPADDING of b: grow b to cover it. */
+ if (list[i].diagonal < b->lft)
+ b->lft = list[i].diagonal;
+ if (list[i].diagonal > b->rgt)
+ b->rgt = list[i].diagonal;
+ if (list[i].bfinish > b->top)
+ b->top = list[i].bfinish;
+
+ /* After growing, b may touch its neighbour: fold the two together. */
+ if (f != NULL && f->rgt + DPADDING >= b->lft) {
+ f->rgt = b->rgt;
+ if (f->bot > b->bot) f->bot = b->bot;
+ if (f->top < b->top) f->top = b->top;
+ f->next = t;
+ b->next = free;
+ free = b;
+ }
+ else if (t != NULL && t->lft - DPADDING <= b->rgt) {
+ b->rgt = t->rgt;
+ if (b->bot > t->bot) b->bot = t->bot;
+ if (b->top < t->top) b->top = t->top;
+ b->next = t->next;
+ t->next = free;
+ free = t;
+ t = b->next;
+ f = b;
+ }
+ else
+ f = b;
+ inserted = 1;
+ } else if (! inserted) {
+ /* The hit lies left of b and matched nothing: insert a new trapezoid before b. */
+ if (free == NULL) {
+ free = (Trapezoid *)malloc(sizeof(Trapezoid));
+ if (free == NULL)
+ OutOfMemory("Trapezoid scan list");
+ free->next = NULL;
+ }
+ if (f == NULL)
+ f = traporder = free;
+ else
+ f = f->next = free;
+ free = f->next;
+ f->next = b;
+ f->top = list[i].bfinish;
+ f->bot = list[i].bstart;
+ f->lft = f->rgt = list[i].diagonal;
+ f = b;
+ inserted = 1;
+ } else
+ f = b;
+ }
+ if (! inserted) {
+ /* The hit is right of every open trapezoid: append a new one (b is NULL here). */
+ if (free == NULL) {
+ free = (Trapezoid *)malloc(sizeof(Trapezoid));
+ if (free == NULL)
+ OutOfMemory("Trapezoid scan list");
+ free->next = NULL;
+ }
+ if (f == NULL)
+ f = traporder = free;
+ else
+ f = f->next = free;
+ free = f->next;
+ f->next = b;
+ f->top = list[i].bfinish;
+ f->bot = list[i].bstart;
+ f->lft = f->rgt = list[i].diagonal;
+ }
+ }
+
+ /* Retire whatever is still open. */
+ for (b = traporder; b != NULL; b = t) {
+ t = b->next;
+ trapcount += 1;
+ traparea += (b->top - b->bot + 1) * (b->rgt - b->lft + 1);
+ b->next = traplist;
+ traplist = b;
+ }
+
+ {
+ int lag, lst, lclip;
+ int abot, atop;
+
+ /* Phase 2: cut each trapezoid along B. `lag` is the position just
+ after the last character of B with a Map[] code >= 0, so i-lag is
+ the length of the run of rejected characters ending before i. */
+ for (b = traplist; b != NULL; b = b->next) {
+ lag = (b->bot-MAXIGAP)+1;
+ if (lag < 0) lag = 0;
+ lst = b->top+MAXIGAP;
+ if (lst > Blen)
+ lst = Blen;
+
+ for (i = lag; i < lst; i++) {
+ if (Map[(int) (B[i])] >= 0) {
+ if (i-lag >= MAXIGAP) {
+ if (lag - b->bot > 0) {
+ /* Split: b keeps [bot,lag], a copy continues from i. */
+ if (free == NULL) {
+ free = (Trapezoid *)malloc(sizeof(Trapezoid));
+ if (free == NULL)
+ OutOfMemory("Trapezoid cutter");
+ free->next = NULL;
+ }
+ t = free->next;
+ *free = *b;
+ b->next = free;
+ free = t;
+ b->top = lag;
+ b = b->next;
+ b->bot = i;
+ trapcount += 1;
+ }
+ else
+ b->bot = i;
+ }
+ lag = i+1;
+ }
+ }
+ if (i-lag >= MAXIGAP)
+ b->top = lag;
+ }
+
+ /* Phase 3: the same cut along the A projection of each trapezoid. */
+ tailend = NULL;
+ for (b = traplist; b != NULL; b = b->next) {
+ if (b->top - b->bot < KMERLEN) {
+ /* Fix: a skipped node may still be the last node of the list, and
+ tailend must end up on the true tail so that the splice onto the
+ free list below neither truncates the returned list nor leaves it
+ unrecycled. */
+ tailend = b;
+ continue;
+ }
+
+ abot = b->bot - b->rgt;
+ atop = b->top - b->lft;
+
+
+ lag = (abot - MAXIGAP) + 1;
+ if (lag < 0) lag = 0;
+ lst = atop + MAXIGAP;
+ if (lst > Alen) lst = Alen;
+
+ lclip = abot;
+ for (i = lag; i < lst; i++) {
+ if (Map[(int) (A[i])] >= 0) {
+ if (i-lag >= MAXIGAP) {
+ if (lag > lclip) {
+ if (free == NULL) {
+ free = (Trapezoid *)malloc(sizeof(Trapezoid));
+ if (free == NULL)
+ OutOfMemory("Trapezoid cutter");
+ free->next = NULL;
+ }
+ t = free->next;
+ *free = *b;
+ b->next = free;
+ free = t;
+
+ /* Clip b to the A range [lclip,lag]. */
+ {
+ int x, m;
+ x = lclip + b->lft;
+ if (b->bot < x)
+ b->bot = x;
+ x = lag + b->rgt;
+ if (b->top > x)
+ b->top = x;
+ m = (b->bot + b->top) / 2;
+ x = m - lag;
+ if (b->lft < x)
+ b->lft = x;
+ x = m - lclip;
+ if (b->rgt > x)
+ b->rgt = x;
+ }
+
+ b = b->next;
+ trapcount += 1;
+ }
+ lclip = i;
+ }
+ lag = i+1;
+ }
+ }
+
+ if (i-lag < MAXIGAP)
+ lag = atop;
+
+ /* Clip the last (or only) piece to the A range [lclip,lag]. */
+ {
+ int x, m;
+ x = lclip + b->lft;
+ if (b->bot < x)
+ b->bot = x;
+ x = lag + b->rgt;
+ if (b->top > x)
+ b->top = x;
+ m = (b->bot + b->top) / 2;
+ x = m - lag;
+ if (b->lft < x)
+ b->lft = x;
+ x = m - lclip;
+ if (b->rgt > x)
+ b->rgt = x;
+ }
+
+ tailend = b;
+ }
+ }
+
+ /* Hand the whole result list back to the free list for reuse next call. */
+ if (tailend != NULL) {
+ tailend->next = free;
+ free = traplist;
+ }
+
+ *Traplen = trapcount;
+ return (traplist);
+}
+
+
+/*** FINDING ALIGNMENTS WITHIN A TRAPEZOIDAL ZONE ***/
+
+/* qsort comparator for an array of Trapezoid pointers: orders by `bot`. */
+static int TSORT(const void *l, const void *r) {
+ Trapezoid *lhs = *((Trapezoid **) l);
+ Trapezoid *rhs = *((Trapezoid **) r);
+ return (lhs->bot - rhs->bot);
+}
+
+/* qsort comparator for Local_Segment records: orders by start point,
+ abpos first and bbpos as the tie-breaker. */
+static int StSORT(const void *l, const void *r) {
+ Local_Segment *lhs = (Local_Segment *) l;
+ Local_Segment *rhs = (Local_Segment *) r;
+ if (lhs->abpos != rhs->abpos)
+ return ((lhs->abpos < rhs->abpos) ? -1 : 1);
+ return (lhs->bbpos - rhs->bbpos);
+}
+
+/* qsort comparator for Local_Segment records: orders by end point,
+ aepos first and bepos as the tie-breaker. */
+static int FnSORT(const void *l, const void *r) {
+ Local_Segment *lhs = (Local_Segment *) l;
+ Local_Segment *rhs = (Local_Segment *) r;
+ if (lhs->aepos != rhs->aepos)
+ return ((lhs->aepos < rhs->aepos) ? -1 : 1);
+ return (lhs->bepos - rhs->bepos);
+}
+
+static Trapezoid **Tarray = NULL; /* trapezoid pointers; (re)allocated and filled by Align_Trapezoids */
+static int *Covered; /* per-trapezoid flags; points into the tail of the Tarray allocation */
+static Local_Segment *SegSols = NULL; /* growable array of the segments found so far */
+static int SegMax = -1; /* capacity of SegSols; negative until the first allocation */
+static int NumSegs; /* number of entries currently in use in SegSols */
+
+/* Align_Recursion: looks for one local alignment through the middle row of
+ trapezoid b and then recurses on what is left of b below and above it.
+
+ A forward extension from the mid-row gives the end point; reverse
+ extensions from that end point, retried with a growing drop-off allowance,
+ give the start point. The Trace* routines are called with A and B
+ swapped, so the a*pos fields they return index A and the b*pos fields
+ index B.
+
+ A segment is appended to SegSols when both of its spans are at least
+ MinLen and its error estimate is at most MaxDiff. Trapezoids after
+ `current` in Tarray that are more than 99% inside the segment's extent
+ are marked Covered. When comp is nonzero the B coordinates and diagonals
+ are mapped back from the complemented sequence.
+*/
+static void Align_Recursion(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ Trapezoid const * const b, int const current, int const comp,
+ int const MinLen, double const MaxDiff, int const Traplen) {
+ int j, mid, indel;
+ double pcnt;
+ Local_Segment *hend, *lend;
+ Trapezoid ltrp, htrp;
+
+ mid = (b->bot + b->top) / 2;
+
+ lend = TraceForwardPath(B,Blen,A,Alen,mid,mid-b->rgt,mid-b->lft);
+
+ {
+ int x = 0;
+
+ /* Retry the reverse extension with a larger drop-off allowance while it
+ stops short (above mid + x*MAXIGAP) with a lower score than the
+ forward extension. */
+ do {
+ x += 1;
+
+ hend = TraceReversePath(B,Blen,A,Alen,
+ lend->bepos,lend->aepos,lend->aepos,
+ mid+MAXIGAP,BLOCKCOST+2*x*diffcost);
+ } while (hend->bbpos > mid + x*MAXIGAP && hend->score < lend->score);
+
+ /* hend now carries the full segment: start from the reverse pass,
+ end copied from the forward pass. */
+ hend->aepos = lend->aepos;
+ hend->bepos = lend->bepos;
+ }
+
+
+ /* Remainders of b below the segment start (ltrp) and above its end (htrp). */
+ ltrp = htrp = *b;
+ ltrp.top = min(b->top,hend->bbpos) - MAXIGAP;
+
+
+ htrp.bot = max(b->bot,hend->bepos) + MAXIGAP;
+
+ if (hend->bepos - hend->bbpos >= MinLen &&
+ hend->aepos - hend->abpos >= MinLen) {
+
+ /* indel is computed here but not used afterwards in this function. */
+ indel = abs( (hend->abpos - hend->bbpos)
+ - (hend->aepos - hend->bepos) );
+
+ /* Error estimate: shortfall of the score from a perfect B-span,
+ expressed in units of MATCHCOST per base. */
+ pcnt = (-hend->score+samecost*(hend->bepos-hend->bbpos))*1./
+ (1.*(MATCHCOST)*(hend->bepos-hend->bbpos));
+
+ if (pcnt <= MaxDiff) {
+ hend->error = pcnt;
+
+ /* Mark later trapezoids that this segment covers almost entirely. */
+ for (j = current+1; j < Traplen; j++)
+ { Trapezoid *t;
+ int ta, tb, ua, ub;
+
+ t = Tarray[j];
+ if (t->bot >= hend->bepos) break;
+
+ tb = t->top - t->bot + 1;
+ ta = t->rgt - t->lft + 1;
+ if (t->lft < hend->ldiag)
+ ua = hend->ldiag;
+ else
+ ua = t->lft;
+ if (t->rgt > hend->hdiag)
+ ub = hend->hdiag;
+ else
+ ub = t->rgt;
+
+ if (ua > ub) continue;
+
+ ua = ub - ua + 1;
+ if (t->top > hend->bepos)
+ ub = hend->bepos - t->bot + 1;
+ else
+ ub = tb;
+
+ if (((1.*ua)/ta)*((1.*ub)/tb) > .99)
+ Covered[j] = 1;
+ }
+
+ if (NumSegs >= SegMax)
+ { SegMax = (int)(1.2*NumSegs) + 500;
+ SegSols = (Local_Segment *) realloc(SegSols,
+ sizeof(Local_Segment)*SegMax);
+ if (SegSols == NULL)
+ OutOfMemory("Segment Alignment array");
+ }
+
+ { int d;
+
+ d = hend->hdiag; /* Oops, diags to this point are b-a, not a-b. */
+ hend->hdiag = - (hend->ldiag);
+ hend->ldiag = - d;
+ if (comp)
+ { hend->bbpos = Blen - hend->bbpos;
+ hend->bepos = Blen - hend->bepos;
+ hend->ldiag = Blen + hend->ldiag;
+ hend->hdiag = Blen + hend->hdiag;
+ }
+ }
+
+ SegSols[NumSegs++] = *hend;
+ }
+ }
+
+ /* Fix: pass the address of ltrp. The archived text read "<rp" here, the
+ address-of operator and the leading 'l' having been lost. */
+ if (ltrp.top - ltrp.bot > MinLen && ltrp.top < b->top - MAXIGAP)
+ Align_Recursion(A,Alen,B,Blen,& ltrp,current,comp,MinLen,MaxDiff,Traplen);
+
+ if (htrp.top - htrp.bot > MinLen)
+ Align_Recursion(A,Alen,B,Blen,&htrp,current,comp,MinLen,MaxDiff,Traplen);
+}
+
+
+
+/* Align_Trapezoids: collects the Traplen trapezoids of Traplist into Tarray,
+ sorts them by `bot`, and runs Align_Recursion on every trapezoid that is
+ not marked Covered and is at least KMERLEN tall. The segments found by
+ this call are then pruned in two sorted passes and compacted.
+
+ If `start` is nonzero NumSegs is reset to 0 first; otherwise the new
+ segments are appended after those already in SegSols. Returns SegSols
+ and stores the final NumSegs in *Seglen. SegSols is shared static
+ storage that later calls reuse.
+*/
+static Local_Segment *Align_Trapezoids(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ Trapezoid const * const Traplist, int const Traplen,
+ int const start, int const comp,
+ int const MinLen, double const MaxDiff, int * const Seglen) {
+ static int fseg;
+ static int TarMax = -1;
+
+ /* One allocation holds TarMax trapezoid pointers followed by TarMax ints
+ for the Covered flags. */
+ if (Traplen >= TarMax) {
+ TarMax = (int)(1.2*Traplen) + 500;
+ Tarray = (Trapezoid **)
+ realloc(Tarray,(sizeof(Trapezoid *) + sizeof(int))*TarMax);
+ if (Tarray == NULL)
+ OutOfMemory("Trapezoid array");
+ Covered = (int *) (Tarray + TarMax);
+ }
+ if (SegMax < 0) {
+ SegMax = 1000;
+ SegSols = (Local_Segment *) malloc(sizeof(Local_Segment)*SegMax);
+ if (SegSols == NULL)
+ OutOfMemory("Segment Alignment array");
+ }
+
+ {
+ Trapezoid * b = (Trapezoid *)Traplist;
+ int i;
+ for (i = 0; i < Traplen; i++) {
+ Tarray[i] = b;
+ Covered[i] = 0;
+ b = b->next;
+ }
+ }
+
+ qsort(Tarray,Traplen,sizeof(Trapezoid *),TSORT);
+
+ if (start)
+ NumSegs = 0;
+ fseg = NumSegs; /* first segment belonging to this call */
+ {
+ int i;
+ for (i = 0; i < Traplen; i++)
+ if (! Covered[i]) {
+ Trapezoid * b = Tarray[i];
+ if (b->top - b->bot < KMERLEN) continue;
+ //printf("Trying hit %d\n",i);
+ Align_Recursion(A,Alen,B,Blen,b,i,comp,MinLen,MaxDiff,Traplen);
+ }
+ }
+
+ if (NumSegs > fseg) {
+ int i;
+ int j=0;
+
+ /* Pass 1: among segments with the same start point and orientation keep
+ one survivor (index i) and mark the others with score -1. When both
+ candidates are within MaxDiff the longer one wins; otherwise j wins
+ only if it is within MaxDiff. */
+ qsort(SegSols+fseg,NumSegs-fseg,sizeof(Local_Segment),StSORT);
+ assert(j==0);
+ for (i = fseg; i < NumSegs; i = j) {
+ for (j = i+1; j < NumSegs; j++) {
+
+ if (SegSols[j].abpos != SegSols[i].abpos) break;
+ if (SegSols[j].bbpos != SegSols[i].bbpos) break;
+ if (/* segments in opposite orientations */
+ ((SegSols[j].bepos-SegSols[j].bbpos) > 0 &&
+ (SegSols[i].bepos-SegSols[i].bbpos) < 0 )
+ ||
+ ((SegSols[j].bepos-SegSols[j].bbpos) < 0 &&
+ (SegSols[i].bepos-SegSols[i].bbpos) > 0 ) )break;
+
+ if (SegSols[j].error <= MaxDiff &&
+ SegSols[i].error <= MaxDiff){
+
+ if (abs(SegSols[i].bepos-SegSols[i].bbpos)+abs(SegSols[i].aepos-SegSols[i].abpos) <
+ abs(SegSols[j].bepos-SegSols[j].bbpos)+abs(SegSols[j].aepos-SegSols[j].abpos)) {
+ SegSols[i].score=-1;i=j;
+ } else {
+ SegSols[j].score=-1;
+ }
+ } else {
+ if(SegSols[j].error<=MaxDiff){
+ SegSols[i].score=-1;
+ i=j;
+ } else {
+ SegSols[j].score=-1;
+ }
+ }
+ }
+ }
+
+ /* Pass 2: after sorting by end point, keep the higher-scoring segment
+ of each run.
+ NOTE(review): the run test below still compares the start fields
+ (abpos/bbpos) although the array is now ordered by aepos/bepos --
+ confirm whether aepos/bepos was intended. */
+ qsort(SegSols+fseg,NumSegs-fseg,sizeof(Local_Segment),FnSORT);
+ for ( i = fseg; i < NumSegs; i = j) {
+ for (j = i+1; j < NumSegs; j++) {
+ if (SegSols[j].abpos != SegSols[i].abpos) break;
+ if (SegSols[j].bbpos != SegSols[i].bbpos) break;
+ if (SegSols[j].score > SegSols[i].score) {
+ SegSols[i].score = -1;
+ i = j;
+ } else
+ SegSols[j].score = -1;
+ }
+ }
+
+ /* Compact away every segment whose score was set negative above. */
+ for (i = fseg; i < NumSegs; i++)
+ if (SegSols[i].score >= 0)
+ SegSols[fseg++] = SegSols[i];
+ NumSegs = fseg;
+ }
+
+ *Seglen = NumSegs;
+ return (SegSols);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+/*** MASTER ROUTINE ***/
+
+/* Find_Local_Segments: entry point. Sets the scoring costs from MaxDiff,
+ (re)sizes the shared work buffers when either sequence has outgrown them,
+ indexes A with TableBuild, and then runs the hit -> trapezoid -> segment
+ pipeline against B (unless Action is LOCAL_REVR) and against the reverse
+ complement of B (unless Action is LOCAL_FORW).
+
+ Returns the segment array produced by Align_Trapezoids and stores its
+ length in *Seglen. The array is static storage reused by later calls.
+*/
+Local_Segment *Find_Local_Segments(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ int const Action,
+ int const MinLen, double const MaxDiff, int * const Seglen) {
+ static int DagMax = -1;
+ static int BseqLen = -1;
+
+ int numhit = 0;
+ HitRecord *hits = 0L;
+ int numtrap = 0;
+ Trapezoid *traps = 0L;
+ int numseg = 0;
+ Local_Segment *segs = 0L;
+
+
+ // The costs below are derived from MaxDiff so that nearly all
+ // extensions terminate within the user-defined error rate; this
+ // means we don't completely miss a high-quality segment that is
+ // part of a larger segment of lower quality--e.g., we could find
+ // more-conserved portions of a repeat even if the repeat as a
+ // whole was lower fidelity, due to varying selectional pressure at
+ // different positions; the down-side to this is that we won't
+ // chain together perfect matches across a few bases of
+ // lower-fidelity sequence.
+ //
+ // NOTE(review): the original wording ("Defining this ...", "Leaving
+ // it undefined uses the #define values of DIFFCOST and SAMECOST")
+ // referred to a compile-time switch that is not present in this
+ // function as shown; the assignments below are unconditional.
+ //
+ samecost = (int)ceil(100.0 * MaxDiff);
+ diffcost = 100 - samecost;
+ MATCHCOST = samecost + diffcost;
+ BLOCKCOST = diffcost * MAXIGAP;
+
+ if (Action != LOCAL_FORW) {
+ /* Grow the reverse-complement buffer when B is longer than any seen so far. */
+ if(BseqLen<Blen){
+ BrevC = (char*) realloc(BrevC,sizeof(char)*(Blen+1));
+ BseqLen=Blen;
+ if(BrevC==NULL){OutOfMemory("B sequence reverse complement");}
+ }
+ /* assumes B is NUL-terminated and no longer than Blen characters -- TODO confirm */
+ strcpy(BrevC,B);
+ Complement(BrevC,Blen);
+ }
+
+ if (Alen >= DagMax || Blen >= DagMax) {
+ /* Kmask < 0 marks the very first call: build the character map and
+ allocate the k-mer table once. */
+ if (Kmask < 0) {
+ int i;
+ for (i = 0; i < 128; i++)
+ Map[i] = -1;
+ Map[(int)'a'] = Map[(int)'A'] = 0;
+ Map[(int)'c'] = Map[(int)'C'] = 1;
+ Map[(int)'g'] = Map[(int)'G'] = 2;
+ Map[(int)'t'] = Map[(int)'T'] = 3;
+ Kmask = (1 << (2*KMERLEN)) - 1;
+ Table = (int *)malloc(sizeof(int)*(Kmask+2));
+ if (Table == NULL)
+ OutOfMemory("K-mer index");
+ }
+ if (Alen > Blen)
+ DagMax = (int)(1.2*Alen) + 5000;
+ else
+ DagMax = (int)(1.2*Blen) + 5000;
+ /* Round DagMax up to a multiple of sizeof(DiagRecord). */
+ DagMax += sizeof(DiagRecord) - (DagMax % sizeof(DiagRecord));
+ /* One block: DagMax ints for Tuples, then the diagonal records.
+ DiagVec points DagMax+1 records into the second part so that it
+ can be indexed with negative diagonals. */
+ Tuples = (int *)realloc(Tuples,sizeof(int)*DagMax +
+ sizeof(DiagRecord)*(2*DagMax+MAXERROR+1));
+ if (Tuples == NULL)
+ OutOfMemory("K-mer index");
+ DiagVec = ((DiagRecord *) (Tuples + DagMax)) + (DagMax+1);
+ }
+
+ TableBuild(A,Alen);
+
+ /* start is nonzero only for the first Align_Trapezoids call, which then
+ resets the segment count; a second call appends. */
+ int start = 1;
+
+ if (Action != LOCAL_REVR) {
+ hits = Find_Hits(A,Alen,B,Blen,&numhit);
+ traps = Build_Trapezoids(A,Alen,B,Blen,hits,numhit,&numtrap);
+ segs = Align_Trapezoids(A,Alen,B,Blen,traps,numtrap,
+ start,0,MinLen,MaxDiff,&numseg);
+ start = 0;
+ }
+
+ if (Action != LOCAL_FORW) {
+ hits = Find_Hits(A,Alen,BrevC,Blen,&numhit);
+ traps = Build_Trapezoids(A,Alen,BrevC,Blen,hits,numhit,&numtrap);
+ segs = Align_Trapezoids(A,Alen,BrevC,Blen,traps,numtrap,
+ start,1,MinLen,MaxDiff,&numseg);
+ }
+
+ *Seglen = numseg;
+ return (segs);
+}
diff --git a/atac-driver/chainer/localalign/GF_ALN_local.H b/atac-driver/chainer/localalign/GF_ALN_local.H
new file mode 100644
index 0000000..04a61e7
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_local.H
@@ -0,0 +1,274 @@
+/**************************************************************************
+ * This file is part of A2Amapper.
+ * Copyright (c) 2004 Applera Corporation
+ * Author: Clark Mobarry
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received (LICENSE.txt) a copy of the GNU General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+**************************************************************************/
+
+
+#ifndef CA_ALN_local_h
+#define CA_ALN_local_h
+
+#include <stdio.h>
+
+
+/* ckalloc: malloc() wrapper that asserts a nonzero size and a non-NULL
+ result. The allocation is performed outside assert() so that it still
+ happens when NDEBUG disables assertions; in that configuration a failed
+ allocation is returned to the caller as NULL.
+*/
+inline
+void *
+ckalloc(size_t size) {
+ void *newp = NULL;
+ assert(size>0);
+ newp = malloc(size);
+ assert(NULL != newp);
+ return(newp);
+}
+
+/* ckrealloc: realloc() wrapper that asserts a nonzero size and a non-NULL
+ result. The reallocation is performed outside assert() so that it still
+ happens when NDEBUG disables assertions; in that configuration a failed
+ reallocation is returned to the caller as NULL (ptr is then still valid).
+*/
+inline
+void *
+ckrealloc(void* ptr, size_t size) {
+ void *newp = NULL;
+ assert(size>0);
+ newp = realloc(ptr, size);
+ assert(NULL != newp);
+ return(newp);
+}
+
+
+
+
+
+
+/* O(kn) identity-based alignment algorithm. Find alignment between
+ a and b (of lengths alen and blen), that begins at finishing
+ boundary position *spnt. Return at *spnt the diagonal at which the
+ alignment starts.
+*/
+int *AS_ALN_OKNAlign(const char *a, int alen,
+ const char *b, int blen,
+ int *spnt,
+ int diff);
+
+
+
+/* Local alignment record:
+ Coordinates are in terms of the d.p. matrix that go from (0,0)
+ to (|A|,|B|), where A is the A-sequence argument and B is the
+ B-sequence argument. A coordinate is a position between chars
+ of the sequence. For example, an alignment from (3,5) to (6,9)
+ aligns characters 4-5 of A with characters 7-9 of B.
+
+ If the B start coordinate is greater than the B end coordinate
+ then the alignment is of A versus the complement of B, i.e. the
+ alignment runs along an anti-diagonal, not a diagonal.
+
+ Diagonal k is the set of coordinates (a,b) s.t. a-b = k.
+ Anti-diagonal k is the set of coordinates (a,b) s.t. a+b = k.
+*/
+
+typedef struct {
+ int abpos, bbpos; /* Start coordinate of local alignment */
+ int aepos, bepos; /* End coordinate of local alignment */
+ int ldiag, hdiag; /* Alignment is between (anti)diagonals ldiag & hdiag */
+ int score; /* Score of alignment where match = 1, difference = -3
+ NOTE(review): Find_Local_Segments derives its match and
+ difference costs from MaxDiff at run time, so this fixed
+ 1/-3 scale may be out of date -- confirm. */
+ double error; /* Lower bound on error rate of match */
+} Local_Segment;
+
+#define LOCAL_FORW 0 /* Compare A to B only */
+#define LOCAL_REVR 1 /* Compare A to complement(B) only */
+#define LOCAL_BOTH 2 /* Compare A to both B and its complement */
+
+/* Find_Local_Segments compares sequence A of length Alen against sequence B
+ of length Blen, in the same, opposite, or both orientations depending
+ on the setting of Action, and returns a pointer to an array of local
+ alignment records, where the number of such records is in the integer
+ pointed at by Seglen. Find_Local_Segments only reports alignments that are
+ longer than MinLen bps. and contain less than MaxDiff errors as a fraction
+ of the alignment's length. Find_Local_Segments reuses the storage for the
+ array of local alignment segments with each successive call, a user should
+ copy the array if they wish for it to persist beyond a given invocation.
+
+ Find_Local_Segments finds alignments that contain at least 36bp that match
+ at 95% or better. The encompassing local alignment has to match at
+ about 75% or better as matching chars are scored SAMECOST (1) and
+ differences are scored -DIFFCOST (3). Extension of a local alignment
+ in a given direction ends at a cumulative maximum from which all
+ extensions drop by BLOCKCOST (15 = DIFFCOST(3)*MAXIGAP(5)) in score,
+ the equivalent of MAXIGAP(5) consecutive differences.
+
+ Find_Local_Segments is most efficiently applied to large sequences. Any
+ application that is doing an all-against-all of smaller fragment
+ sequences, would best utilize Find_Local_Segments by concatenating the
+ fragments, applying Find_Local_Segments to the large concatenations, and
+ then mapping the local alignments back to the fragments and
+ coordinates to which they pertain. By separating fragments in the
+ concatenation by MAXIGAP+1(6) N's, one guarantees that local alignments
+ do not span the boundaries between fragment sequences.
+
+ Find_Local_Segments builds an index of the A sequence as part of its
+ acceleration method. If successive calls to Find_Local_Segments involve
+ the same A-sequence, this table is built only once, improving
+ efficiency.
+*/
+
+Local_Segment *Find_Local_Segments
+(char const * const A, int const Alen,
+ char const * const B, int const Blen,
+ int const Action,
+ int const MinLen, double const MaxDiff, int * const Seglen);
+
+/* Local_Overlap Record:
+ A local overlap is a chain of Local_Segments computed by
+ Find_Local_Segments that when strung together form an overlap between
+ the two sequences involved. The Local_Overlap record contains the
+ the number of segments, a pointer to an ordered array of the segments
+ in the chain, and the following parameters:
+ score: The score of a chain is the total number of indels required
+ to build an overlap out of the chained elements.
+ begpos,endpos: As for DP_Compare, the diagonals on which the overlap
+ begins and ends.
+ diffs: The number of substitutions and indels required to build an
+ overlap out of the chained elements.
+ length: (|A-seg|+|B-seg|)/2.
+ The field chain points to an array of num_pieces+1 Local_Chain records.
+ Records 0..num_pieces encode the nature of the gap between segments so
+ that record 0 gives the gap to the start border (if any), record
+ num_pieces gives the gap to the finish border (if any), and record i
+ gives the gap between the segment of record i-1 and record i. Record
+ num_pieces does not contain a segment description. The parameters
+ agap and bgap give the delta in the A- and B-coordinates between the
+ end of the previous segment and the start of the next one. The
+ coordinates can be negative and both can be zero only for the boundary
+ gaps (first and last). Each segment is identified by its position in
+ the array of segments passed to the routine, and if the segment was
+ complemented in order to form part of the chain, then and only then is
+ the field reversed set to a non-zero value. The type field gives an
+ indication of the type of the gap as follows:
+
+ LOCAL_BOUNDARY -- if both agap and bgap are zero at a boundary gap
+ then this indication is given.
+ LOCAL_MINOR -- if the a- and b-gaps are less than a user-supplied
+ limit "MinorThresh", then the gap is considered a
+ minor break between two segments of similarity.
+ LOCAL_INDEL -- if the gap in one sequence is minor, but major, positive,
+ and at least 4 times as large in the other, then the
+ gap is considered an indel.
+ LOCAL_REPEAT -- if the gap in one sequence is minor or negative, but
+ major and negative in the other, then the gap is considered a repeat
+ gap in the sense that a tandem repeat must occur in
+ one or both of the sequences around the junction between
+ the two adjacent segments.
+ LOCAL_REPnDEL -- if the gap in one sequence is major and negative, and
+ the other is major and positive, then there is a
+ repeated element on both sides of the sequence with
+ the inserted sequence.
+ LOCAL_DISAGREE -- anything else, i.e. both gap deltas are positive,
+ at least one is major, and if the other is minor
+ then the ratio is less than 1 to 4.
+*/
+
+#define LOCAL_BOUNDARY 0x0 /* No gap, at boundary */
+#define LOCAL_MINOR 0x1 /* Small break in alignment */
+#define LOCAL_DISAGREE 0x2 /* The two sequences significantly disagree */
+#define LOCAL_INDEL 0x3 /* One sequence has missing/added sequence */
+#define LOCAL_REPEAT 0x4 /* A tandem repeat occurs at the junction */
+#define LOCAL_REPnDEL 0x5 /* Both a tandem repeat and an indel */
+
+typedef struct {
+ int agap, bgap; /* A- and B-seq deltas from last segment to this one */
+ short type; /* Type of gap, one of the LOCAL_* constants defined above */
+ short reversed; /* Is segment reversed for inclusion in chain */
+ Local_Segment piece; /* Segment in the chain */
+} Local_Chain;
+
+typedef struct {
+ int begpos; /* Entry diagonal of boundary point (a,b) on which
+ overlap starts, where diagonal = a - b. */
+ int endpos; /* Exit diagonal of boundary point (a,b) on which
+ overlap ends, where diagonal = (|B|-b) - (|A|-a) */
+ int length; /* Length of overlap (|A|+|B|)/2 */
+ int diffs; /* Estimated number of differences in overlap */
+ int comp; /* B sequence was complemented for this comparison */
+ int indif; /* Estimated number of diffs in segments of overlap */
+ int score; /* Sum of all gap lengths */
+ int num_pieces; /* # of segments in overlap chain */
+ int next; /* for iteration through the chain - CMM */
+ Local_Chain *chain; /* chain[0..num_pieces] describe each gap between
+ local segments in the overlap chain */
+} Local_Overlap;
+
+
+
+
+
+
+
+
+/* Find_Local_Overlap takes an array of local alignments as returned by
+ Compare_Local and finds the best scoring local overlap between the
+ underlying sequences. One must pass in the length of the two sequences
+ from which Compare_Local produced the local alignments as well as the
+ number of local alignments in the array. If the parameter comp is nonzero
+ then the comparison will effectively be between A and the complement of B.
+ Normally, the parameter nextbest is zero -- after such a call, a second
+ alternate overlap, third alternate, and so on can be generated by
+ subsequent calls with nextbest set to a nonzero value. The alternates
+ are the best scoring overlaps that starts with a segment not in any
+ previous overlap. The best overlap is returned as a pointer to local
+ overlap structure described above. Unlike many of my routines, the
+ reclamation of the storage for this data structure is the responsibility
+ of the caller and requires simply calling free on it, as the entire
+ structure, including the chain array, is in a single memory block. The
+ parameter MinorThresh determines whether a gap delta is consider minor or
+ major (see the description above on gap types). An overlap is returned
+ only if the ratio of the difference to the length of the overlap is less
+ than GapThresh, otherwise NULL is returned.
+*/
+
+Local_Overlap *Find_Local_Overlap(int Alen, int Blen, int comp, int nextbest,
+ Local_Segment *Segs, int NumSegs,
+ int MinorThresh, double GapThresh);
+
+
+
+
+
+
+
+
+/* Create a trace to be interpreted as with DP_Compare_AS, but based
+ on a Local_Overlap record. A Local_Segment within the overlap
+ will be aligned using OKNAlign(), generating a subtrace. Subtraces,
+ with their indices appropriately adjusted, will be spliced together
+ by an encoding of the gaps between segments; for now, we'll simply insert
+ gaps as follows:
+
+ A "gap" with x bases in A and y bases in B will become a section
+ of the alignment x+y positions long, with the A fragment first
+ and the B fragment second:
+
+ AAAAAAAAAA--------------
+ ----------BBBBBBBBBBBBBB
+
+ Obviously, a more compact treatment is possible!
+
+ Assumptions: both sequences should be in the forward orientation
+ and all segments are forward.
+
+*/
+int *AS_Local_Trace(Local_Overlap *local_overlap,
+ const char *aseq,
+ const char *bseq);
+
+
+#endif
+
diff --git a/atac-driver/chainer/localalign/GF_ALN_loverlapper.C b/atac-driver/chainer/localalign/GF_ALN_loverlapper.C
new file mode 100644
index 0000000..9cdc891
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_loverlapper.C
@@ -0,0 +1,599 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "GF_ALN_local.H"
+
+#define min(a,b) ((a)<(b)?(a):(b)) /* smaller of a and b; arguments are parenthesized but still evaluated twice */
+#define max(a,b) ((a)>(b)?(a):(b)) /* larger of a and b; do not pass expressions with side effects */
+
+
+/* Handle Local_Overlap pieces (local segments) which overlap,
+ * by trimming them back until they abut,
+ * such that the number of mismatches is minimized
+ */
+void fix_overlapping_pieces(const char *aseq,
+ const char *bseq,
+ Local_Overlap *O,
+ int piece0,
+ int piece1);
+
+
+
+
+//maximum number of matching segments that can be pieced together
+int MaxGaps= 3;
+
+//maximum allowed mismatch at beginning of overlap
+int MaxBegGap= 200;
+
+//maximum allowed mismatch at end of overlap
+int MaxEndGap= 200;
+
+//biggest gap internal to overlap allowed
+int MaxInteriorGap=400;
+
+//whether to treat the beginning of the b fragment and
+// the end of the a fragment as allowed to have more error
+int asymmetricEnds=0;
+
+//amount of mismatch at end of the overlap that can cause
+// an overlap to be rejected
+int MaxFreeFlap=20;
+
+
+// set useSizeToOrderBlocks to 1 to get block mismatches resolved such that
+// the smaller block comes first:
+//
+// .........AA------.........
+// .........--BBBBBB........
+//
+// or set it to 0 to get resolution with the A block always first
+int useSizeToOrderBlocks = 1;
+
+
+
+/* Create a trace to be interpreted as with DP_Compare_AS, but based
+ on a Local_Overlap record. A Local_Segment within the overlap
+ will be aligned using AS_ALN_OKNAlign(), generating a subtrace. Subtraces,
+ with their indices appropriately adjusted, will be spliced together
+ by an encoding of the gaps between segments; for now, we'll simply insert
+ gaps as follows:
+
+ A "gap" with x bases in A and y bases in B will become a section
+ of the alignment x+y positions long, with the A fragment first
+ and the B fragment second (with '=' indicating an aligned match):
+
+ ====AAAAAAAAAA--------------======
+ ====----------BBBBBBBBBBBBBB======
+
+ Obviously, a more compact treatment is possible, but this makes clear
+ the presumptive blocks involved in the mismatch; also, Karin says that it
+ will make Consensus happy. One slight change
+
+ Assumptions: (do they matter?)
+ By the usual conventions, the ahang should be nonnegative, the
+ bhang negative only if the ahang is positive, and both sequences
+ should be in the forward orientation.
+
+*/
+
+int *AS_Local_Trace(Local_Overlap *O, const char *aseq, const char *bseq){
+  static int *TraceBuffer=NULL;      /* persistent trace storage, reused across calls */
+  int i,j,k,segdiff,*segtrace;
+  int lastgood=-1;                    /* last piece that was kept; -1 while there is none */
+  static int allocatedspace=0;        /* capacity of TraceBuffer, in ints */
+  int tracespace=0;
+  static char *aseg=NULL,*bseg=NULL;  /* persistent scratch copies of one aligned segment */
+  /* While computeTraceFlag is 0, TraceBuffer stays NULL: the call only edits O in place and returns NULL. */
+  // Not computing traces generates slight differences. Why?
+  // (Possibly because only the trace path feeds spnt from AS_ALN_OKNAlign back into the chain.)
+  const int computeTraceFlag = 0;
+
+  static int aseglen=0,bseglen=0;     /* capacities of aseg and bseg */
+  int abeg=0,bbeg=0; /* beginning of segment; overloaded */
+  int tracep=0; /* index into TraceBuffer */
+  int spnt=0; /* to pass to AS_ALN_OKNAlign */
+
+  /* give the trailing gap-only entry chain[num_pieces] a start: end of the last real piece plus its gap */
+  {
+    int n=O->num_pieces;
+    assert(O->num_pieces>0);
+    O->chain[n].piece.abpos=O->chain[n].agap+O->chain[n-1].piece.aepos;
+    O->chain[n].piece.bbpos=O->chain[n].bgap+O->chain[n-1].piece.bepos;
+  }
+
+  if(computeTraceFlag){
+    /*Estimate length required to store trace*/
+    tracespace=0;
+    tracespace+=abs(O->begpos)+abs(O->endpos);
+    for(i=0;i<=O->num_pieces;i++){
+      tracespace+=max(O->chain[i].agap,1);
+      tracespace+=max(O->chain[i].bgap,1);
+      tracespace+=(int)((O->chain[i].piece.aepos
+                         -O->chain[i].piece.abpos)
+                        *1.5*O->chain[i].piece.error);
+      tracespace+=1000;
+    }
+
+    /*(Re)allocate space for the trace as necessary;
+      Note that this is persistent storage so ...
+      ... it doesn't need to get allocated on every call
+      ... it shouldn't get freed
+      ... results stored here need to be copied elsewhere if they
+      are to be saved
+    */
+    if(allocatedspace<tracespace){
+      allocatedspace=2*tracespace;
+      if(TraceBuffer==NULL){
+        TraceBuffer=(int*)ckalloc(sizeof(int)*allocatedspace);
+      } else {
+        TraceBuffer=(int*)ckrealloc(TraceBuffer,sizeof(int)*allocatedspace);
+      }
+    }
+  } // computeTraceFlag
+
+  /* for each Local_Overlap chain[i].piece,
+     need to handle the gap at the beginning and
+     (for all but the final piece) the associated aligned segment */
+  for(i=0;i<=O->num_pieces;i++){
+
+    /* if conditions indicate the segment was deleted in previous loop,
+       skip! */
+    if(O->chain[i].agap==0 &&
+       O->chain[i].bgap==0 &&
+       O->chain[i].piece.abpos==O->chain[i].piece.aepos &&
+       O->chain[i].piece.bbpos==O->chain[i].piece.bepos){
+      continue;
+    }
+
+    /* guesstimate the required number of diagonals/edits to consider to
+       get optimal alignment */
+    segdiff=1+(int)((O->chain[i].piece.aepos
+                     -O->chain[i].piece.abpos)
+                    *1.5*O->chain[i].piece.error);
+
+
+
+    /* Building an alignment/trace under the usual assumptions does not allow
+       a given position in one sequence to simultaneously align to two or more
+       positions in the other sequence. However, the Find_Local_Overlap()
+       routine can chain together local alignment segments that overlap.
+
+       In order to make the local overlaps compatible with everything else
+       we do, we need to trim back the overlaps.
+
+       Since we will "output" this segment at the end of the loop,
+       we need to fix its overlap with the following segment in this
+       cycle through the loop
+    */
+
+    k=i+1;
+    while(k<O->num_pieces){
+
+      /* if conditions indicate the segment was deleted previously, skip! */
+      if(O->chain[k].agap==0 &&
+         O->chain[k].bgap==0 &&
+         O->chain[k].piece.abpos==O->chain[k].piece.aepos &&
+         O->chain[k].piece.bbpos==O->chain[k].piece.bepos){
+        k++;
+        continue;
+      }
+
+      if(O->chain[k].piece.abpos<O->chain[i].piece.aepos||
+         O->chain[k].piece.bbpos<O->chain[i].piece.bepos){
+
+        /* handle possibility of the first segment being
+           contained within the second;originally simply asserted
+           against this; now, try to handle by deleting first segment */
+
+        if(O->chain[i].piece.abpos>O->chain[k].piece.abpos||
+           O->chain[i].piece.bbpos>O->chain[k].piece.bbpos){
+
+
+          O->chain[i].agap=0;
+          O->chain[i].bgap=0;
+          if(lastgood>=0){
+            O->chain[i].piece.abpos=O->chain[lastgood].piece.aepos;
+            O->chain[i].piece.aepos=O->chain[lastgood].piece.aepos;
+            O->chain[i].piece.bbpos=O->chain[lastgood].piece.bepos;
+            O->chain[i].piece.bepos=O->chain[lastgood].piece.bepos;
+          } else {
+            O->chain[i].piece.abpos=0;
+            O->chain[i].piece.aepos=0;
+            O->chain[i].piece.bbpos=0;
+            O->chain[i].piece.bepos=0;
+          }
+          O->chain[k].agap=O->chain[k].piece.abpos-
+            O->chain[i].piece.aepos;
+          O->chain[k].bgap=O->chain[k].piece.bbpos-
+            O->chain[i].piece.bepos;
+          if(lastgood<0){
+            //printf("Shrinking gaps for segment %d\n",k);
+            O->chain[k].agap--;
+            O->chain[k].bgap--;
+          }
+
+        } else /* otherwise, check for 2nd piece contained within first */
+          if(O->chain[i].piece.aepos>O->chain[k].piece.aepos||
+             O->chain[i].piece.bepos>O->chain[k].piece.bepos){
+
+            /* if the next piece is completely within current piece,
+               effectively remove it */
+
+            O->chain[k].agap = 0;
+            O->chain[k].bgap = 0;
+            O->chain[k].piece.abpos=O->chain[i].piece.aepos;
+            O->chain[k].piece.aepos=O->chain[i].piece.aepos;
+            O->chain[k].piece.bbpos=O->chain[i].piece.bepos;
+            O->chain[k].piece.bepos=O->chain[i].piece.bepos;
+            if(k+1<=O->num_pieces){
+              int l;
+              l=k-1;   /* walk back to the nearest piece that has not been deleted */
+              while(O->chain[l].agap==0 &&
+                    O->chain[l].bgap==0 &&
+                    O->chain[l].piece.abpos==O->chain[l].piece.aepos &&
+                    O->chain[l].piece.bbpos==O->chain[l].piece.bepos){
+                l--;
+                assert(l>=0);
+              }
+
+
+              O->chain[k+1].agap=O->chain[k+1].piece.abpos-
+                O->chain[l].piece.aepos;
+              O->chain[k+1].bgap=O->chain[k+1].piece.bbpos-
+                O->chain[l].piece.bepos;
+            }
+
+            /* else, fix the overlap */
+          } else {
+
+            fix_overlapping_pieces(aseq,
+                                   bseq,
+                                   O,i,k);
+
+
+            // if the second piece disappeared
+            if(O->chain[k].piece.abpos==O->chain[k].piece.aepos||
+               O->chain[k].piece.bbpos==O->chain[k].piece.bepos){
+
+              O->chain[k].agap = 0;
+              O->chain[k].bgap = 0;
+              O->chain[k].piece.abpos=O->chain[i].piece.aepos;
+              O->chain[k].piece.aepos=O->chain[i].piece.aepos;
+              O->chain[k].piece.bbpos=O->chain[i].piece.bepos;
+              O->chain[k].piece.bepos=O->chain[i].piece.bepos;
+              if(k+1<=O->num_pieces){
+                int l;
+                l=k-1;   /* walk back to the nearest piece that has not been deleted */
+                while(O->chain[l].agap==0 &&
+                      O->chain[l].bgap==0 &&
+                      O->chain[l].piece.abpos==O->chain[l].piece.aepos &&
+                      O->chain[l].piece.bbpos==O->chain[l].piece.bepos){
+                  l--;
+                  assert(l>=0);
+                }
+
+                O->chain[k+1].agap=O->chain[k+1].piece.abpos-
+                  O->chain[l].piece.aepos;
+                O->chain[k+1].bgap=O->chain[k+1].piece.bbpos-
+                  O->chain[l].piece.bepos;
+              }
+            } else {
+              // if the first piece disappeared
+              if (O->chain[i].piece.abpos==O->chain[i].piece.aepos||
+                  O->chain[i].piece.bbpos==O->chain[i].piece.bepos){
+
+                O->chain[i].agap=0;
+                O->chain[i].bgap=0;
+                if(lastgood>=0){
+                  O->chain[i].piece.abpos=O->chain[lastgood].piece.aepos;
+                  O->chain[i].piece.aepos=O->chain[lastgood].piece.aepos;
+                  O->chain[i].piece.bbpos=O->chain[lastgood].piece.bepos;
+                  O->chain[i].piece.bepos=O->chain[lastgood].piece.bepos;
+                } else {
+                  O->chain[i].piece.abpos=0;
+                  O->chain[i].piece.aepos=0;
+                  O->chain[i].piece.bbpos=0;
+                  O->chain[i].piece.bepos=0;
+                }
+                O->chain[k].agap=O->chain[k].piece.abpos-
+                  O->chain[i].piece.aepos;
+                O->chain[k].bgap=O->chain[k].piece.bbpos-
+                  O->chain[i].piece.bepos;
+                if(lastgood<0){
+                  //printf("Shrinking gaps for segment %d\n",k);
+                  O->chain[k].agap--;
+                  O->chain[k].bgap--;
+                }
+              }
+            }
+          }
+      }
+
+      k++;
+    }
+
+    /* if conditions indicate the segment was deleted previously, skip! */
+    if(O->chain[i].agap==0 &&
+       O->chain[i].bgap==0 &&
+       O->chain[i].piece.abpos==O->chain[i].piece.aepos &&
+       O->chain[i].piece.bbpos==O->chain[i].piece.bepos){
+      continue;
+    }
+
+    /* set up positions before which gaps are inserted to handle
+       the gap portion of a chain piece */
+
+    /* put gaps before beginning of aligned piece (but after the portion
+       of aseq in the gap); location is relative to the beginning of the
+       alignment (i.e., ignores ahang worth of positions) */
+
+    if(i!=O->num_pieces){
+      abeg=O->chain[i].piece.abpos;
+    } else {
+      assert(lastgood>=0&&lastgood<O->num_pieces);
+      abeg=O->chain[lastgood].piece.aepos+
+        O->chain[i].agap;
+    }
+
+    /*handle boundary case to prevent gaps preceding the b sequence*/
+    if((i==0||lastgood<0)&&O->chain[i].bgap>0){
+      assert(O->chain[i].agap>=0);
+
+      if(O->begpos>=0){
+        O->begpos=O->chain[i].piece.abpos-1;
+        assert(O->begpos>=0);
+        O->chain[i].agap=0;
+
+        // Instead of asserting, an ifdef previously printed stuff
+        // out and continued happily along.
+        //
+        assert( ( i==0&& O->chain[i].bgap==O->chain[i].piece.bbpos-1)
+                ||( i>0&&lastgood<0&&O->chain[i].bgap==O->chain[i].piece.bbpos-1)) ;
+
+        if(lastgood<0){
+          O->chain[i].bgap=O->chain[i].piece.bbpos-1;
+        }
+      } else {
+        if(i==0){
+          O->begpos-=O->chain[i].bgap;
+        } else{
+          O->begpos=-O->chain[i].bgap;
+        }
+        O->chain[i].bgap=0;
+      }
+
+    }
+
+    /* now prevent gaps at end of A sequence */
+    // if(i==O->num_pieces&&O->endpos>=0){
+    // O->endpos+=O->chain[i].bgap;
+    // O->chain[i].bgap=0;
+    // }
+
+    /* now make sure that end mismatches are treated by tucking
+       the shorter tail into a gap before the longer tail,
+       or, ifdef FORCEPOSITIVEBHANG, by tucking the A tail into
+       a gap before the B tail */
+    if(i==O->num_pieces){
+      if(O->endpos>=0){
+        O->endpos+=O->chain[i].bgap;
+        O->chain[i].bgap=0;
+      }else {
+        O->endpos-=O->chain[i].agap;
+        abeg-=O->chain[i].agap;
+        O->chain[i].agap=0;
+      }
+    }
+
+    /* put gaps before the portion of bseq in the gap; for the first
+       piece, this means before position 0 */
+    if(i==0 || lastgood<0){
+      bbeg = 1-min(O->begpos,0);
+    } else {
+      assert(lastgood<O->num_pieces);
+      bbeg = O->chain[lastgood].piece.bepos;
+    }
+
+    /* now insert the right number of gaps! */
+
+    if(i==O->num_pieces){
+
+      if(O->endpos<0){
+        O->chain[i].agap+=-O->endpos;
+        O->endpos=0;
+      } else {
+        O->chain[i].bgap+=O->endpos;
+        O->endpos=0;
+      }
+
+      if(O->chain[i].agap <= O->chain[i].bgap){
+        O->endpos=O->chain[i].bgap;
+        O->chain[i].bgap=0;
+      }else{
+        O->endpos=-O->chain[i].agap;
+        O->chain[i].agap=0;
+      }
+    }
+
+
+    if (computeTraceFlag) {
+      if(O->chain[i].agap <= O->chain[i].bgap || ! useSizeToOrderBlocks ){
+        /* start by putting len(agap) gaps before the chunk of B in the gap */
+        for(j=0; j<O->chain[i].agap ;j++)
+          TraceBuffer[tracep++]=bbeg;
+
+        /* then put len(bgap) gaps before the chunk of A in the gap */
+        for(j=0; j<O->chain[i].bgap ;j++)
+          TraceBuffer[tracep++]=-abeg;
+      } else { // if the bgap is smaller,
+        abeg-=O->chain[i].agap;
+        bbeg+=O->chain[i].bgap;
+
+        /* start by putting len(bgap) gaps before the chunk of A in the gap */
+        for(j=0;j<O->chain[i].bgap ;j++)
+          TraceBuffer[tracep++]=-abeg;
+
+        /* then put len(agap) gaps before the chunk of B in the gap */
+        for(j=0;j<O->chain[i].agap ;j++)
+          TraceBuffer[tracep++]=bbeg;
+      }
+    } else {
+      // Not computing traces!
+      if(O->chain[i].agap <= O->chain[i].bgap || ! useSizeToOrderBlocks ){
+      } else {
+        abeg-=O->chain[i].agap;
+        bbeg+=O->chain[i].bgap;
+      }
+    }
+
+
+    ///////////////////////////////////////
+
+    /* if last piece, there is no aligned segment */
+
+    if(i==O->num_pieces)break;
+
+    /* set bbeg to beginning of aligned segment for piece */
+
+    abeg=O->chain[i].piece.abpos;
+    bbeg=O->chain[i].piece.bbpos;
+
+    /* set lengths of segments */
+
+    int alen=O->chain[i].piece.aepos-abeg; /* check +1?? */
+    int blen=O->chain[i].piece.bepos-bbeg; /* check +1?? */
+
+    /* create strings for just the parts of the sequences in the
+       aligned segment */
+
+    /* make sure there is (persistent) space for the strings */
+    if(aseglen<alen+1){
+      aseglen=2*(alen+1);
+      if(aseg==NULL){
+        aseg=(char*)ckalloc(sizeof(char)*aseglen);
+      } else {
+        aseg=(char*)ckrealloc(aseg,sizeof(char)*aseglen);
+      }
+    }
+    if(bseglen<blen+1){
+      bseglen=2*(blen+1);
+      if(bseg==NULL){
+        bseg=(char*)ckalloc(sizeof(char)*bseglen);
+      } else {
+        bseg=(char*)ckrealloc(bseg,sizeof(char)*bseglen);
+      }
+    }
+
+    /* CMM we do not need the trace computed */
+    if (computeTraceFlag) {
+
+      /* copy the segments */
+
+      strncpy(aseg, aseq+abeg, alen);
+      strncpy(bseg, bseq+bbeg, blen);
+
+      aseg[alen] = 0;
+      bseg[blen] = 0;
+
+      if (((int)strlen(bseg) != blen) ||
+          ((int)strlen(aseg) != alen)) {   /* a copy came out short: a sequence ended inside the segment */
+        fprintf(stderr,"EXCEPTION strlen(aseg)=%d alen=%d abeg=%d\n", (int)strlen(aseg), alen, abeg);
+        fprintf(stderr,"EXCEPTION strlen(bseg)=%d blen=%d bbeg=%d\n", (int)strlen(bseg), blen, bbeg);
+
+        fprintf(stderr,"EXCEPTION aseg=<%s>\n", aseg);
+        fprintf(stderr,"EXCEPTION bseg=<%s>\n", bseg);
+
+        fprintf(stderr,"EXCEPTION aseq=<%s>\n", aseq + 1);
+        fprintf(stderr,"EXCEPTION bseq=<%s>\n", bseq + 1);
+
+        return NULL; // Return an exceptional value.
+      }
+
+      /* guesstimate the required number of diagonals/edits to consider to
+         get optimal alignment */
+      segdiff = 1 + (int)((O->chain[i].piece.aepos - O->chain[i].piece.abpos) * 1.5 * O->chain[i].piece.error);
+
+
+      /* get trace for the segment from AS_ALN_OKNAlign */
+      spnt=0;
+      /* subtract from aseg, bseg because Gene likes to index from 1, not 0 */
+      segtrace=AS_ALN_OKNAlign(aseg-1,alen,bseg-1,blen,&spnt,segdiff);
+      // This adjusts the beginning coordinates so that segment is
+      // consistent with the back-trace.
+
+      if(spnt>0){
+        O->chain[i].agap+=spnt;
+        O->chain[i].piece.abpos+=spnt;
+      } else {
+        O->chain[i].bgap-=spnt;
+        O->chain[i].piece.bbpos-=spnt;
+      }
+
+      /* get trace for the segment from AS_ALN_OKNAffine */
+      /* Seems like it should be a good idea, but doesn't work as well
+         as we might expect! */
+      //bpnt=0;
+      //epnt=0;
+      //segtrace=AS_ALN_OKNAffine(aseg,alen,bseg,blen,&bpnt,&epnt,segdiff);
+
+      assert(segtrace!=NULL);
+
+      /* Now copy the segment trace into master trace, adjusting positions */
+      j=0;
+
+      if(spnt<0){
+        for(int ctr=0;ctr<abs(spnt);ctr++)
+          TraceBuffer[tracep++]=-abeg;
+      } else {
+        for(int ctr=0;ctr<spnt;ctr++)
+          TraceBuffer[tracep++]=bbeg;
+      }
+
+      while(segtrace[j]!=0){
+        if(segtrace[j]<0){
+          TraceBuffer[tracep++]=-abeg+segtrace[j++]+1 /* -max(0,spnt) ?? */;
+        } else {
+          TraceBuffer[tracep++]=bbeg+segtrace[j++]-1 /* +max(0,-spnt) ?? */;
+        }
+      }
+    } // computeTraceFlag
+
+    /* set lastgood to this segment */
+
+    lastgood=i;
+
+    /* and back to the top of the loop for another overlap piece */
+  }
+
+  /* terminate the trace */
+  if(TraceBuffer != NULL) {
+    if (tracep >= allocatedspace) {   /* test before storing the terminator, not after */
+      fprintf(stderr,"ERROR memory is already corrupted in %s at %d.\n", __FILE__, __LINE__);
+      assert(tracep < allocatedspace);
+      return NULL;                    /* reached only when assert() is compiled out */
+    }
+    TraceBuffer[tracep]=0;
+  }
+  return(TraceBuffer);
+}
diff --git a/atac-driver/chainer/localalign/GF_ALN_overlap.C b/atac-driver/chainer/localalign/GF_ALN_overlap.C
new file mode 100644
index 0000000..69457fe
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_overlap.C
@@ -0,0 +1,848 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+
+#include "GF_ALN_local.H"
+
+#define max(x,y) (((x)<(y)) ? (y):(x)) /* larger of x and y; arguments are parenthesized but still evaluated twice */
+#define min(x,y) (((x)>(y)) ? (y):(x)) /* smaller of x and y; do not pass expressions with side effects */
+
+
+#define MIN_USABLE 3 /* Smallest subpart of a segment usable for chaining */
+
+
+int MIN_ALIGNED_COLS=30; /* minimum length of a local overlap in the following
+ sense: an overlap is defined by a set of segments;
+ each segment has a length (ignoring minor troubles
+ in determining length in the presence of small
+ indels); let the sum of the overlap for this
+ purpose be the sum of the lengths of the segments
+ that make up the overlap */
+
+#define BIG_INT 0x7FFFFFFF
+
+typedef struct { /* Payload of one node in the AVL-tree candidate lists below. */
+ int start; /* key the tree is navigated by (compared against pos/low/hgh in AVLrank, AVLminprf, ...) */
+ int base; /* score term that the AVLmin* searches minimize */
+ int segment; /* index into Segs of the segment that created this entry */
+ int best; /* smallest base in the subtree rooted at this entry's node; filled in by NEW */
+} Candidate;
+
+typedef struct { /* One sweep event; Find_Local_Overlap creates two per segment and sorts them with SSORT. */
+ Local_Segment *item; /* the segment this event belongs to */
+ int isadd; /* 1 = the segment begins here, 0 = the segment ends here */
+} Event;
+
+typedef struct { /* Per-segment chaining record (Trace[i] describes the best chain found that ends in segment i). */
+ int value; /* gap score of that chain, as computed when the segment's begin event is processed */
+ int source; /* index of the previous segment in the chain, or -1 if the chain starts here */
+ int start; /* index of the chain's first segment; set to -1 once the segment has ended a reported overlap */
+ int colsAligned; /* estimated aligned columns, summed along the chain */
+} TraceElement;
+
+/*** AVL-TREE LIST ROUTINES ***/
+
+static void OutOfMemory(const char *where) /* Report a failed allocation for `where` on stderr and exit(1); const so string literals may be passed. */
+{ fprintf(stderr,"COMPARE_LOCAL: Out of memory (%s)\n",where);
+  exit (1);
+}
+
+typedef struct _AVLnode { /* Reference-counted AVL tree node; subtrees can be shared between trees. */
+ int RC, LN; /* RC: reference count.  LN: L->LN + R->LN, with NIL counting 1, i.e. elements in the subtree plus one */
+ short H; /* height of the subtree; 0 for NIL */
+ struct _AVLnode *L, *R; /* children; L also links nodes on the free list (see AVLdec and NEW) */
+ Candidate V; /* the element stored here; V.best summarizes the whole subtree */
+} AVLnode;
+
+#define CP(v) ((v)->L->LN) /* 1-based in-order position of v's own element inside the subtree rooted at v */
+
+static AVLnode *freept; /* head of the list of recycled nodes, chained through L */
+static AVLnode *NIL; /* shared sentinel standing for the empty tree; allocated by AVLinit */
+
+#define INC AVLinc /* short aliases used throughout the tree routines */
+#define DEC AVLdec
+#define SEL AVLselect
+#define RNK AVLrank
+#define ADD AVLinsert
+#define DEL AVLdelete
+
+static void AVLinit(void) /* Create the NIL sentinel and start with an empty free list. */
+{ AVLnode *sentinel = (AVLnode *) malloc(sizeof(AVLnode));
+  if (sentinel == NULL)
+    OutOfMemory("Candidate list");
+  sentinel->RC = 1;
+  sentinel->LN = 1;               /* an empty tree counts as 1 in LN arithmetic */
+  sentinel->H  = 0;
+  sentinel->V.base = sentinel->V.best = BIG_INT;  /* never wins a minimum search */
+  freept = NULL; NIL = sentinel;  /* L, R, V.start and V.segment of NIL are left unset */
+}
+
+static AVLnode *AVLinc(AVLnode *v) /* Take one more reference on v and hand v back. */
+{ v->RC += 1;
+  return v;
+}
+
+static void AVLdec(AVLnode *v)
+{ /* Drop one reference on v.  When the count reaches zero the node gives */
+  /* up its children and is pushed on the free list, linked through L.   */
+  if (--v->RC != 0)
+    return;
+  DEC(v->L);
+  DEC(v->R);
+  v->L = freept; freept = v;
+}
+
+static int AVLlength(AVLnode *v) /* Number of elements in v; gives up the caller's reference to v. */
+{ int count = v->LN - 1; DEC(v); return (count); }
+
+static AVLnode *NEW(AVLnode *l, Candidate *x, AVLnode *r)
+{ /* Make a node holding a copy of *x above subtrees l and r.  The node */
+  /* takes over the caller's references to l and r (no INC is done).   */
+  AVLnode *node = freept;
+  int lowest;
+
+  if (node != NULL)
+    freept = node->L;              /* recycle: pop the free list */
+  else
+    { node = (AVLnode *) malloc(sizeof(AVLnode));
+      if (node == NULL)
+        OutOfMemory("Candidate list");
+    }
+  node->RC = 1;
+  node->V  = *x;
+  node->L  = l;
+  node->R  = r;
+  node->LN = l->LN + r->LN;
+  node->H  = (l->H < r->H ? r->H : l->H) + 1;
+
+  /* V.best = smallest base among this element and both subtrees */
+  lowest = node->V.base;
+  if (l->V.best < lowest)
+    lowest = l->V.best;
+  if (r->V.best < lowest)
+    lowest = r->V.best;
+  node->V.best = lowest;
+
+  return (node);
+}
+
+static AVLnode *BAL(AVLnode *l, Candidate *x, AVLnode *r) /* Join l, *x and r under new nodes, rotating when the heights differ by more than 1. */
+{ AVLnode *t;
+ if (l->H - r->H >= -1 && l->H - r->H <= 1) /* heights close enough: plain join */
+ t = NEW(INC(l),x,INC(r));
+ else if (l->H > r->H) /* left side too tall */
+ if (l->L->H >= l->R->H)
+ t = NEW(INC(l->L),&(l->V),NEW(INC(l->R),x,INC(r))); /* single rotation: l's element moves to the top */
+ else
+ t = NEW(NEW(INC(l->L),&(l->V),INC(l->R->L)),&(l->R->V), /* double rotation: l->R's element moves to the top */
+ NEW(INC(l->R->R),x,INC(r)));
+ else /* right side too tall: mirror image of the cases above */
+ if (r->R->H >= r->L->H)
+ t = NEW(NEW(INC(l),x,INC(r->L)),&(r->V),INC(r->R));
+ else
+ t = NEW(NEW(INC(l),x,INC(r->L->L)),&(r->L->V),
+ NEW(INC(r->L->R),&(r->V),INC(r->R)));
+ DEC(l); DEC(r); /* the caller's references to l and r are given up here; every reused subtree was INC'ed above */
+ return (t);
+}
+
+static Candidate *AVLselect(AVLnode *v, int k)
+{ /* Element at in-order position k (1-based) of v.  Gives up the caller's reference; the result points into a node. */
+  int here = CP(v);               /* position of v's own element */
+  Candidate *hit;
+  if (k == here)
+    hit = &(v->V);
+  else if (k < here)
+    hit = SEL(INC(v->L),k);
+  else
+    hit = SEL(INC(v->R),k-here);
+  DEC(v); return (hit);
+}
+
+static int AVLrank(AVLnode *v, int pos)
+{ /* Count the elements met with start <= pos, descending by start; gives up the caller's reference to v. */
+  int below = 0;
+  if (v != NIL)
+    { if (pos < v->V.start)
+        below = RNK(INC(v->L),pos);
+      else
+        below = CP(v) + RNK(INC(v->R),pos);
+    }
+  DEC(v); return (below);
+}
+
+static AVLnode *AVLminprf(AVLnode *v, int hgh, int bst) /* Minimum-base node among elements with start <= hgh, searched for bases below bst. */
+{ AVLnode *r;
+  int b = bst;   /* bound for the right-subtree search: never looser than the caller's bound (was 0) */
+  /* Gives up the caller's reference to v.  The result is NIL or a node inside the tree, not an extra reference. */
+  if (v == NIL)
+    r = v;
+  else if (hgh < v->V.start)       /* v and its right subtree lie beyond hgh */
+    r = AVLminprf(INC(v->L),hgh,bst);
+  else
+    { if (v->L->V.best < bst)
+        b = v->L->V.best;
+      if (v->V.base < bst)
+        b = v->V.base;
+      r = AVLminprf(INC(v->R),hgh,b);
+      if (r->V.base < bst)
+        bst = r->V.base;
+      if (v->V.base < bst)
+        { r = v; bst = v->V.base; }
+      if (v->L->V.best < bst)      /* something in the left subtree still beats the current choice */
+        r = AVLminprf(INC(v->L),hgh,bst);
+    }
+  DEC(v);
+  return (r);
+}
+
+static AVLnode *AVLminsuf(AVLnode *v, int low, int bst) /* Minimum-base node among elements with start >= low, searched for bases below bst. */
+{ AVLnode *r;
+  int b = bst;   /* bound for the left-subtree search: never looser than the caller's bound (was 0) */
+  /* Gives up the caller's reference to v.  The result is NIL or a node inside the tree, not an extra reference. */
+  if (v == NIL)
+    r = v;
+  else if (low > v->V.start)       /* v and its left subtree lie before low */
+    r = AVLminsuf(INC(v->R),low,bst);
+  else
+    { if (v->R->V.best < bst)
+        b = v->R->V.best;
+      if (v->V.base < bst)
+        b = v->V.base;
+      r = AVLminsuf(INC(v->L),low,b);
+      if (r->V.base < bst)
+        bst = r->V.base;
+      if (v->V.base < bst)
+        { r = v; bst = v->V.base; }
+      if (v->R->V.best < bst)      /* something in the right subtree still beats the current choice */
+        r = AVLminsuf(INC(v->R),low,bst);
+    }
+  DEC(v);
+  return (r);
+}
+
+static AVLnode *AVLminrng(AVLnode *v, int low, int hgh) /* Minimum-base node among elements with low <= start <= hgh; NIL if v is empty. */
+{ AVLnode *r, *t;
+
+ if (v == NIL)
+ r = v;
+ else if (hgh < v->V.start) /* whole range lies to the left of v */
+ r = AVLminrng(INC(v->L),low,hgh);
+ else if (low > v->V.start) /* whole range lies to the right of v */
+ r = AVLminrng(INC(v->R),low,hgh);
+ else
+ { r = v; /* v is inside the range: start with it as the answer */
+ t = AVLminprf(INC(v->R),hgh,r->V.base); /* right subtree: only the part up to hgh */
+ if (t->V.base < r->V.base)
+ r = t;
+ t = AVLminsuf(INC(v->L),low,r->V.base); /* left subtree: only the part from low on */
+ if (t->V.base < r->V.base)
+ r = t;
+ }
+ DEC(v); /* gives up the caller's reference to v */
+ return (r);
+}
+
+static AVLnode *AVLinsert(AVLnode *v, int k, Candidate *x) /* Return a tree like v with a copy of *x placed right after position k. */
+{ AVLnode *t;
+ if (v == NIL)
+ t = BAL(INC(NIL),x,INC(NIL)); /* empty spot reached: x becomes a leaf */
+ else if (k < CP(v))
+ t = BAL(ADD(INC(v->L),k,x),&(v->V),INC(v->R)); /* x belongs before v's own element */
+ else
+ t = BAL(INC(v->L),&(v->V),ADD(INC(v->R),k-CP(v),x)); /* x belongs after it; k is rebased for the right subtree */
+ DEC(v); /* the new path was built by BAL/NEW; the caller's reference to v is given up */
+ return (t);
+}
+
+static AVLnode *AVLdelete(AVLnode *v, int k) /* Return a tree like v without the element at position k (1-based). */
+{ AVLnode *t;
+ if (v->L == NIL && v->R == NIL) /* a leaf: removing it leaves the empty tree */
+ t = INC(NIL);
+ else if (k <= CP(v) && v->L != NIL)
+ if (k == CP(v)) /* removing v's own element: the last element of the left subtree takes its place */
+ t = BAL(DEL(INC(v->L),k-1),SEL(INC(v->L),k-1),INC(v->R));
+ else
+ t = BAL(DEL(INC(v->L),k),&(v->V),INC(v->R));
+ else
+ if (k == CP(v)) /* removing v's own element with no left subtree: the first element of the right subtree takes its place */
+ t = BAL(INC(v->L),SEL(INC(v->R),1),DEL(INC(v->R),1));
+ else
+ t = BAL(INC(v->L),&(v->V),DEL(INC(v->R),k-CP(v)));
+ DEC(v); /* gives up the caller's reference to v */
+ return (t);
+}
+
+
+
+static int SSORT(const void *l, const void *r)
+{ /* qsort comparator for Event records.  Events are ordered by the A    */
+  /* coordinate of their point (segment start for a begin event, segment */
+  /* end for an end event), then by isadd, then by the B coordinate.     */
+  const Event *lhs = (const Event *) l;
+  const Event *rhs = (const Event *) r;
+  int la, lb, ra, rb;
+
+  if (lhs->isadd) {
+    la = lhs->item->abpos;
+    lb = lhs->item->bbpos;
+  } else {
+    la = lhs->item->aepos;
+    lb = lhs->item->bepos;
+  }
+  if (rhs->isadd) {
+    ra = rhs->item->abpos;
+    rb = rhs->item->bbpos;
+  } else {
+    ra = rhs->item->aepos;
+    rb = rhs->item->bepos;
+  }
+
+  if (la != ra)
+    return (la < ra) ? -1 : 1;
+  /* same A coordinate: the event with the smaller isadd (an end event) goes first */
+  if (lhs->isadd != rhs->isadd)
+    return (lhs->isadd - rhs->isadd);
+  /* still tied: fall back to the B coordinate */
+  return (lb - rb);
+}
+
+
+static void convert_segs(Local_Segment *Segs,int NumSegs,int comp, int Alen,int Blen)
+{ /* Flip B coordinates about Blen when comp is set, then force bbpos <= bepos, tagging */
+  /* every swapped segment with score = -score-1.  Alen is accepted but not used.       */
+  int i;
+  for (i = 0; i < NumSegs; i++)
+    { Local_Segment *seg = Segs + i;
+      if (comp)
+        { seg->bbpos = Blen - seg->bbpos;
+          seg->bepos = Blen - seg->bepos;
+        }
+      if (seg->bbpos > seg->bepos)
+        { int lo = seg->bepos;
+          seg->bepos = seg->bbpos;
+          seg->bbpos = lo;
+          seg->score = -seg->score-1;
+        }
+    }
+}
+
+static void restore_segs(Local_Segment *Segs,int NumSegs,int comp,int Alen,int Blen)
+{ /* Swap bbpos/bepos back on every segment whose score is negative and restore its score */
+  /* (score = -score-1), then flip B coordinates about Blen when comp is set.  Alen unused.  */
+  int i;
+
+  for (i = 0; i < NumSegs; i++)
+    { Local_Segment *seg = Segs + i;
+      if (seg->score < 0)
+        { int hi = seg->bbpos;
+          seg->bbpos = seg->bepos;
+          seg->bepos = hi;
+          seg->score = -seg->score-1;
+        }
+      if (comp)
+        { seg->bbpos = Blen - seg->bbpos;
+          seg->bepos = Blen - seg->bepos;
+        }
+    }
+}
+
+
+Local_Overlap *Find_Local_Overlap(int Alen, int Blen, int comp, int nextbest,
+ Local_Segment *Segs, int NumSegs,
+ int MinorThresh, double GapThresh)
+{ static Candidate Cvals;
+ static int MaxTrace = -1;
+ static TraceElement *Trace = NULL;
+ static Event *EventList;
+ Local_Overlap *Descriptor;
+ Local_Chain *Chain;
+
+ if (NumSegs == 0) return (NULL);
+
+ if (nextbest)
+ { if (Trace == NULL) return (NULL);
+ convert_segs(Segs,NumSegs,comp,Alen,Blen);
+ goto Gen_Overlap;
+ }
+
+ if (MaxTrace < 0)
+ AVLinit();
+
+ if (NumSegs > MaxTrace)
+ { MaxTrace = (int)(1.3*NumSegs) + 500;
+ Trace = (TraceElement *)
+ realloc(Trace,(sizeof(Event)+2*sizeof(TraceElement))*MaxTrace);
+ if (Trace == NULL)
+ OutOfMemory("Overlap Trace Array");
+ EventList = (Event *) (Trace + MaxTrace);
+ { // We have to make sure that EventList is aligned on an appropriate boundary.
+ // It is derived from Trace which has looser alignment constraints.
+ long address = (long)EventList;
+ // By convention "long" int is big as the size of a pointer
+ long offset = (address % sizeof(void *));
+ int pad = sizeof(void *) - offset;
+ // This is how much we need to add to get things aligned.
+ if(offset){
+ // fprintf(stderr,"* Eventlist is %p adding %d up to ", EventList, pad);
+ EventList = (Event *)(((char *)EventList) + pad);
+ // fprintf(stderr," %p\n", EventList);
+ }
+ }
+ }
+
+ convert_segs(Segs,NumSegs,comp,Alen,Blen);
+
+ { int i;
+ for (i = 0; i < NumSegs; i++)
+ { EventList[2*i].item = Segs+i;
+ EventList[2*i].isadd = 1;
+ EventList[2*i+1].item = Segs+i;
+ EventList[2*i+1].isadd = 0;
+ }
+ }
+
+ qsort(EventList,2*NumSegs,sizeof(Event),SSORT);
+
+ { int e;
+ AVLnode *elist, *ilist, *olist;
+
+ elist = AVLinc(NIL);
+ ilist = AVLinc(NIL);
+ olist = AVLinc(NIL);
+
+ for (e = 0; e < 2*NumSegs; e++)
+ { int i, bb, be, ab, ae;
+ double err;
+
+ /* Determine least gapped path to i'th segment */
+
+ i = EventList[e].item - Segs;
+ bb = Segs[i].bbpos;
+ be = Segs[i].bepos;
+ ab = Segs[i].abpos;
+ ae = Segs[i].aepos;
+ err = Segs[i].error;
+
+ if (EventList[e].isadd) /* Segment begins */
+ { int clen, best, srce;
+
+ // this definition of best differs from the original (below)
+ // it is designed to encourage global alignment
+ //
+ best = ab+bb; /* Best from boundary */
+
+ //best = ab; /* Best from boundary */
+ //if (best > bb)
+ // best = bb;
+ //best *= 2;
+
+
+ srce = -1;
+ clen = AVLlength(AVLinc(elist));
+
+ { int p; /* Examine bests from elist */
+
+ p = AVLrank(AVLinc(elist),bb); /* Best @ start of seg */
+ if (p > 0)
+ { Candidate *cand;
+ int altr;
+
+ cand = AVLselect(AVLinc(elist),p);
+ altr = cand->base + (ab + bb);
+ if (altr < best)
+ { best = altr;
+ srce = cand->segment;
+ }
+ }
+
+ while (++p <= clen) /* Bests @ midpoints of seg */
+ { Candidate *cand;
+ int altr;
+
+ cand = AVLselect(AVLinc(elist),p);
+ if (cand->start > be - MIN_USABLE) break;
+ altr = cand->base + 2*cand->start + (ab - bb);
+ if (altr < best)
+ { best = altr;
+ srce = cand->segment;
+ }
+ }
+ }
+
+ /* Examine bests from ilist and olist */
+
+ { AVLnode *m;
+ int bdiag, ldiag, altr;
+
+ bdiag = bb - ab;
+ ldiag = bdiag + ((ae-ab) - MIN_USABLE);
+ m = AVLminprf(AVLinc(ilist),bdiag,BIG_INT);
+ if (m != NIL)
+ { altr = m->V.base + bdiag;
+ if (altr < best)
+ { srce = m->V.segment;
+ best = altr;
+ }
+ }
+ m = AVLminrng(AVLinc(olist),-ldiag,-bdiag);
+ if (m != NIL)
+ { altr = m->V.base - bdiag;
+ if (altr < best)
+ { srce = m->V.segment;
+ best = altr;
+ }
+ }
+ }
+
+ /* Record best linkage for segment */
+
+ Trace[i].value = best;
+ Trace[i].source = srce;
+
+ Trace[i].colsAligned = (int)((1.-err)*(double)(min(ae-ab,be-bb)+1));
+ if (srce >= 0){
+ Trace[i].start = Trace[srce].start;
+ Trace[i].colsAligned += Trace[srce].colsAligned;
+ } else
+ Trace[i].start = i;
+
+ /* Add segment to ilist and olist */
+
+ { int p, d;
+
+ d = be - ae;
+ Cvals.segment = i;
+
+ Cvals.start = d;
+ Cvals.base = best - d;
+ p = AVLrank(AVLinc(ilist),d);
+ ilist = AVLinsert(ilist,p,&Cvals);
+
+ d = -d;
+ Cvals.start = d;
+ Cvals.base = best - d;
+ p = AVLrank(AVLinc(olist),d);
+ olist = AVLinsert(olist,p,&Cvals);
+ }
+ }
+
+ else /* Segment ends */
+ { int best, clen;
+
+ best = Trace[i].value;
+ clen = AVLlength(AVLinc(elist));
+
+ /* Add candidate (if any) created by i'th segment */
+
+ { Candidate *cand;
+ int p, off;
+
+ off = be + Segs[i].aepos;
+ p = AVLrank(AVLinc(elist),be);
+ if (p != 0)
+ cand = AVLselect(AVLinc(elist),p);
+ if (p == 0 || best < cand->base + off)
+ { p += 1;
+ while (p <= clen)
+ { cand = AVLselect(AVLinc(elist),p);
+ if (cand->base + off < best) break;
+ elist = AVLdelete(elist,p);
+ clen -= 1;
+ }
+ p -= 1;
+ if (p > 0)
+ { cand = AVLselect(AVLinc(elist),p);
+ if (cand->start == be)
+ elist = AVLdelete(elist,p--);
+ }
+ Cvals.start = be;
+ Cvals.base = best - off;
+ Cvals.segment = i;
+ elist = AVLinsert(elist,p,&Cvals);
+ }
+ }
+
+ /* Remove candidates from ilist and olist */
+
+ { int p, d;
+
+ d = be-ae;
+ p = AVLrank(AVLinc(ilist),d);
+ while (AVLselect(AVLinc(ilist),p)->segment != i)
+ p -= 1;
+ ilist = AVLdelete(ilist,p);
+
+ p = AVLrank(AVLinc(olist),-d);
+ while (AVLselect(AVLinc(olist),p)->segment != i)
+ p -= 1;
+ olist = AVLdelete(olist,p);
+ }
+ }
+
+ }
+
+ AVLdec(elist);
+ AVLdec(ilist);
+ AVLdec(olist);
+ }
+
+
+
+Gen_Overlap:
+
+ { int i, npiece;
+ int best, end, beg;
+
+ best = BIG_INT; /* Determine best overall overlap */
+ end = -1;
+ for (i = 0; i < NumSegs; i++){
+ // if (Trace[i].start >= 0)
+ if (Trace[i].start >= 0&&Trace[i].colsAligned >= MIN_ALIGNED_COLS) {
+ int sfx;
+
+ // this definition of sfx differs from the original (below)
+ // it is designed to encourage global alignment
+ //
+ sfx = Alen - Segs[i].aepos + Blen - Segs[i].bepos;
+
+ //sfx = Alen - Segs[i].aepos;
+ //if (Blen - Segs[i].bepos < sfx)
+ // sfx = Blen - Segs[i].bepos;
+ //sfx *= 2;
+
+ // The "- 2 * Trace[i].colsAligned" makes us encourage longer alignments
+ //
+ if (Trace[i].value + sfx - 2*Trace[i].colsAligned < best) {
+ best = Trace[i].value - 2*Trace[i].colsAligned + sfx;
+ end = i;
+ }
+ }
+ }
+
+ if (end < 0) {
+ restore_segs(Segs,NumSegs,comp,Alen,Blen);
+ return (NULL);
+ }
+
+ beg = Trace[end].start;
+
+ /* How many segments in the best overlap? */
+
+ npiece = 0;
+ for (i = end; i >= 0; i = Trace[i].source)
+ npiece += 1;
+
+
+
+ /* Allocate result data structures in a single memory block */
+
+ Descriptor = (Local_Overlap *) malloc(sizeof(Local_Overlap) +
+ (npiece+1)*sizeof(Local_Chain));
+ if (Descriptor == NULL)
+ OutOfMemory("Overlap descriptor");
+ Chain = (Local_Chain *) (Descriptor + 1);
+
+ /* Fill out the description of the chain */
+
+ { int n;
+ n = npiece;
+ for (i = end; i >= 0; i = Trace[i].source)
+ Chain[--n].piece = Segs[i];
+ }
+
+
+#define ALLOW_DUP_SEGS_IN_NEXT /* allow all but the first segment to be
+ used in later attempts */
+#ifndef ALLOW_DUP_SEGS_IN_NEXT
+ for (i = 0; i < NumSegs; i++)
+ if (Trace[i].start == beg)
+ Trace[i].start = -1; /* this seems to prevent reuse of segments
+ in subsequent calls, and/or if we reject
+ this segment as too noisy and jump back
+ to the top */
+#else
+ Trace[end].start = -1; /* this seems to prevent reuse of segments
+ in subsequent calls, and/or if we reject
+ this segment as too noisy and jump back
+ to the top */
+ #define REUSE_CURRENT_LAST_AS_NONTERMINAL_SEG
+ #ifndef REUSE_CURRENT_LAST_AS_NONTERMINAL_SEG
+ for (i = 0; i < NumSegs; i++)
+ if (Trace[i].source == beg)
+ Trace[i].source = -1;
+ #endif
+
+#endif
+
+
+ {// The last segment doesn't describe an alignment, only a gap. Initialize it to reasonable values
+ Local_Segment *lastseg = &Chain[npiece].piece;
+ lastseg->abpos = lastseg->bbpos = -1;
+ lastseg->aepos = lastseg->bepos = -1;
+ lastseg->ldiag = lastseg->hdiag = -1;
+ lastseg->score = -1;
+ lastseg->error = -1.0;
+ }
+
+ { int gl;
+ /* there's basically a bug here: abpos = 1 means starts at first char of A;
+ so, agap should be 0, but gets set to 1; i.e., every first gap size
+ gets set to one too many; but there's existing code that relies on
+ this fact, so leave it alone for now */
+ gl = Chain[0].piece.abpos;
+ if (gl > Chain[0].piece.bbpos)
+ gl = Chain[0].piece.bbpos;
+ Chain[0].agap = gl;
+ Chain[0].bgap = gl;
+ }
+
+ for (i = 1; i < npiece; i++)
+ { Chain[i].agap = Chain[i].piece.abpos - Chain[i-1].piece.aepos;
+ Chain[i].bgap = Chain[i].piece.bbpos - Chain[i-1].piece.bepos;
+ }
+
+ { int gl;
+ gl = Alen - Chain[npiece-1].piece.aepos;
+ if (gl > Blen - Chain[npiece-1].piece.bepos)
+ gl = Blen - Chain[npiece-1].piece.bepos;
+ Chain[npiece].agap = gl;
+ Chain[npiece].bgap = gl;
+ }
+
+ for (i = 0; i <= npiece; i++)
+ { if (abs(Chain[i].agap) <= MinorThresh)
+ { if (abs(Chain[i].bgap) <= MinorThresh)
+ { if (Chain[i].agap != 0 || Chain[i].bgap != 0)
+ Chain[i].type = LOCAL_MINOR;
+ else
+ Chain[i].type = LOCAL_BOUNDARY;
+ }
+ else if (Chain[i].bgap < 0)
+ Chain[i].type = LOCAL_REPEAT;
+ else if (Chain[i].bgap > 4*Chain[i].agap)
+ Chain[i].type = LOCAL_INDEL;
+ else
+ Chain[i].type = LOCAL_DISAGREE;
+ }
+ else if (Chain[i].agap < 0)
+ { if (Chain[i].bgap < MinorThresh)
+ Chain[i].type = LOCAL_REPEAT;
+ else
+ Chain[i].type = LOCAL_REPnDEL;
+ }
+ else
+ { if (abs(Chain[i].bgap) < MinorThresh)
+ if (Chain[i].agap > 4*Chain[i].bgap)
+ Chain[i].type = LOCAL_INDEL;
+ else
+ Chain[i].type = LOCAL_DISAGREE;
+ else if (Chain[i].bgap < 0)
+ Chain[i].type = LOCAL_REPnDEL;
+ else
+ Chain[i].type = LOCAL_DISAGREE;
+ }
+ }
+
+ /* Fill out overlap descriptor */
+
+ Descriptor->num_pieces = npiece;
+ Descriptor->score = best;
+ Descriptor->chain = Chain;
+ Descriptor->comp = comp;
+
+ { Local_Segment *sg;
+ int ln;
+
+ Descriptor->indif = 0;
+
+ for (i = 0; i < npiece; i++)
+ { sg = &(Chain[i].piece);
+ ln = ((sg->aepos - sg->abpos) + (sg->bepos - sg->bbpos)) / 2;
+ if (i > 0 && Chain[i-1].piece.error < sg->error)
+ { if (Chain[i].agap < Chain[i].bgap)
+ { if (Chain[i].agap < 0)
+ ln += Chain[i].agap;
+ }
+ else
+ { if (Chain[i].bgap < 0)
+ ln += Chain[i].bgap;
+ }
+ }
+ if (i < npiece-1 && Chain[i+1].piece.error <= sg->error)
+ { if (Chain[i+1].agap < Chain[i+1].bgap)
+ { if (Chain[i+1].agap < 0)
+ ln += Chain[i+1].agap;
+ }
+ else
+ { if (Chain[i+1].bgap < 0)
+ ln += Chain[i+1].bgap;
+ }
+ }
+ if (ln > 0)
+ Descriptor->indif += (int)(ln * sg->error);
+ }
+ }
+
+ Descriptor->diffs = Descriptor->indif;
+
+ for (i = 0; i <= npiece; i++)
+ { int d;
+ if (Chain[i].agap < 0 || Chain[i].bgap < 0)
+ d = abs( (Chain[i].piece.bbpos - Chain[i].piece.abpos) -
+ (Chain[i-1].piece.bepos - Chain[i-1].piece.aepos));
+ else
+ { d = Chain[i].agap;
+ if (d < Chain[i].bgap) d = Chain[i].bgap;
+ }
+ Descriptor->diffs += d;
+ }
+
+ { int overa, overb;
+ overa = (Chain[npiece-1].piece.aepos + Chain[npiece].agap)
+ - (Chain[0].piece.abpos - Chain[0].agap);
+ overb = (Chain[npiece-1].piece.bepos + Chain[npiece].bgap)
+ - (Chain[0].piece.bbpos - Chain[0].bgap);
+ Descriptor->length = (overa + overb) / 2;
+ }
+
+ Descriptor->begpos = Chain[0].piece.abpos - Chain[0].piece.bbpos;
+ Descriptor->endpos = (Blen - Chain[npiece-1].piece.bepos)
+ - (Alen - Chain[npiece-1].piece.aepos);
+
+ for (i = 0; i < npiece; i++)
+ if (Chain[i].piece.score < 0)
+ { int x;
+ x = Chain[i].piece.bbpos;
+ Chain[i].piece.bbpos = Chain[i].piece.bepos;
+ Chain[i].piece.bepos = x;
+ Chain[i].piece.score = - Chain[i].piece.score-1;
+ Chain[i].reversed = 1;
+ }
+ else
+ Chain[i].reversed = 0;
+ }
+
+ restore_segs(Segs,NumSegs,comp,Alen,Blen); /* undo comp and rc changes */
+
+ return (Descriptor);
+}
diff --git a/atac-driver/chainer/localalign/GF_ALN_pieceOlap.C b/atac-driver/chainer/localalign/GF_ALN_pieceOlap.C
new file mode 100644
index 0000000..29abb53
--- /dev/null
+++ b/atac-driver/chainer/localalign/GF_ALN_pieceOlap.C
@@ -0,0 +1,529 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "GF_ALN_local.H"
+
+
+typedef struct {
+ char *aseg;
+ char *bseg;
+} PAIRALIGN;
+
+
+
+// Copy seq[beg..end) into *seg as a NUL-terminated string.
+//
+// *seg / *segspace describe a caller-owned buffer that is grown with
+// ckrealloc() whenever it is too small for the substring plus its
+// terminator.
+//
+// Returns 1 when exactly (end-beg) characters were copied.  Returns 0
+// when seq hit a NUL before position 'end', or when the range is
+// inverted (end < beg).  In every case *seg is left pointing at a
+// valid NUL-terminated string.
+//
+static
+int
+safe_substr(char **seg,
+            int *segspace,
+            const char *seq,
+            int beg,
+            int end){
+  int len = end - beg;
+
+  //  An inverted range would become an enormous size_t inside
+  //  strncpy(); hand back an empty string and report failure instead.
+  if(len<0){
+    if(*segspace<1){
+      *segspace=1;
+      *seg=(char*)ckrealloc(*seg,sizeof(char)*(*segspace));
+    }
+    (*seg)[0]='\0';
+    return(0);
+  }
+
+  if(*segspace<len+1){
+    *segspace=2*len+1;
+    *seg=(char*)ckrealloc(*seg,sizeof(char)*(*segspace));
+  }
+
+  //  strncpy() stops early at a NUL in seq; the strlen() test below
+  //  is how that short copy is detected.
+  strncpy(*seg,seq+beg,len);
+  (*seg)[len]='\0';
+
+  return(strlen(*seg) == (size_t)len);
+}
+
+
+
+/* Construct a trace (with AS_ALN_OKNAlign) for one local segment of the
+   chain, copying the result into its own static location so that a
+   later call using the other value of 'which' does not overwrite it.
+
+   aseq, bseq  sequences that the chain coordinates index into
+   O           the overlap; when the aligner reports a non-zero start
+               offset, O->chain[piece] has its agap/abpos (or
+               bgap/bbpos) advanced by that amount
+   piece       index of the chain element to align
+   which       0 or 1; selects the static trace buffer to fill
+
+   Returns a zero-terminated trace owned by this function, or NULL when
+   a substring could not be extracted or no trace was produced. */
+
+static int *get_trace(const char *aseq, const char *bseq,Local_Overlap *O,int piece,
+                      int which){
+  static char *aseg=NULL, *bseg=NULL;
+  static int asegspace=0,bsegspace=0;
+  static int *segtrace[2], tracespace[2]={0,0};
+  int alen = 0;
+  int blen = 0;
+  int spnt=0, *tmptrace=0L;
+  int segdiff=0;
+  int i=0;
+  int iret=0;
+
+  assert(which==0||which==1);
+
+  if(segtrace[which]==NULL){
+    tracespace[which]=100;
+    segtrace[which]=(int*)ckalloc(sizeof(int)*tracespace[which]);
+  }
+
+  /* Computed up front so the diagnostics below report real lengths. */
+  alen=O->chain[piece].piece.aepos-O->chain[piece].piece.abpos;
+  blen=O->chain[piece].piece.bepos-O->chain[piece].piece.bbpos;
+
+  iret = safe_substr(&aseg,&asegspace,aseq,O->chain[piece].piece.abpos,
+                     O->chain[piece].piece.aepos);
+  if(iret == 0){
+    /* bseg is still NULL if this is the very first call; never hand
+       a NULL pointer to strlen(). */
+    fprintf(stderr,"EXCEPTION get_trace: For aseg: len(aseg)=%d, len(bseg)=%d, alen=%d, blen=%d\n",
+            (aseg ? (int)strlen(aseg) : 0),(bseg ? (int)strlen(bseg) : 0), alen,blen);
+    return NULL;
+  }
+  iret = safe_substr(&bseg,&bsegspace,bseq,O->chain[piece].piece.bbpos,
+                     O->chain[piece].piece.bepos);
+  if(iret == 0){
+    fprintf(stderr,"EXCEPTION get_trace: For bseg: len(aseg)=%d, len(bseg)=%d, alen=%d, blen=%d\n",
+            (aseg ? (int)strlen(aseg) : 0),(bseg ? (int)strlen(bseg) : 0), alen,blen);
+    return NULL;
+  }
+
+  /* get trace for the segment from AS_ALN_OKNAlign */
+  spnt=0;
+  segdiff=(int)((O->chain[piece].piece.aepos-O->chain[piece].piece.abpos)
+                *(1.5*O->chain[piece].piece.error) +10);
+
+  /* The aligner expects pointers to one before the string start.  Pass
+     the adjusted pointers directly so the static buffer pointers always
+     keep addressing the start of their allocations. */
+  tmptrace=AS_ALN_OKNAlign(aseg-1,alen,bseg-1,blen,&spnt,segdiff);
+
+  /* Defensive: whether the aligner can return NULL is not visible from
+     here; the caller already treats a NULL trace as a failure. */
+  if(tmptrace==NULL)
+    return NULL;
+
+  if(spnt!=0){
+    if(spnt>0){
+      /* alignment starts spnt positions into A */
+      O->chain[piece].agap+=spnt;
+      O->chain[piece].piece.abpos+=spnt;
+      i=0;
+      while(tmptrace[i]!=0){
+        if(tmptrace[i]<0){
+          tmptrace[i]+=spnt;
+        }
+        i++;
+      }
+    } else {
+      /* alignment starts -spnt positions into B */
+      O->chain[piece].bgap+=-spnt;
+      O->chain[piece].piece.bbpos+=-spnt;
+      i=0;
+      while(tmptrace[i]!=0){
+        if(tmptrace[i]>0){
+          tmptrace[i]+=spnt;
+        }
+        i++;
+      }
+    }
+  }
+
+  /* Copy the trace into our own storage, growing it as needed; the
+     doubling happens before index i is written, so the terminating 0
+     always fits. */
+  i=0;
+  while(tmptrace[i]!=0){
+    segtrace[which][i]=tmptrace[i];
+    i++;
+    if(i==tracespace[which]){
+      tracespace[which]*=2;
+      segtrace[which]=(int*)ckrealloc(segtrace[which],
+                                      sizeof(int)*tracespace[which]);
+    }
+  }
+  segtrace[which][i]=0;
+  return(segtrace[which]);
+
+}
+
+
+
+
+
+
+
+// Store character c at index pos of *seg.  When pos has reached the
+// current capacity *len, the capacity is doubled first and the buffer
+// is reallocated to hold (*len)+1 characters.
+static void safe_add_to_seg(char **seg,int pos,char c,int *len){
+  char *buf = *seg;
+
+  if(*len==pos){
+    *len *= 2;
+    buf = (char*)ckrealloc(buf,((*len)+1)*sizeof(char));
+    *seg = buf;
+  }
+
+  buf[pos] = c;
+}
+
+
+
+// Expand a trace into a gapped pairwise alignment of one chain piece.
+//
+// Walks the zero-terminated 'trace' and builds two strings, one from
+// aseq and one from bseq, covering the piece's [abpos,aepos) and
+// [bbpos,bepos) ranges.  A negative trace entry puts a '-' into the A
+// string, any other entry puts a '-' into the B string.  The strings
+// live in static buffers selected by 'which' and are overwritten by the
+// next call using the same 'which'.
+//
+static PAIRALIGN *construct_pair_align(const char *aseq,
+                                       const char *bseq,
+                                       Local_Overlap *O,
+                                       int piece,
+                                       int *trace,
+                                       int which){
+  static char *aseg[2]={NULL,NULL},*bseg[2]={NULL,NULL};
+  static int alen[2]={0,0},blen[2]={0,0};
+  static PAIRALIGN pairalign[2];
+
+  const Local_Segment *seg = &(O->chain[piece].piece);
+
+  const int abeg = seg->abpos;
+  const int bbeg = seg->bbpos;
+  const int aend = seg->aepos;
+  const int bend = seg->bepos;
+
+  int ain  = abeg;   // next unread position in aseq
+  int bin  = bbeg;   // next unread position in bseq
+  int aout = 0;      // next write position in the A string
+  int bout = 0;      // next write position in the B string
+  int t;
+
+  // First use of this slot: give both strings an initial capacity.
+  if(aseg[which]==NULL){
+    alen[which]=blen[which]=1000;
+    aseg[which]=(char*)ckalloc((alen[which]+1)*sizeof(char));
+    bseg[which]=(char*)ckalloc((blen[which]+1)*sizeof(char));
+  }
+
+  for(t=0; trace[t]!=0; t++){
+    if(trace[t]<0){
+      // Copy A up to the gap position, then emit the gap.
+      const int stop = -trace[t]+abeg-1;
+      while(ain<stop){
+        safe_add_to_seg(&(aseg[which]),aout,aseq[ain],&(alen[which]));
+        aout++;
+        ain++;
+      }
+      safe_add_to_seg(&(aseg[which]),aout,'-',&(alen[which]));
+      aout++;
+    } else {
+      // Copy B up to the gap position, then emit the gap.
+      const int stop = trace[t]+bbeg-1;
+      while(bin<stop){
+        safe_add_to_seg(&(bseg[which]),bout,bseq[bin],&(blen[which]));
+        bout++;
+        bin++;
+      }
+      safe_add_to_seg(&(bseg[which]),bout,'-',&(blen[which]));
+      bout++;
+    }
+  }
+
+  // Whatever remains of each sequence after the last gap.
+  while(ain<aend){
+    safe_add_to_seg(&(aseg[which]),aout,aseq[ain],&(alen[which]));
+    aout++;
+    ain++;
+  }
+  while(bin<bend){
+    safe_add_to_seg(&(bseg[which]),bout,bseq[bin],&(blen[which]));
+    bout++;
+    bin++;
+  }
+
+  assert(ain==aend);
+  assert(bin==bend);
+  // every consumed base plus every gap accounts for one output column
+  assert(ain-abeg+
+         bin-bbeg+
+         t ==
+         aout+bout);
+
+  safe_add_to_seg(&(aseg[which]),aout,'\0',&(alen[which]));
+  safe_add_to_seg(&(bseg[which]),bout,'\0',&(blen[which]));
+
+  pairalign[which].aseg=aseg[which];
+  pairalign[which].bseg=bseg[which];
+  return(&pairalign[which]);
+}
+
+
+// Build the gapped alignment for chain element 'piece': obtain its
+// trace, then expand the trace into a PAIRALIGN.  Returns NULL when no
+// trace could be obtained.  'which' is forwarded to both steps.
+static PAIRALIGN *get_align(const char *aseq,const char *bseq,Local_Overlap *O,int piece,
+                            int which){
+  int *trace=get_trace(aseq,bseq,O,piece,which);
+
+  if(trace != NULL)
+    return(construct_pair_align(aseq,bseq,O,piece,trace,which));
+
+  return NULL;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+// Pseudo-delete chain element piece0: zero its gaps and collapse it to
+// an empty segment at the end of the preceding element (or at 0,0 when
+// it is the first), then recompute piece1's gaps relative to it.
+static void pseudo_delete_first_piece(Local_Overlap *O,int piece0,int piece1){
+  O->chain[piece0].agap=0;
+  O->chain[piece0].bgap=0;
+  if(piece0>0){
+    O->chain[piece0].piece.abpos=O->chain[piece0-1].piece.aepos;
+    O->chain[piece0].piece.aepos=O->chain[piece0-1].piece.aepos;
+    O->chain[piece0].piece.bbpos=O->chain[piece0-1].piece.bepos;
+    O->chain[piece0].piece.bepos=O->chain[piece0-1].piece.bepos;
+  } else {
+    O->chain[piece0].piece.abpos=0;
+    O->chain[piece0].piece.aepos=0;
+    O->chain[piece0].piece.bbpos=0;
+    O->chain[piece0].piece.bepos=0;
+  }
+  O->chain[piece1].agap=O->chain[piece1].piece.abpos-O->chain[piece0].piece.aepos;
+  O->chain[piece1].bgap=O->chain[piece1].piece.bbpos-O->chain[piece0].piece.bepos;
+}
+
+// Pseudo-delete chain element piece1: zero its gaps and collapse it to
+// an empty segment at the end of piece0, then recompute the gaps of the
+// element after piece1 (if there is one) relative to piece0's end.
+static void pseudo_delete_second_piece(Local_Overlap *O,int piece0,int piece1){
+  O->chain[piece1].agap=0;
+  O->chain[piece1].bgap=0;
+  O->chain[piece1].piece.abpos=O->chain[piece0].piece.aepos;
+  O->chain[piece1].piece.aepos=O->chain[piece0].piece.aepos;
+  O->chain[piece1].piece.bbpos=O->chain[piece0].piece.bepos;
+  O->chain[piece1].piece.bepos=O->chain[piece0].piece.bepos;
+
+  if(piece1+1<=O->num_pieces){
+    O->chain[piece1+1].agap=O->chain[piece1+1].piece.abpos-
+      O->chain[piece0].piece.aepos;
+    O->chain[piece1+1].bgap=O->chain[piece1+1].piece.bbpos-
+      O->chain[piece0].piece.bepos;
+  }
+}
+
+// Resolve an overlap between two chain elements, piece0 and piece1, by
+// trimming the end of piece0 and the start of piece1.
+//
+// Both pieces are aligned; the trim point is then chosen to minimise
+// the mismatch columns kept from piece0 plus those kept from piece1.
+// If an alignment cannot be built, or aligning shifts the pieces so
+// that one contains the other, the offending piece is pseudo-deleted
+// (collapsed to an empty segment) instead.  O is modified in place.
+//
+void fix_overlapping_pieces(const char *aseq, const char *bseq,
+                            Local_Overlap *O,int piece0, int piece1){
+
+  PAIRALIGN *pair_align1,*pair_align2;
+
+  int offseta1,offsetb1,offseta2,offsetb2;
+  int bestend1a,bestend1b,bestbeg2a,bestbeg2b;
+  int into1,into2,bestinto2=0;
+  int errs1,errs2,minerrs;
+
+
+  assert(O->chain[piece0].piece.aepos>=O->chain[piece1].piece.abpos||
+         O->chain[piece0].piece.bepos>=O->chain[piece1].piece.bbpos);
+
+  assert(O->chain[piece0].piece.aepos<=O->chain[piece1].piece.aepos);
+  assert(O->chain[piece0].piece.bepos<=O->chain[piece1].piece.bepos);
+
+  /* create alignments for the two segments */
+
+  pair_align1=get_align(aseq,bseq,O,piece0,0);
+  pair_align2=get_align(aseq,bseq,O,piece1,1);
+
+  if(pair_align1 == NULL || pair_align2 == NULL){
+    /* %p requires void* arguments */
+    fprintf(stderr,"EXCEPTION pair_align1=%p pair_align2=%p\n", (void *)pair_align1, (void *)pair_align2);
+    fprintf(stderr,"EXCEPTION while fixing gap(%d,%d) (%d,%d)---(%d,%d) vs. gap(%d,%d) (%d,%d)---(%d,%d)\n",
+            O->chain[piece0].agap, O->chain[piece0].bgap,
+            O->chain[piece0].piece.abpos,O->chain[piece0].piece.bbpos,
+            O->chain[piece0].piece.aepos,O->chain[piece0].piece.bepos,
+            O->chain[piece1].agap, O->chain[piece1].bgap,
+            O->chain[piece1].piece.abpos,O->chain[piece1].piece.bbpos,
+            O->chain[piece1].piece.aepos,O->chain[piece1].piece.bepos);
+  }
+
+  if(pair_align1 == NULL){
+    fprintf(stderr,"EXCEPTION Fixing by pseudo-deleting piece0.\n");
+    pseudo_delete_first_piece(O,piece0,piece1);
+    return;
+  }
+  if(pair_align2 == NULL){
+    fprintf(stderr,"EXCEPTION Fixing by pseudo-deleting piece1.\n");
+    pseudo_delete_second_piece(O,piece0,piece1);
+    return;
+  }
+
+
+  /* if, in finding the alignments, we shift the ends of the
+     alignment of the first segment to before the starts of the
+     alignment of the second segment, then the overlap has been
+     resolved, so we do nothing more */
+
+  if(!(O->chain[piece0].piece.aepos>=O->chain[piece1].piece.abpos||
+       O->chain[piece0].piece.bepos>=O->chain[piece1].piece.bbpos)){
+
+    return;
+  }
+
+  /* if, in finding the alignments, we shift the end of the
+     alignment of the second segment to before the end of the
+     alignment of the first segment, then the second is contained
+     in the first and we need to do something exceptional;
+     the most heuristic, but consistent with the practice elsewhere
+     in the local overlapper, is to pseudo-delete the second segment */
+
+  if(!(O->chain[piece0].piece.aepos<=O->chain[piece1].piece.aepos)||
+     !(O->chain[piece0].piece.bepos<=O->chain[piece1].piece.bepos)){
+
+    pseudo_delete_second_piece(O,piece0,piece1);
+    return;
+  }
+
+
+  /* if, in finding the alignments, we shift the start of the
+     alignment of the first segment to after the start of the
+     alignment of the second segment, then the first is contained
+     in the second and we need to do something exceptional;
+     the most heuristic, but consistent with the practice elsewhere
+     in the local overlapper, is to pseudo-delete the first segment */
+
+  if(O->chain[piece0].piece.abpos>O->chain[piece1].piece.abpos||
+     O->chain[piece0].piece.bbpos>O->chain[piece1].piece.bbpos){
+
+    pseudo_delete_first_piece(O,piece0,piece1);
+    return;
+  }
+
+  /* find start of region for evaluation in first alignment */
+  /* when done,
+     offseta1 and offsetb1 should be the offsets into the sequences
+     such that they correspond to a column in the alignment of the
+     first segment and that column contains the first possible
+     overlap with the second segment */
+
+  offseta1=O->chain[piece0].piece.abpos;
+  offsetb1=O->chain[piece0].piece.bbpos;
+  into1=0;
+  while(offseta1<O->chain[piece1].piece.abpos&&
+        offsetb1<O->chain[piece1].piece.bbpos){
+    assert(pair_align1->aseg[into1]!='\0');
+    assert(pair_align1->bseg[into1]!='\0');
+    if(pair_align1->aseg[into1]!='-')offseta1++;
+    if(pair_align1->bseg[into1]!='-')offsetb1++;
+    into1++;
+  }
+
+  /* count mismatches in the second alignment */
+
+  into2=0;
+  errs2=0;
+  while(pair_align2->aseg[into2]!='\0'){
+    assert(pair_align2->bseg[into2]!='\0');
+    if(pair_align2->aseg[into2]!=pair_align2->bseg[into2]){
+      errs2++;
+    }
+    into2++;
+  }
+
+  /* initialize solution variables and auxiliaries */
+  into2=0;
+  errs1 = (pair_align1->aseg[into1]!=pair_align1->bseg[into1] ? 1 : 0);
+  minerrs=errs2;
+  offseta2=O->chain[piece1].piece.abpos;
+  offsetb2=O->chain[piece1].piece.bbpos;
+
+  /* When both pieces begin at the same A or B coordinate the scan
+     above consumes no columns and into1 is still 0; there is then no
+     previous column to look at, so do not index [into1-1]. */
+  bestend1a=offseta1 - ((into1>0 && pair_align1->aseg[into1-1]!='-') ? 1 : 0);
+  bestend1b=offsetb1 - ((into1>0 && pair_align1->bseg[into1-1]!='-') ? 1 : 0);
+  bestbeg2a=offseta2;
+  bestbeg2b=offsetb2;
+
+  /* while there is potential overlap still to come ... */
+
+  while(pair_align1->aseg[into1]!='\0'&&pair_align2->aseg[into2]!='\0'){
+
+    // Once, we did the following assert, assuming that the alignment
+    // of pair_align2 would not run out before pair_align1, since otherwise
+    // there would be a containment or some such that shouldn't happen;
+    // But, as luck would have it, alignment trimming quirks etc can
+    // make it happen.  So ... no more assert
+    //
+    // assert(pair_align2->aseg[into2]!='\0');
+
+    /* while a position in the second segment is no greater than
+       the position in the first segment,
+       check for mismatch in second segment,
+       counting errors,
+       incrementing the sequence position counters as appropriate;
+       advance the second segment
+       position */
+
+    while(offseta1>=offseta2||offsetb1>=offsetb2){
+      errs2-= (pair_align2->aseg[into2]!=pair_align2->bseg[into2] ? 1 : 0);
+      offseta2+= ( pair_align2->aseg[into2]!='-' ? 1 : 0 );
+      offsetb2+= ( pair_align2->bseg[into2]!='-' ? 1 : 0 );
+      into2++;
+      if(pair_align2->aseg[into2]=='\0'){
+        break;
+      }
+    }
+
+    /* a new best split must end the first segment on a matching column */
+    if(errs1+errs2<=minerrs&&
+       pair_align1->aseg[into1]==pair_align1->bseg[into1]){
+      minerrs=errs1+errs2;
+      bestend1a=offseta1;
+      bestend1b=offsetb1;
+      bestbeg2a=offseta2;
+      bestbeg2b=offsetb2;
+      bestinto2=into2;
+    }
+
+    /* while the positions in the first segment are no greater than
+       the positions in the second segment,
+       check for mismatch in first segment,
+       counting errors,
+       incrementing the sequence position counters as appropriate;
+       advance the first segment
+       position */
+
+    while(offseta1<offseta2&&offsetb1<offsetb2){
+      offseta1+= ( pair_align1->aseg[into1]!='-' ? 1 : 0 );
+      offsetb1+= ( pair_align1->bseg[into1]!='-' ? 1 : 0 );
+      into1++;
+      errs1+= (pair_align1->aseg[into1]!=pair_align1->bseg[into1] ? 1 : 0);
+
+      if(pair_align1->aseg[into1]=='\0'){
+        break;
+      }
+    }
+  }
+
+  if(bestend1a<O->chain[piece0].piece.aepos)
+    bestend1a++;
+  if(bestend1b<O->chain[piece0].piece.bepos)
+    bestend1b++;
+  O->chain[piece0].piece.aepos=bestend1a;
+  O->chain[piece0].piece.bepos=bestend1b;
+  O->chain[piece1].piece.abpos=bestbeg2a;
+  O->chain[piece1].piece.bbpos=bestbeg2b;
+  O->chain[piece1].agap=bestbeg2a-bestend1a;
+  O->chain[piece1].bgap=bestbeg2b-bestend1b;
+
+  assert(O->chain[piece1].agap>=0);
+  assert(O->chain[piece1].bgap>=0);
+  assert(O->chain[piece1].agap==0||O->chain[piece1].bgap==0);
+
+
+  // now, adjust the beginning of the second piece to skip any mismatches
+  while(pair_align2->aseg[bestinto2]!=pair_align2->bseg[bestinto2]&&
+        pair_align2->aseg[bestinto2]!='\0'){
+    bestbeg2a += ( pair_align2->aseg[bestinto2]!='-' ? 1 : 0 );
+    bestbeg2b += ( pair_align2->bseg[bestinto2]!='-' ? 1 : 0 );
+    bestinto2++;
+  }
+  O->chain[piece1].piece.abpos=bestbeg2a;
+  O->chain[piece1].piece.bbpos=bestbeg2b;
+  O->chain[piece1].agap=bestbeg2a-bestend1a;
+  O->chain[piece1].bgap=bestbeg2b-bestend1b;
+
+  assert(O->chain[piece1].piece.abpos<=O->chain[piece1].piece.aepos);
+  assert(O->chain[piece1].piece.bbpos<=O->chain[piece1].piece.bepos);
+}
diff --git a/atac-driver/chainer/localalign/localAlignerInterfacemodule.C b/atac-driver/chainer/localalign/localAlignerInterfacemodule.C
new file mode 100644
index 0000000..ffdde01
--- /dev/null
+++ b/atac-driver/chainer/localalign/localAlignerInterfacemodule.C
@@ -0,0 +1,234 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Author: Clark Mobarry
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <Python.h>
+
+#include "GF_ALN_local.H"
+
+static Local_Overlap *desc = NULL;
+
+
+// This should be in the "library" not in the client.  Sigh.
+//
+// Compute a chain of local alignment segments between Aseq[Astart,Astop)
+// and Bseq[Bstart,Bstop), allowing matches with error up to 'erate'.
+//
+// The resulting chain is stored in the file-static 'desc', replacing
+// (and releasing) the result of any previous call; it is read back one
+// segment at a time with iterate_Local_Overlap().  'desc' is left NULL
+// when no segments or no chain are found.
+
+void
+syntenicSegments(char const * const Aseq, int const Astart, int const Astop,
+                 char const * const Bseq, int const Bstart, int const Bstop,
+                 double const erate) {
+
+  //  Release the previous result (Find_Local_Overlap returns a single
+  //  malloc'd block) and leave desc NULL in case an early exit happens!
+  free(desc);
+  desc = NULL;
+
+  // Key data types ("Local_Segment" and "Local_Overlap") are defined
+  // in "CA_ALN_local.H"
+
+  assert(Astop >= Astart);
+  assert(Bstop >= Bstart);
+
+  // Step 1: get local segments:
+  char const * const Ausable = Aseq + Astart;
+  char const * const Busable = Bseq + Bstart;
+
+  int const alen = Astop - Astart;
+  int const blen = Bstop - Bstart;
+
+  int NumSegs = 0;  /* number of local matches returned */
+
+  Local_Segment *local_results = Find_Local_Segments(Ausable,    /* sequence A */
+                                                     alen,
+                                                     Busable,
+                                                     blen,
+                                                     LOCAL_FORW, /* whether to compute a forward search , reverse, or both */
+                                                     16,         /* minimum length of a reportable match */
+                                                     erate,      /* maximum error for a match to be returned */
+                                                     &NumSegs);  /* number of local matches returned */
+
+  if(NumSegs==0)
+    return;
+
+  // Step 2: get a chain of local segments:
+
+  Local_Overlap *Ov = Find_Local_Overlap(alen,          /* length of sequence A */
+                                         blen,          /* length of sequence B */
+                                         0,             /* comp==0 -> fwd orientation */
+                                         0,             /* nextbest==0 -> find best overlap*/
+                                         local_results, /* the input set of local segments */
+                                         NumSegs,       /* number of input local segments */
+                                         20 - 6,        /* shortest "overlap" to report" */
+                                         1.0);          /* fraction of overlap not in a match -- needs to be large to allow substantial mismatches */
+
+  if(Ov == NULL)
+    return;
+
+  // Step 3 (optional):
+  //
+  // NOT optional!  AS_Local_Trace seems to have been extended to
+  // clean up segments.
+  //
+  // a) fix the chain of segments so that the segments don't overlap.
+  // It must be a 1-1 mapping.  (can either trim or delete segments--or
+  // leave them completely alone)
+  //
+  // b) construct an alignment "trace"
+  //
+  // The "trace" is the standard "AS" encoding of an alignment.
+
+  // coordinate munge between Gene's local aligner and
+  // DP_Compare()-related routines coordinates from Find_Local
+  // routines will be one off from those expected by the trace
+  // routines, so adjust them!
+  //
+  // Only the num_pieces real segments are shifted; the extra trailing
+  // chain element is left untouched.
+
+  for(int i=0;i<Ov->num_pieces;i++){
+    Ov->chain[i].piece.abpos++;
+    Ov->chain[i].piece.bbpos++;
+    Ov->chain[i].piece.aepos++;
+    Ov->chain[i].piece.bepos++;
+  }
+
+  // AS_Local_Trace assumes string pointer one before start of string!
+  //
+  // The original used to complain if no trace was returned, but we
+  // don't care...and, in fact, we disabled trace generation anyway!
+  //
+  AS_Local_Trace(Ov, Ausable - 1, Busable - 1);
+
+  // Undo the coordinate shift.
+  for(int i=0;i<Ov->num_pieces;i++){
+    Ov->chain[i].piece.abpos--;
+    Ov->chain[i].piece.bbpos--;
+    Ov->chain[i].piece.aepos--;
+    Ov->chain[i].piece.bepos--;
+  }
+
+  // Start iteration at the first piece.
+  Ov->next = 0;
+
+  desc = Ov;
+}
+
+
+
+
+// Fetch the next non-empty segment of the chain stored in 'desc'.
+//
+// Each call advances desc->next.  The output references are written
+// for every segment visited, including collapsed ("deleted in-place")
+// ones that are then skipped.  Returns 1 when the outputs describe a
+// usable segment and 0 when there is no chain or nothing is left.
+//
+int iterate_Local_Overlap(int &seg_abpos, int &seg_bbpos,
+                          int &seg_alen, int &seg_blen,
+                          double &seg_error) {
+
+  if (desc == NULL)
+    return(0);
+
+  Local_Chain *chain = desc->chain;
+
+  assert(NULL != desc->chain);
+
+  while ((desc->next >= 0) && (desc->next < desc->num_pieces)) {
+    int const idx = desc->next;
+    desc->next = idx + 1;
+
+    Local_Segment *seg = &(chain[idx].piece);
+
+    assert(NULL != seg);
+    assert(!chain[idx].reversed);
+
+    // Publish this segment to the caller.
+    seg_abpos = seg->abpos;
+    seg_bbpos = seg->bbpos;
+    seg_alen  = seg->aepos - seg->abpos;
+    seg_blen  = seg->bepos - seg->bbpos;
+    seg_error = seg->error;
+
+    // A segment with extent on either sequence is real; anything else
+    // was collapsed in place and is passed over.
+    if ((seg->aepos > seg->abpos) || (seg->bepos > seg->bbpos))
+      return(1);
+  }
+
+  // Nothing left.
+  return(0);
+}
+
+
+
+
+// Python entry point: syntenicSegments(outfile, Aseq, Astart, Astop,
+//                                      Bseq, Bstart, Bstop, erate)
+//
+// Parses the arguments and runs syntenicSegments() on the two
+// substrings.  The first argument is parsed but not used here.
+// Returns None on success; on failure returns NULL with a Python
+// exception set.
+//
+static PyObject *
+spam_syntenicSegments(PyObject *self, PyObject *args) {
+  //  The "s" format yields pointers to read-only character data.
+  const char *Aseq   = "undefined";
+  int         Astart = -1;
+  int         Astop  = -1;  // substring of Aseq
+  const char *Bseq   = "undefined";
+  int         Bstart = -1;
+  int         Bstop  = -1;  // substring of Bseq
+  double      erate  = 1.0 / 3.0;
+
+  PyObject *py_outfile = NULL;
+
+  if (!PyArg_ParseTuple(args, "Osiisiid", &py_outfile, &Aseq, &Astart, &Astop, &Bseq, &Bstart, &Bstop, &erate))
+    return NULL;
+
+  try {
+    syntenicSegments(Aseq, Astart, Astop,  // substring of Aseq
+                     Bseq, Bstart, Bstop,  // substring of Bseq
+                     erate);
+  } catch (...) {
+    //  A function that sets an exception must return NULL so the
+    //  interpreter raises it.
+    PyErr_SetString(PyExc_RuntimeError,"sytenicSegments failed");
+    return(NULL);
+  }
+
+  Py_INCREF(Py_None);  // This is a module function returning void.
+  return(Py_None);
+}
+
+
+
+// Python entry point: iterateSegments()
+//
+// Returns the next segment as a tuple (begin1, begin2, length1,
+// length2, error), or None once iterate_Local_Overlap() reports that
+// nothing is left.
+//
+static PyObject *
+spam_iterateSegments(PyObject *self, PyObject *args) {
+  int    bgn1 = 0, bgn2 = 0;
+  int    len1 = 0, len2 = 0;
+  double err  = 0.0;
+
+  PyObject *result;
+
+  if (!iterate_Local_Overlap(bgn1, bgn2, len1, len2, err)) {
+    // Exhausted: hand back a new reference to None.
+    Py_INCREF(Py_None);
+    result = Py_None;
+  } else {
+    result = Py_BuildValue("(iiiid)", bgn1, bgn2, len1, len2, err);
+  }
+
+  return(result);
+}
+
+
+
+// Method table for the Python module: each row gives the Python-visible
+// name, the C function implementing it, the calling convention and a
+// doc string.  The all-NULL row terminates the table.
+static
+PyMethodDef
+registration_table[] = {
+ {"syntenicSegments", spam_syntenicSegments, METH_VARARGS, "Compute syntenic segments"},
+ {"iterateSegments", spam_iterateSegments, METH_VARARGS, "Iterator returning syntenic segments"},
+ {NULL, NULL, 0, NULL}
+};
+
+
+// Module initialisation function: registers the methods in
+// registration_table under the module name "localAlignerInterface".
+// Declared extern "C" so the symbol name is not mangled.
+extern "C"
+void initlocalAlignerInterface() {
+ Py_InitModule("localAlignerInterface", registration_table);
+}
+
diff --git a/atac-driver/chainer/python/AtacDriver.py b/atac-driver/chainer/python/AtacDriver.py
new file mode 100755
index 0000000..5a7bc14
--- /dev/null
+++ b/atac-driver/chainer/python/AtacDriver.py
@@ -0,0 +1,602 @@
+#!/usr/bin/env python
+
+"""
+The environmental variable "PYTHONPATH" is a colon separated list
+of directories of imported Python modules (*.py) and C/C++ shared
+libraries (*.so for Unix or *.dll for Windows).
+
+Written by Clark Mobarry, Applied Biosystems, 2002-2004.
+"""
+
+"""
+Known issues:
+(1) I need to remove from parameters from the output: /inpname=, /outname=,.
+(4) The checkpointing scheme assumes that a previous existing checkpoint file is GOOD.
+"""
+
+import os, sys, time, getopt, tempfile
+import MyFile
+import MatchRecord
+import AtacFile
+import IdxStore
+import UniqueFilter
+import PerfectRuns
+import TrimMatchOverlaps
+import squeezeIntraRunGaps
+import localAlignerInterface
+import fillIntraRunGaps
+
+#import dedashMatches
+
+STDERR=sys.stderr
+STDOUT=sys.stdout
+
+def die(message):
+ print >>STDERR, message
+ os.exit(1)
+
+def cvm(f,x,y):
+ # A cvm variant (flag ? y : x) = (x,y)[f]
+ if f :
+ return x
+ else:
+ return y
+ # end if
+# end def
+
+
+
+class GlobalParam:
+ def __init__(self,line):
+ pass
+ def __str__ (self):
+ return "/%s=%s" % (self._key,self._value)
+ def get(self):
+ return (self._key,self._value)
+ def put(self,key,value):
+ (self._key,self._value) = (key,value)
+
+
+ # Print the one-line usage message to STDERR.  Any arguments are accepted
+ # and ignored.
+ def usage (*_):
+ print >>STDERR, "Usage: atacdriver.py matchFilePrefix"
+ # end def
+
+def filterByMatchLength( inpfile, outfile, minimum_length):
+ "Only keep matches that are long enough."
+ inpfile.seek(0)
+ for line in inpfile:
+ if(line[0] == 'M'):
+ FM = MatchRecord.MatchRecord(line)
+ if (FM.x_length >= minimum_length and
+ FM.y_length >= minimum_length ):
+ print >>outfile, FM
+ # end if
+ # end if
+ # end for
+# end def
+
+
+ # Keep only the matches that belong to sufficiently long runs.
+ #
+ # The 'M' records of inpfile are read in order.  Consecutive records with the
+ # same runid form a run, and x_length is accumulated per run.  Matches are
+ # buffered until the accumulated length reaches lengthThreshold; at that point
+ # the buffered matches and the current match are written to the output file,
+ # as is every later match of the same run.  Matches still buffered when the
+ # runid changes are written to a rejects file, which is closed and not returned.
+ #
+ # Returns the MyFile.myfile() holding the kept matches.
+ #
+ # NOTE(review): outname is not used, and matches still buffered when the input
+ # ends are written to neither file.
+ def onlyKeepLongRuns ( inpfile, outname, lengthThreshold ):
+ outfile = MyFile.myfile()
+ rejectsfile = MyFile.myfile()
+
+ # FL is the previous match, store the buffered matches of the current run,
+ # lenInMatches the x_length accumulated for the current run.
+ FL = None
+ store = []
+ lenInMatches = 0
+ inpfile.seek(0)
+ for line in inpfile:
+ if(line[0] == 'M'):
+ FM = MatchRecord.MatchRecord(line)
+ SL = FM.x_length
+ if FL != None and FL.runid != FM.runid :
+ # A new run starts: the previous run never reached the threshold
+ # for whatever is still buffered, so reject it.
+ for x in store:
+ print >>rejectsfile, x
+ # end for
+ store = []
+ lenInMatches = SL
+ else:
+ lenInMatches += SL
+ # end if
+
+ if lenInMatches < lengthThreshold:
+ store.append(FM)
+ else:
+ # Threshold reached: release the buffer, then the current match.
+ for x in store:
+ print >>outfile, x
+ # end for
+ store = []
+ print >>outfile, FM
+ # end if
+ FL = FM
+ # end if
+ # end for
+ rejectsfile.close()
+ return outfile
+ # end def
+
+ # Merge consecutive 'M' records of inpfile into bounding boxes and write one
+ # record per box to outfile.
+ #
+ # A record extends the current box when it is on the same x and y scaffolds as
+ # the previous record and one of its end points lies inside the current box,
+ # and, if needs_to_share_diagonal is true, it also lies on the same diagonal
+ # with the same relative orientation as the previous record.  Otherwise the
+ # current box is written out and a new box is started.
+ #
+ # The written record is the first record of the box with its start/length
+ # fields replaced by the box bounds and its subtype set to 'u' when
+ # needs_to_share_diagonal is 1/True and 'g' when it is 0/False (the value is
+ # used as a tuple index, so it must be one of those).
+ def coalesceMatches ( inpfile, outfile, needs_to_share_diagonal ):
+ "Coalesce overlapping and abutting matches within the same run."
+
+ # firstF: first record of the box being built; lastF: previous record.
+ firstF = None
+ lastF = None
+
+ # Left end point and orientation of the previous record.
+ lastLX = -3
+ lastLY = -4
+ lastForward = 0
+
+ # Bounds of the box being built.
+ lowHitPX = None
+ lowHitPY = None
+ hghHitPX = None
+ hghHitPY = None
+ inpfile.seek(0)
+ outfile.seek(0)
+ for line in inpfile:
+ if(line[0] == 'M'):
+ curF = MatchRecord.MatchRecord(line)
+ px = curF.x_start
+ nx = curF.x_length
+ py = curF.y_start
+ ny = curF.y_length
+ assert(px >= 0)
+ assert(nx >= 0)
+ assert(py >= 0)
+ assert(ny >= 0)
+ # Diagnostic output ahead of the assertion below: a diagonal match
+ # must have equal x and y lengths.
+ if (not (not needs_to_share_diagonal or nx == ny)):
+ print >>STDERR, 'Bombed on:'
+ print >>STDERR, str(curF)
+ print >>STDERR, 'needs_to_share_diagonal=' + str(needs_to_share_diagonal)
+ print >>STDERR, 'nx=' + str(nx) + ' ny=' + str(ny)
+ # end if
+ # The very first record (no box yet) is exempt from this check.
+ assert((hghHitPX == None or (not needs_to_share_diagonal) or nx == ny))
+ forward = (curF.x_orientation == curF.y_orientation)
+ # (lx,ly) and (rx,ry) are the end points of the match at its low and
+ # high x coordinate; for a reversed match the y ends are swapped.
+ lx = px
+ ly = cvm( forward, py, py + ny)
+ rx = px + nx
+ ry = cvm( forward, py + ny, py)
+
+ overlapping = ((lastF != None) and
+ (curF.x_scaf_uid == lastF.x_scaf_uid) and
+ (curF.y_scaf_uid == lastF.y_scaf_uid) and
+ (((lx >= lowHitPX and lx <= hghHitPX) and
+ (ly >= lowHitPY and ly <= hghHitPY)) or
+ ((rx >= lowHitPX and rx <= hghHitPX) and
+ (ry >= lowHitPY and ry <= hghHitPY))))
+ on_diagonal = ((forward == lastForward) and
+ ((lx - lastLX) == ((ly - lastLY) * cvm(forward, 1, -1))))
+ # print >>STDOUT, lastF, curF
+ # print >>STDOUT, lx,rx,ly,ry
+ # print >>STDOUT, lowHitPX,hghHitPX,lowHitPY,hghHitPY
+ # print >>STDOUT, "overlapping=",overlapping
+ # print >>STDOUT, "on_diagonal=",on_diagonal
+
+ # Extent of the current record.
+ lowMerPX = px
+ lowMerPY = py
+ hghMerPX = px + nx
+ hghMerPY = py + ny
+ if (not (overlapping and (not needs_to_share_diagonal or on_diagonal))):
+ # The current record cannot join the box: emit the box (if any)
+ # and start a new one from the current record.
+ if (firstF != None):
+ # if (lastF == None or firstF.runid != lastF.runid):
+ # end if
+ firstF.subtype = ('g','u')[needs_to_share_diagonal]
+ firstF.x_start = lowHitPX
+ firstF.y_start = lowHitPY
+ firstF.x_length = hghHitPX - lowHitPX
+ firstF.y_length = hghHitPY - lowHitPY
+ print >>outfile, firstF
+ # end if
+ firstF = curF
+ lowHitPX = lowMerPX
+ lowHitPY = lowMerPY
+ hghHitPX = hghMerPX
+ hghHitPY = hghMerPY
+ # end if
+ # Grow the box to cover the current record (min of lows, max of highs).
+ lowHitPX = cvm(lowHitPX < lowMerPX, lowHitPX, lowMerPX)
+ lowHitPY = cvm(lowHitPY < lowMerPY, lowHitPY, lowMerPY)
+ hghHitPX = cvm(hghHitPX > hghMerPX, hghHitPX, hghMerPX)
+ hghHitPY = cvm(hghHitPY > hghMerPY, hghHitPY, hghMerPY)
+
+ lastLX = lx
+ lastLY = ly
+ lastForward = forward
+ lastF = curF
+ # end if
+ # end for
+
+
+ # Emit the box that was still open when the input ended.
+ if (firstF != None):
+ firstF.subtype = ('g','u')[needs_to_share_diagonal]
+ firstF.x_start = lowHitPX
+ firstF.y_start = lowHitPY
+ firstF.x_length = hghHitPX - lowHitPX
+ firstF.y_length = hghHitPY - lowHitPY
+ print >>outfile, firstF
+
+ return
+ # end def
+
+
+
+# Note that if record has an initial rank for the X and Y sorting,
+# then re-sorting and box recovery are simplified.
+
+ # Resorting becomes making the initial sparse ranking dense then a
+# scattering to the destination.
+
+# NOTE THAT outname is unused here.
+
+ # Recover raw matches that fall inside the box spanned by two consecutive
+ # matches of the same run.
+ #
+ # inpfile holds the matches that survived filtering and rawfile the raw
+ # matches; both are rewound and read once.  A single iterator over rawfile is
+ # shared by all passes of the outer loop, so the raw records are consumed in
+ # step with inpfile.  Returns a new MyFile.myfile() holding the input matches
+ # plus the recovered raw matches.  outname is not used.
+ def boxRecovery( inpfile, rawfile, outname):
+ inpfile.seek(0)
+ rawfile.seek(0)
+ outfile = MyFile.myfile()
+
+ rawfileIter = iter(rawfile)
+
+ # This is a modified merge operation?
+ # The two input files must be sorted the same manner.
+ leftMatch = None
+ for line in inpfile:
+ if(line[0] == 'M'):
+ rightMatch = MatchRecord.MatchRecord(line)
+ if( leftMatch != None and leftMatch.inSameRunAs(rightMatch) ):
+ # Same run: read raw matches up to the one equal to rightMatch,
+ # keeping those that lie inside the box (leftMatch, rightMatch).
+ # print >>STDERR, "In same run leftMatch=", leftMatch, " rightMatch=", rightMatch
+ for rawline in rawfileIter:
+ if( rawline[0] == 'M'):
+ rawMatch = MatchRecord.MatchRecord(rawline)
+ if(rawMatch.sameAs(rightMatch)):
+ print >>outfile, rightMatch
+ break
+ else:
+ # print "Inside run rawMatch=", rawMatch
+ if(rawMatch.isInsideBox(leftMatch,rightMatch)):
+ print >>outfile, rawMatch
+ # end if
+ # end if
+ # end if
+ # end for
+ # We should die here if there is no rawMatch that matched the rightMatch ...
+ else:
+ # First match, or a new run: discard raw matches up to the one
+ # equal to rightMatch, then write rightMatch.
+ # print >>STDERR, "Between runs leftMatch=", leftMatch, " rightMatch=", rightMatch
+ for rawline in rawfileIter:
+ if( rawline[0] == 'M'):
+ rawMatch = MatchRecord.MatchRecord(rawline)
+ if(rawMatch.sameAs(rightMatch)):
+ print >>outfile, rightMatch
+ break
+ else:
+ # print >>STDERR, "Discard rawMatch=", rawMatch
+ pass
+ # end if
+ # end if
+ # end for
+ # We should die here if there is no rawMatch that matched the rightMatch ...
+ # Discard raw Matches until it is ge to the right match.
+ # end if
+ leftMatch = rightMatch
+ # end if
+ # end for
+ return outfile
+ # end def
+
+
+ # Driver for the chaining pipeline.  It extends AtacFile.AtacFile and uses the
+ # globals dictionary, the matches and runs files, and checkpoint() from it.
+ class AtacDriver(AtacFile.AtacFile):
+ # Run the pipeline steps in order, replacing self.matches with the output of
+ # each step that runs and extending outprefix with a per-step suffix.
+ #
+ # Every step is guarded by the same test: it runs when an earlier step has
+ # already run in this call (redo), or when its step number is above the
+ # 'ckpKeep' global and the globals hold no "AllDone" key.  Most steps end
+ # by writing a checkpoint file named after outprefix.
+ def runOld(self):
+ self.globals['atacAlgorithmVersion'] = str(17)
+ print >>STDERR, "runName = %s\n" % self.runName
+
+ # The ATAC globals used by this script:
+ opt_t = int(self.globals['globalMatchMinSize'])
+ opt_l = int(self.globals['globalPerfectRunMinLen'])
+ maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])
+
+ assemblyId1 = self.globals['assemblyId1']
+ assemblyId2 = self.globals['assemblyId2']
+
+ assemblyFile1 = self.globals['assemblyFile1']
+ assemblyFile2 = self.globals['assemblyFile2']
+
+ boxRecoveryOn = 0 # Deprecated for same species comparisons 2003/09/09.
+ if(self.globals.has_key("boxRecoveryOn")):
+ boxRecoveryOn = int(self.globals['boxRecoveryOn'])
+
+ t0 = time.time()
+
+ assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
+ assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
+ rawfile = None
+
+ ###################################################################
+ # Setup for checkpointing scheme.
+ # redo: set once any step has run, forcing all later steps to run.
+ # keep: steps numbered up to this value are skipped (from 'ckpKeep').
+ # step: number of the step about to be considered.
+ redo = 0
+ keep = 0
+ step = 0
+ if(self.globals.has_key("ckpKeep")):
+ keep = int(self.globals['ckpKeep'])
+ ckpName = "AllDone"
+ ###################################################################
+
+ print >>STDERR, 'Keep step=' + str(keep)
+ print >>STDERR, 'At step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+
+ outprefix = self.runName
+
+ step += 1
+ print >>STDERR, 'At uniqueFilter, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ # The unique filter runs unless the global 'uniqueFilterOn' is "0".
+ if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
+ print >>STDERR, 'Running UniqueFilter'
+ outfile = MyFile.myfile()
+ UniqueFilter.main( self.matches, outfile)
+ self.matches = outfile
+ outprefix += '.uniq'
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At filterByMatchLength, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'Running filterByMatchLength'
+ outfile = MyFile.myfile()
+ filterByMatchLength( self.matches, outfile, opt_t)
+ self.matches = outfile
+ outprefix += '.t' + str(opt_t)
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, "Start trimming for bp one-to-one-ness"
+ tempdata = MyFile.myfile()
+ TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
+ self.matches = tempdata
+ print >>STDERR, "Finished trimming for bp one-to-one-ness"
+ outprefix += '.trim'
+ self.checkpoint(outprefix)
+
+ if( boxRecoveryOn == 1 ):
+ # For box recovery later ... but what if we start from a checkpoint?
+ rawfile = self.matches
+
+ step += 1
+ print >>STDERR, 'At formPerfectRuns, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInXorderAP,
+ MatchRecord.sortInYorderAP,
+ maxdiff,
+ 'r')
+ self.matches = tempdata
+ outprefix += ".p6"
+ # NOTE(review): unlike most other steps, no checkpoint is written here.
+ # end if
+
+ step += 1
+ print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
+ tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
+ self.matches = tempdata
+ outprefix += '.l' + str(opt_l)
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At formPerfectRuns, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'Heal the perfect runs'
+ # Same call as the earlier formPerfectRuns step, but with the two
+ # sort functions passed in the opposite order.
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInYorderAP,
+ MatchRecord.sortInXorderAP, maxdiff, 'r')
+ self.matches = tempdata
+ outprefix += '.pr'
+ self.checkpoint(outprefix)
+
+ if(boxRecoveryOn == 1):
+
+ # This is a box recovery step.
+ step += 1
+ print >>STDERR, 'At boxRecovery, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
+ print >>STDERR, "Make sorted raw matches"
+ outfile = MyFile.myfile()
+ MatchRecord.sortInXorderAP( rawfile, outfile)
+ rawfile = outfile
+ print >>STDERR, "perform box recovery"
+ tempdata = boxRecovery( self.matches, rawfile, outprefix)
+ self.matches = tempdata
+ outprefix += '.br'
+ self.checkpoint(outprefix)
+ # end if
+
+ step += 1
+ print >>STDERR, 'At formPerfectRuns, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
+ print >>STDERR, "form perfect runs"
+ redo = 1
+ # NOTE(review): the message below names '.p6' but the suffix appended
+ # to outprefix further down is '.pr'.
+ print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInXorderAP,
+ MatchRecord.sortInYorderAP, maxdiff, 'r')
+ self.matches = tempdata
+ outprefix += '.pr'
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
+ tempdata = MyFile.myfile()
+ squeezeIntraRunGaps.mainLoop(
+ self.matches,
+ tempdata,
+ assemblyIdx1, assemblyIdx2)
+ tempy = MyFile.myfile()
+ # Beware the current match subtypes are 'x', 'L', and 'R'!
+ coalesceMatches( tempdata, tempy, 1)
+ self.matches = tempy
+ outprefix += '.sq'
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, "Start trimming for bp one-to-one-ness"
+ tempdata = MyFile.myfile()
+ TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
+ self.matches = tempdata
+ outprefix += '.trim'
+ print >>STDERR, "Finished trimming for bp one-to-one-ness"
+
+ step += 1
+ print >>STDERR, 'At RunsAsMatches, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ # This step fills self.runs; self.matches is left unchanged.
+ self.runs = PerfectRuns.runsAsMatches( self.matches)
+ outprefix += '.runs'
+ self.checkpoint(outprefix)
+ # end if
+
+ # The remaining two steps only run when the global 'fillIntraRunGapsOn' is "1".
+ if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
+
+ # Next comes the DNA sequence dependent stuff.
+ step += 1
+ print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, "fill the intrarun gaps"
+ # Defaults for the two tuning globals when they are not set.
+ if(not self.globals.has_key('fillIntraRunGapsErate')):
+ self.globals['fillIntraRunGapsErate'] = 0.10
+ if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
+ self.globals['fillIntraRunGapsMaxGap'] = 100000
+ fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
+ fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
+ tempdata = MyFile.myfile()
+ fillIntraRunGaps.mainLoop(self.matches, tempdata,
+ assemblyIdx1, assemblyIdx2,
+ fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
+ self.matches = tempdata
+ outprefix += '.fill'
+ self.checkpoint(outprefix)
+
+ step += 1
+ print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
+ print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
+ if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
+ redo = 1
+ print >>STDERR, "trim the overlaps"
+ tempdata = MyFile.myfile()
+ TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
+ self.matches = tempdata
+ outprefix += '.trim'
+ self.checkpoint(outprefix)
+
+ # end if "fillIntraRunGapsOn"
+
+ # end def
+
+ # end class
+
+
+class localExecutable :
+ def __init__(self, name):
+ self.name = name
+ def run(self,argline,inpfile,outfile):
+ cmd = "%s %s %s %s" % (self.name,argline,inpfile,outfile)
+ print >>STDERR,"cmd =", cmd
+ iret = os.system(cmd)
+ assert(iret == 0)
+
+
+def main(runName):
+
+ t0 = time.time()
+ obj = AtacDriver(runName)
+
+ t1 = time.time()
+ print >>STDERR, "Read checkpoint in %d seconds." % (t1-t0)
+ t0=t1
+
+ # The following are required:
+ assert(obj.globals.has_key('assemblyId1'))
+ assert(obj.globals.has_key('assemblyId2'))
+ assert(obj.globals.has_key('assemblyFile1'))
+ assert(obj.globals.has_key('assemblyFile2'))
+
+ assemblyId1 = obj.globals['assemblyId1']
+ assemblyId2 = obj.globals['assemblyId2']
+
+ assemblyFile1 = obj.globals["assemblyFile1"]
+ assemblyFile2 = obj.globals["assemblyFile2"]
+
+ assert(os.path.exists(assemblyFile1))
+ assert(os.path.exists(assemblyFile2))
+
+ if(not os.path.exists(assemblyFile1+".idxStore")):
+ IdxStore.createIndexedFasta( assemblyFile1, assemblyId2)
+
+ if(not os.path.exists(assemblyFile2+".idxStore")):
+ IdxStore.createIndexedFasta( assemblyFile2, assemblyId2)
+
+ assert(os.path.exists(assemblyFile1+".idxStore"))
+ assert(os.path.exists(assemblyFile2+".idxStore"))
+
+ if not obj.globals.has_key('matchesFile'):
+ print >>STDERR, "We need to make the raw matches."
+
+ if(not obj.globals.has_key('rawMatchMerSize')):
+ obj.globals['rawMatchMerSize'] = 20
+ if(not obj.globals.has_key('rawMatchMerMaxDegeneracy')):
+ obj.globals['rawMatchMerMaxDegeneracy'] = 1
+ if(not obj.globals.has_key('rawMatchMinFillSize')):
+ obj.globals['rawMatchMinSize'] = obj.globals['rawMatchMerSize']
+
+ # Many 2*rawMatchMerSize-1 matches are due to isolated single
+ # nucleotide mutations in otherwise perfect repeats.
+
+ if(not obj.globals.has_key('globalMatchMinSize')):
+ obj.globals['globalMatchMinSize'] = 2*int(obj.globals['rawMatchMerSize'])
+ if(not obj.globals.has_key('globalPerfectRunMinLen')):
+ obj.globals['globalPerfectRunMinLen'] = 100
+ if(not obj.globals.has_key('globalPerfectRunMaxGapLen')):
+ obj.globals['globalPerfectRunMaxGapLen'] = 100000
+
+ if(not obj.globals.has_key('intraRunGapIsolatedMismatchLen')):
+ obj.globals['intraRunGapIsolatedMismatchLen'] = 20
+
+ obj.runOld()
+
+ t1 = time.time()
+ print >>STDERR, "Ran in %d seconds." % (t1-t0)
+
+ obj.checkpoint(runName + ".chained.atac")
+
+
+
+if __name__ == '__main__':
+ if (len(sys.argv) == 1):
+ print >>sys.stderr, "usage: $sys.argv[0] file.atac"
+ sys.exit(1)
+ if (sys.argv[1] == "justtestingifitworks"):
+ sys.exit(0)
+ main(sys.argv[1])
+
diff --git a/atac-driver/chainer/python/AtacDriver.txt b/atac-driver/chainer/python/AtacDriver.txt
new file mode 100644
index 0000000..3c174f8
--- /dev/null
+++ b/atac-driver/chainer/python/AtacDriver.txt
@@ -0,0 +1,217 @@
+ SET GLOBALS, if not already set, and if using samespecies, parameter set 1
+ obj.globals["heavyChainsOn"] = "1"
+ obj.globals["matchExtenderOn"] = "1"
+ obj.globals["uniqueFilterOn"] = "1"
+ obj.globals["fillIntraRunGapsOn"] = "1"
+ obj.globals["numsegments"] = "1"
+
+ RUN BRIATAC HERE
+
+ MAKE SURE MATCHES ARE IN ATAC FORMAT
+
+ RUN HEAVYCHAINS, if enabled
+ -g /assemblyId1=XXXX
+ -g /assemblyId2=XXXX
+ -g /heavyMaxJump=XXXX (100000)
+ -g /heavyMinFill=XXXX (100)
+
+ RUN GLOBAL CHAINING, if enabled
+ # /work/assembly/floreald/ASM/src/Ross/chain-global
+ # /work/assembly/floreald/ASM/src/break-chains
+
+ All %s's are the prefix
+
+ chain-global %s -M 30 -p DP > %s.M30.dp 2> %s.M30.dp.errs"
+ break-chains %s.M30.dp -D 0 -M 10 -p DPR | grep -v 'M r ' > %s.M30.dp.runs 2> %s.M30.dp.runs.errs"
+
+ RUN CHAIN CONSERVATION(?), if enabled
+ # /work/assembly/floreald/ASM/src/Ross/chain-consv
+ # /work/assembly/floreald/ASM/src/break-chains
+
+ chain-consv %s -p CS > %s.cons 2> %s.cons.errs" % (inpname,inpname,inpname)
+ break-chains %s.cons -diffrun -D 0 -M 10 -p TMP > %s.cons.runs.tmp 2> %s.cons.runs.tmp.errs" % (inpname,inpname,inpname)
+ break-chains %s.cons.runs.tmp -D 0 -M 10 -p CSR | grep -v 'M r ' > %s.cons.runs 2> %s.cons.runs.errs" % (inpname,inpname,inpname)
+
+ RUN CHAIN GREEDY, if enabled
+ # /work/assembly/floreald/ASM/src/Ross/chain-greedy
+ # /work/assembly/floreald/ASM/src/break_chains
+
+ chain-greedy %s -p GR -M 10 -W 500 > %s.greedy 2> %s.greedy.errs" % (inpname, inpname, inpname)
+ break-chains %s.greedy -D 0 -M 10 -p GRR | grep -v 'M r ' > %s.greedy.runs 2> %s.greedy.runs.errs" % (inpname, inpname, inpname)
+
+ RUN MATCH EXTENDER, if enabled
+ matchextender inpname outname
+
+ SET SOME DEFAULTS (unless already set)
+ obj.globals['rawMatchMerSize'] = 20
+ obj.globals['rawMatchMerMaxDegeneracy'] = 1
+ obj.globals['rawMatchMinSize'] = obj.globals['rawMatchMerSize']
+
+ # Many 2*rawMatchMerSize-1 matches are due to isolated single
+ # nucleotide mutations in otherwise perfect repeats.
+ #
+ obj.globals['globalMatchMinSize'] = 2*int(obj.globals['rawMatchMerSize'])
+
+ obj.globals['globalPerfectRunMinLen'] = 100
+ obj.globals['globalPerfectRunMaxGapLen'] = 100000
+ obj.globals['intraRunGapIsolatedMismatchLen'] = 20
+
+ RUN OLD
+ self.globals['atacAlgorithmVersion'] = str(17)
+ print >>STDERR, "runName = %s\n" % self.runName
+
+ # The ATAC globals used by this script:
+ opt_t = int(self.globals['globalMatchMinSize'])
+ opt_l = int(self.globals['globalPerfectRunMinLen'])
+ maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])
+
+ assemblyId1 = self.globals['assemblyId1']
+ assemblyId2 = self.globals["assemblyId2"]
+
+ assemblyFilePrefix1 = self.globals['assemblyFilePrefix1']
+ assemblyFilePrefix2 = self.globals['assemblyFilePrefix2']
+
+ # Deprecated for same species comparisons 2003/09/09.
+ boxRecoveryOn = 0
+ if(self.globals.has_key("boxRecoveryOn")):
+ boxRecoveryOn = int(self.globals["boxRecoveryOn"])
+
+ BUILD IDXSTORE (assemblyIdx1) for the files
+
+ GENERALLY, after each step, the outfile replaces self.matches
+
+ STEP
+ RUN UNIQUE FILTER
+ UniqueFilter.main( self.matches, outfile)
+
+ STEP
+ RUN FILTER BY MATCH LENGTH
+ only keep those M records with both pieces at least as long as opt_t
+
+ STEP
+ RUN TRIMMING for bp one-to-one-ness (rewrite-trimMatches)
+ inpfile = self.matches
+ trim_subtype = 'u'
+
+ gp = MyFile.myfile()
+ MatchRecord.sortInXorderAP(inpfile,gp)
+
+ # The following coalescing assumes perfect runs.
+ # this is the same as rewrite-coalesceMatches
+
+ hp = MyFile.myfile()
+ coalesceMatches( gp, hp, ((trim_subtype == 'x') or (trim_subtype == 'u')) )
+
+ gp = MyFile.myfile()
+ trimMatchOverlapsInX(hp,gp,trim_subtype)
+
+ hp = MyFile.myfile()
+ MatchRecord.sortInYorderAP(gp,hp)
+
+ trimMatchOverlapsInY(hp,outfile,trim_subtype)
+
+
+ if boxRecoveryOn, save these self.matches (outfile from last step) for later
+
+
+ STEP
+ FORM PERFECT RUNS (rewrite-perfectRuns)
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInXorderAP,
+ MatchRecord.sortInYorderAP,
+ maxdiff,
+ 'r')
+
+ STEP
+ ONLY KEEP LONG RUNS
+ tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
+
+ description: find all runs (matches with the same run id)
+ that have a sum of lengths larger than opt_l
+
+ details:
+ if there is a last match, and it is a different runid
+ than this match, dump all the saved matches, reset
+ the length to zero
+
+ add in the length of this match to the length
+
+ if the length we've seen so far is
+ less than lengthThreshold, save this match
+
+ otherwise (the length is bigger) print all
+ saved matches, and this match. clear the list
+ of saved matches, but do not clear the length
+
+ remember the runid of this match (call it lastId)
+
+
+ STEP
+ 'HEAL' THE PERFECT RUNS (rewrite-perfectRuns)
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInYorderAP,
+ MatchRecord.sortInXorderAP, maxdiff, 'r')
+
+ STEP
+ DO BOX RECOVERY, if enabled (rewrite-boxRecovery)
+ print >>STDERR, "Make sorted raw matches"
+ outfile = MyFile.myfile()
+ MatchRecord.sortInXorderAP( rawfile, outfile)
+ rawfile = outfile
+
+ print >>STDERR, "perform box recovery"
+ tempdata = boxRecovery( self.matches, rawfile, outprefix)
+
+ form perfect runs again
+ tempdata = PerfectRuns.formPerfectRuns(self.matches,
+ MatchRecord.sortInXorderAP,
+ MatchRecord.sortInYorderAP, maxdiff, 'r')
+
+ STEP
+ SQUEEZE INTRA RUN GAPS
+ squeezeIntraRunGaps.squeezeIntraRunGaps(
+ self.matches,
+ tempdata,
+ assemblyIdx1, assemblyIdx2)
+
+ tempy = MyFile.myfile()
+
+ # Beware the current match subtypes are 'x', 'L', and 'R'!
+
+ coalesceMatches( tempdata, tempy, 1)
+
+ self.matches = tempy
+
+ STEP
+ TRIMMING FOR bp one-to-one-ness
+ # THIS IS ALSO DONE ABOVE! trimMatchOverlapsInBoth was a metafunction
+ TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
+
+ STEP
+ PERFECT RUNS AS MATCHES (rewrite-runsAsMatches)
+ self.runs = PerfectRuns.runsAsMatches( self.matches)
+
+ STEP
+ FILL INTRA RUN GAPS, if enabled
+
+ set defaults if not set
+ self.globals['fillIntraRunGapsErate'] = 0.10
+ self.globals['fillIntraRunGapsMaxGap'] = 100000
+
+ fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
+ fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
+
+ fillIntraRunGaps.mainLoop( self.matches, tempdata,
+ assemblyIdx1, assemblyIdx2,
+ fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
+
+ print >>STDERR, "trim the overlaps"
+ TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
+
+ self.matches = tempdata
+
+ STEP
+ COUNT NUMBER OF SUBSTITUTIONS
+ countMisMatches.countMisMatches(self.matches, tempdata, assemblyIdx1, assemblyIdx2)
+
+ALL DONE
diff --git a/atac-driver/chainer/python/AtacFile.py b/atac-driver/chainer/python/AtacFile.py
new file mode 100755
index 0000000..583bd07
--- /dev/null
+++ b/atac-driver/chainer/python/AtacFile.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# Looking in /usr/local/ir/bin on the Compaqs for the correct Python interpreter.
+# export PYTHONPATH=${PYTHONPATH}:$WORK/cds/IR/COMPASS/src/AtacPipeline
+
+"""
+Extensive documentation for the Python language is available at
+http://www.python.org.
+"""
+
+import os, sys, time, getopt, tempfile
+import MyFile
+import MatchRecord
+
+class AtacFile:
+
+ # The data flow is a pipeline augmented by two read-only indexed
+ # FASTA files.
+
+ def __init__( self, runName):
+ "You must supply a atac file called runName.atac."
+ self.runName = runName
+ self.comments = []
+ self.metacommands = []
+ self.globals = {}
+ self.tableformat = {}
+ self.tabledata = {}
+ self.matches = MyFile.myfile()
+ self.runs = MyFile.myfile()
+
+ fp = open(runName,"r")
+ for line in fp:
+ self.atac_file_parse_line(line)
+
+ def atac_file_parse_line( self, line):
+ line = line.strip()
+ if(not line):
+ return
+ # end if
+ linetype = line[0]
+ if(linetype == '#'):
+ # Just a comment: squirrel away or ignore
+ self.comments.append(line)
+ return
+ elif(linetype == '!'):
+ self.metacommands.append(line)
+ return
+ elif(linetype == '/'):
+ # Add to the globals dictionary
+ (key,value) = line[1:].split('=')
+ self.globals[key] = value.strip()
+ return
+ elif(linetype == '@'):
+ list = line[1:].split()
+ name = list[0]
+ self.tableformat[name] = list[1:]
+ self.tabledata[name] = [] # an empty list
+ return
+ elif(linetype == 'M'):
+ fields = line.split()
+ if(fields[1] == 'r'):
+ print >>self.runs, line
+ else:
+ print >>self.matches, line
+ return
+ elif(line == ''):
+ pass
+ else:
+ print >>sys.stderr, "The offending line:"
+ print >>sys.stderr, line
+ assert(0)
+ # end if
+ # end def
+
+ def checkpoint(self, filename):
+ self.globals["modificationDate"] = time.asctime()
+ fp = open(filename,"w")
+ for line in self.metacommands:
+ print >>fp, line
+ for line in self.comments:
+ print >>fp, line
+ # Output the globals in lexigraphical order.
+ list = []
+ for key in self.globals:
+ list.append("/" + key + "=" + str(self.globals[key]))
+ list.sort()
+ for line in list:
+ print >>fp, line
+ self.matches.seek(0)
+ for line in self.matches:
+ fp.write(line)
+ self.matches.seek(0)
+ self.runs.seek(0)
+ for line in self.runs:
+ fp.write(line)
+ self.runs.seek(0)
+ fp.close()
diff --git a/atac-driver/chainer/python/DNA.py b/atac-driver/chainer/python/DNA.py
new file mode 100644
index 0000000..c78bd4d
--- /dev/null
+++ b/atac-driver/chainer/python/DNA.py
@@ -0,0 +1,52 @@
+class DNA:
+ __doc__ = """Class representing DNA as a string sequence."""
+ basecomplement = {'A':'T', 'C':'G', 'G':'C', 'T':'A',
+ 'a':'t', 'c':'g', 'g':'c', 't':'a',
+ 'M':'K', 'R':'Y', 'W':'W', 'S':'S', 'Y':'R', 'K':'M',
+ 'm':'k', 'r':'y', 'w':'w', 's':'s', 'y':'r', 'k':'m',
+ 'V':'B', 'H':'D', 'D':'H', 'B':'V',
+ 'v':'b', 'h':'d', 'd':'h', 'b':'v',
+ 'N':'N', 'X':'X',
+ 'n':'n', 'x':'x',
+ '-':'-',}
+ # IUB encoding:
+ # M = A/C, R = A/G, W = A/T, S = C/G, Y = C/T, K = G/T,
+ # V = A/C/G, H = A/C/T, D = A/G/T, B = C/G/T, N/X = A/C/G/T,
+ # Celera encoding:
+ # m = -/A/C, r = -/A/G, w = -/A/T, s = -/C/G, y = -/C/T, k = -/G/T,
+ # v = -/A/C/G, h = -/A/C/T, d = -/A/G/T, b = -/C/G/T, n/x = -/A/C/G/T,
+
+ def __init__(self, s):
+ """Create DNA instance initialized to string s."""
+ self.seq = s
+ return
+ def transcribe(self):
+ """Return as RNA string."""
+ return self.seq.replace('T','U')
+ def reverse(self):
+ """Return DNA string in reverse order."""
+ letters = list(self.seq)
+ letters = letters.reverse()
+ return ''.join(letters)
+ def complement(self):
+ """Return the complementary DNA string."""
+ letters = list(self.seq)
+ letters = [self.basecomplement[base] for base in letters]
+ return ''.join(letters)
+ def reversecomplement(self):
+ """Return the reverse complement of the DNA string."""
+ letters = list(self.seq)
+ letters.reverse()
+ letters = [self.basecomplement[base] for base in letters]
+ return ''.join(letters)
+ def gc(self):
+ """Return the portion of DNA composed of G or C."""
+ s = self.seq
+ gc = s.count('G') + s.count('C')
+ return gc * 1. / len(s)
+ def codons(self):
+ """Return list of codons for the DNA string."""
+ s = self.seq
+ end = len(s) - (len(s) % 3) - 1
+ codons = [s[i:i+3] for i in range(0,end,3)]
+ return codons
diff --git a/atac-driver/chainer/python/IdxStore.py b/atac-driver/chainer/python/IdxStore.py
new file mode 100644
index 0000000..8c5b251
--- /dev/null
+++ b/atac-driver/chainer/python/IdxStore.py
@@ -0,0 +1,208 @@
+import os, sys, DNA
+
+#######################################################
+# Begin class methods
+#######################################################
+
+# bpw, 20050312 - .seqStore is exactly a compressed fasta file. Stop
+# building it and assume the input is compressed.
+
+def createIndexedFasta( prefix, nickname):
+    """Create the index file "<prefix>.idxStore" for the FASTA file prefix.
+
+    One index line is written per FASTA record:
+
+        uid def_length seq_length def_offset seq_offset
+
+    uid is the first whitespace-delimited token of the defline without
+    its leading '>'.  Lengths are taken from the whitespace-stripped
+    lines, and offsets advance by length + 1 for each of the defline
+    and the accumulated sequence.
+
+    NOTE(review): the offsets therefore only equal real byte positions
+    in prefix when each record is one defline plus one sequence line,
+    each ended by a single newline and free of surrounding whitespace
+    -- confirm against the producer of the input.
+
+    nickname is accepted for interface compatibility and is not used.
+    """
+
+    print >>sys.stderr, "Creating %s.idxStore" % (prefix)
+
+    def writeIndexRecord(idx, uid, defline, seqline, offset):
+        # Emit one index line and return the offset of the next record.
+        def_offset = offset
+        def_length = len(defline)
+        # + 1 accounts for the newline ending the defline.
+        seq_offset = def_offset + def_length + 1
+        seq_length = len(seqline)
+        print >>idx, uid, def_length, seq_length, def_offset, seq_offset
+        # + 1 accounts for the newline ending the sequence line.
+        return seq_offset + seq_length + 1
+
+    the_uid = None
+    defline = None
+    seqline = None
+    cur_offset = 0
+
+    FASTA = file( prefix, "r")
+    try:
+        IDXSTORE = file( prefix + ".idxStore", "w")
+        try:
+            for line in FASTA:
+                line = line.strip()
+                if(line[0:1] == ">"):
+                    if( the_uid != None ):
+                        # A previous record is pending; the first time
+                        # through the_uid is still None.
+                        assert(defline != None)
+                        assert(seqline != None)
+                        cur_offset = writeIndexRecord(
+                            IDXSTORE, the_uid, defline, seqline, cur_offset)
+                    # Start the new record.
+                    defline = line
+                    the_uid = line.split()[0][1:]
+                    seqline = ""
+                else:
+                    seqline += line   # Accumulate more DNA sequence
+
+            if(the_uid != None):
+                # Make sure the last accumulated record reaches disk.
+                cur_offset = writeIndexRecord(
+                    IDXSTORE, the_uid, defline, seqline, cur_offset)
+        finally:
+            # Close even when parsing fails so no handle is leaked.
+            IDXSTORE.close()
+    finally:
+        FASTA.close()
+# end def
+
+#######################################################
+# End class methods
+#######################################################
+
+class IdxStore:
+    __doc__ = "Class for fast access to multiFASTA files."
+
+    #######################################################
+    # Begin instance methods
+    #######################################################
+
+    def __init__(self,prefix,*optargs):
+        """Load "<prefix>.idxStore" and open prefix for random access.
+
+        optargs[0], when given, is a nickname; every record is then also
+        reachable under the key "<nickname>:<iid>", iid being the
+        0-based position of the record in the index file.
+        """
+        if(optargs):
+            self.nickname = optargs[0]
+        else:
+            self.nickname = None
+        # end if
+        self.uid2iid = {}          # uid (or nickname:iid) -> iid
+        self.iid2uid = []          # the remaining lists are indexed by iid
+        self.iid2def_length = []
+        self.iid2seq_length = []
+        self.iid2def_offset = []
+        self.iid2seq_offset = []
+
+        idxstore = file(prefix + ".idxStore", "r")
+        try:
+            the_iid = 0
+            for line in idxstore:
+                cols = line.split()
+                the_uid = cols[0]
+                # The columns are plain decimal integers; int() parses
+                # them without evaluating file content as Python code.
+                def_length = int(cols[1]); seq_length = int(cols[2])
+                def_offset = int(cols[3]); seq_offset = int(cols[4])
+                if(self.nickname):
+                    self.uid2iid[self.nickname + ':' + str(the_iid)] = the_iid
+                self.uid2iid[the_uid] = the_iid
+                self.iid2uid.append(the_uid)
+                self.iid2def_length.append(def_length)
+                self.iid2seq_length.append(seq_length)
+                self.iid2def_offset.append(def_offset)
+                self.iid2seq_offset.append(seq_offset)
+                the_iid += 1
+        finally:
+            idxstore.close()
+
+        self.seqstore = file(prefix, "r")
+        return
+
+
+    def getStringFromFasta(self, forward, scaf_uid, start, length ):
+        """Return length characters of sequence scaf_uid beginning at start.
+
+        scaf_uid is looked up as a uid first; when unknown it is read as
+        a decimal record index.  A value that is not an integer, or is
+        1000000000 or more, is reported on stderr and "" is returned.
+        When forward is false the reverse complement is returned; if
+        that fails with KeyError the error is reported on stderr and the
+        forward substring is returned instead.
+        """
+        try:
+            scaf_iid = self.uid2iid[scaf_uid]
+        except KeyError:
+            try:
+                # int() rather than eval(): the uid comes from callers
+                # and must never be executed as code.
+                scaf_iid = int(scaf_uid)
+            except (TypeError, ValueError):
+                scaf_iid = None
+            if(scaf_iid is None or scaf_iid >= 1000000000):
+                sys.stderr.write("scaf_uid=<%s> is invalid.\n" % scaf_uid)
+                return ""
+
+        seq_offset = self.iid2seq_offset[scaf_iid]
+        self.seqstore.seek(seq_offset+start, 0)  # from the beginning of file
+        substring = self.seqstore.read(length)
+
+        if(not forward):
+            try:
+                substring = DNA.DNA(substring).reversecomplement()
+            except KeyError:
+                sys.stderr.write("KeyError in DNA.DNA.reversecomplement()\n")
+                sys.stderr.write("The query %d %s %d %d\n" % (forward,scaf_uid,start,length))
+                sys.stderr.write("%s\n" % substring)
+        return substring
+# end class
+
+def convertIndexToUID ( x_prefix, y_prefix, inpfile, outname, assemblyId1, assemblyId2 ):
+    """Rewrite "<assemblyId>:<index>" scaffold ids in match rows as real uids.
+
+    x_prefix and y_prefix name files in idxStore format (five
+    whitespace-separated columns per line, uid first).  The i-th line of
+    x_prefix supplies the uid for "<assemblyId1>:<i>", and y_prefix does
+    the same for assemblyId2.  Every 'M' row of inpfile is rewritten and
+    printed to a fresh temporary MyFile.myfile, which is returned; other
+    rows are dropped.  A row whose scaffold id is not in the tables
+    raises KeyError.
+
+    outname is accepted for interface compatibility and is not used.
+    """
+    # These modules are not imported at the top of this file, so bring
+    # them into scope here.
+    import MyFile, MatchRecord
+
+    def loadUidTable(path, assemblyId):
+        # Map "<assemblyId>:<line index>" to the uid in column one.
+        table = {}
+        DefLines = file(path, 'r')
+        try:
+            ii = 0
+            for line in DefLines:
+                # A valid idxStore line has exactly five columns.
+                (ga_uid, sln, cln, sst, cst) = line.split()
+                table[assemblyId + ":" + str(ii)] = ga_uid
+                ii += 1
+        finally:
+            DefLines.close()
+        return table
+
+    outfile = MyFile.myfile()
+    the_x_uid = loadUidTable(x_prefix, assemblyId1)
+    the_y_uid = loadUidTable(y_prefix, assemblyId2)
+
+    inpfile.seek(0)
+    for line in inpfile:
+        if(line[0] == 'M'):
+            FM = MatchRecord.MatchRecord(line)
+            FM.x_scaf_uid = the_x_uid[FM.x_scaf_uid]
+            FM.y_scaf_uid = the_y_uid[FM.y_scaf_uid]
+            print >>outfile, FM
+        # end if
+    # end for
+    # myfile has no finished() method; flush so readers see all rows.
+    outfile.flush()
+    return outfile
+# end def
+
+#if __name__ == '__main__':
+ # main(sys.argv[1],sys.argv[2])
+ #main()
diff --git a/atac-driver/chainer/python/MatchRecord.py b/atac-driver/chainer/python/MatchRecord.py
new file mode 100644
index 0000000..0182323
--- /dev/null
+++ b/atac-driver/chainer/python/MatchRecord.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+
+import sys, os, copy, string, tempfile
+
+class AtacRow:
+    """Generic ATAC row: a kind character, fixed columns, optional defline."""
+    def __init__(self,line):
+        # The first character is the row kind.
+        self.kind = line[0]
+        pieces = line[1:].split(">")
+        # Whitespace-separated columns that precede the first '>'.
+        self.fixed = pieces[0].split()
+        # Text between the first and second '>' ("" when there is none).
+        self.defline = ""
+        if len(pieces) >= 2:
+            self.defline = pieces[1]
+# end class
+
+class MatchRecord:
+    """Class representing an exact match.
+
+    Attributes set by parsing: rowtype, subtype, matchid, runid,
+    x_scaf_uid, x_start, x_length, x_orientation, y_scaf_uid, y_start,
+    y_length, y_orientation, identifier (text after '>'), and extend
+    (dict built from the "/key=value" extensions).
+    """
+
+    def sameAs(self,other):
+        """Return true when both records have equal coordinates on both axes."""
+        return (
+            (self.x_orientation == other.x_orientation) and
+            (self.x_scaf_uid == other.x_scaf_uid) and
+            (self.x_start == other.x_start) and
+            (self.y_orientation == other.y_orientation) and
+            (self.y_scaf_uid == other.y_scaf_uid) and
+            (self.y_start == other.y_start) and
+            (self.x_length == other.x_length) and
+            (self.y_length == other.y_length)
+            )
+    # end def
+
+    def isInsideBox(self, one, two):
+        """Return true when this match's start point lies strictly between
+        the start points of one and two on both axes, all three sharing
+        the same x and y scaffolds.  Orientations are not compared.
+        """
+        # We need to modify this because the matches are not points.
+        dxone = self.x_start - one.x_start
+        dxtwo = self.x_start - two.x_start
+        dyone = self.y_start - one.y_start
+        dytwo = self.y_start - two.y_start
+        flag = (
+            (self.x_scaf_uid == one.x_scaf_uid) and
+            (self.x_scaf_uid == two.x_scaf_uid) and
+            (self.y_scaf_uid == one.y_scaf_uid) and
+            (self.y_scaf_uid == two.y_scaf_uid) and
+            (((dxone > 0) and ( dxtwo < 0)) or ((dxone < 0) and ( dxtwo > 0))) and
+            (((dyone > 0) and ( dytwo < 0)) or ((dyone < 0) and ( dytwo > 0)))
+            )
+        return flag
+    # end def
+
+    def inSameRunAs(self,x):
+        """Return true when both records carry the same runid."""
+        return self.runid == x.runid # same parent
+    # end def
+
+    def copy(self):
+        """Return a shallow copy (the extend dict is shared with the original)."""
+        other = copy.copy(self)
+        return other
+
+    def convertFromAtacMatchFormat(self,line,assemblyId1=None,assemblyId2=None):
+        """Fill the coordinate attributes from one text row.
+
+        A row starting with 'M' is read as an ATAC match row.  A row
+        starting with '-' is read as a "-f"/"-r" style row whose scaffold
+        columns are bare indices; these are prefixed with assemblyId1 /
+        assemblyId2, which must then be supplied (ValueError otherwise).
+        Any other row leaves the instance untouched.
+
+        Raises IndexError when columns are missing and ValueError when a
+        numeric column does not parse.
+        """
+        fields = line.split()
+        if(line[0] == 'M'):
+            self.rowtype = fields[0]
+            self.subtype = fields[1]
+            self.matchid = fields[2]
+            self.runid = fields[3]
+            self.x_scaf_uid = fields[4]
+            self.x_start = int(fields[5])
+            self.x_length = int(fields[6])
+            self.x_orientation = int(fields[7])
+            self.y_scaf_uid = fields[8]
+            self.y_start = int(fields[9])
+            self.y_length = int(fields[10])
+            self.y_orientation = int(fields[11])
+        elif(line[0] == '-'):
+            if assemblyId1 is None or assemblyId2 is None:
+                # These names used to be read from an undefined global,
+                # so this branch could only end in NameError.
+                raise ValueError("assembly ids are required to parse a '-' row")
+            orientation = (fields[0][1:]=='f')
+            self.rowtype = 'M'
+            self.subtype = 'x'
+            self.matchid = '.'
+            self.runid = '.'
+            self.x_scaf_uid = assemblyId1 + ":" + fields[2]
+            self.x_start = int(fields[3])
+            self.x_length = int(fields[4])
+            self.x_orientation = 1
+            self.y_scaf_uid = assemblyId2 + ":" + fields[6]
+            self.y_start = int(fields[7])
+            self.y_length = int(fields[8])
+            self.y_orientation = (-1,1)[orientation] # A cvm variant (flag ? x : y).
+    # end def
+
+    def __init__ (self, line, *args):
+        """Parse line into a record.
+
+        Text after the first '>' is split on '/': the first piece is the
+        identifier and each later "key=value" piece goes into extend.
+        Two optional positional arguments give assemblyId1 and
+        assemblyId2 for '-' rows.  Parse failures are reported on stderr
+        and re-raised.
+        """
+        assemblyId1 = None
+        assemblyId2 = None
+        if len(args) >= 2:
+            assemblyId1, assemblyId2 = args[0], args[1]
+        if '>' in line:
+            # Split once only: the defline itself may contain '>'.
+            (line1, line2) = line.split('>', 1)
+        else:
+            line1 = line
+            line2 = ""
+        try:
+            self.convertFromAtacMatchFormat(line1, assemblyId1, assemblyId2)
+        except IndexError:
+            sys.stderr.write("MatchRecord-- IndexError: line did not split correctly: %s\n"
+                             % line1)
+            raise
+        except ValueError:
+            sys.stderr.write("MatchRecord-- ValueError: line did not unpack correctly: %s\n"
+                             % line1)
+            raise
+        self.extend = {}
+        extensions = line2.split('/')
+        self.identifier = extensions[0].strip()
+        for argpair in extensions[1:]:
+            if '=' in argpair:
+                # Split once only: the value may contain '='.
+                (key,value) = argpair.split('=', 1)
+                self.extend[key] = value.strip()
+        return
+
+    def __str__ (self):
+        """Format the record as one ATAC 'M' row without a trailing newline.
+
+        The " >identifier /key=value ..." tail is omitted when the
+        identifier is empty and there are no extensions.
+        """
+        extension = " >" + self.identifier
+        for key in self.extend:
+            extension += ' /' + key + '=' + str(self.extend[key])
+        if(len(extension)<3):
+            extension = ""
+        return "%s %s %s %s %s %d %d %d %s %d %d %d %s" % (
+            self.rowtype, self.subtype, self.matchid, self.runid,
+            self.x_scaf_uid, self.x_start, self.x_length, self.x_orientation,
+            self.y_scaf_uid, self.y_start, self.y_length, self.y_orientation,
+            extension
+            )
+    # end def
+
+# end class
+
+
+def convertBrianRecordFormat( inpfile, outfile, assemblyId1, assemblyId2):
+    """Rewrite each line of inpfile as an ATAC 'M x' row on outfile.
+
+    Column 0 with its first character removed selects the y orientation
+    ('f' gives 1, anything else -1).  Columns 2-4 give the x scaffold
+    index, start and length; columns 6-8 give the same for y.  Scaffold
+    ids become "<assemblyId>:<index>" and match ids "BMX<line number>".
+    Progress goes to stderr every 100000 lines; outfile is rewound
+    before returning.
+    """
+    lineCount = 0
+    for line in inpfile:
+        lineCount += 1
+        if lineCount % 100000 == 0:
+            print >>sys.stderr, "lineCount=%d" % lineCount
+        cols = line.split()
+        isForward = (cols[0][1:]=='f')
+        # Start from a placeholder row and overwrite its fields.
+        rec = MatchRecord("M x . . . 0 0 0 . 0 0 0 0\n")
+        rec.x_orientation = 1
+        rec.matchid = "BMX"+str(lineCount)
+        rec.x_scaf_uid = assemblyId1 + ":" + cols[2]
+        rec.x_start = int(cols[3])
+        rec.x_length = int(cols[4])
+        if isForward:
+            rec.y_orientation = 1
+        else:
+            rec.y_orientation = -1
+        rec.y_scaf_uid = assemblyId2 + ":" + cols[6]
+        rec.y_start = int(cols[7])
+        rec.y_length = int(cols[8])
+        rec.identifier = ""
+        rec.extend = {}
+        print >>outfile, rec
+    print >>sys.stderr, "convertRecordFormat done: lineCount=%d" % lineCount
+    outfile.seek(0)
+    return
+# end def
+
+
+def sortInXorderAP( inpfile, outfile):
+    """Sort the rows of inpfile into outfile with sort(1), x axis first.
+
+    Key columns in order: 1, 2, 5, 6 (numeric), 7 and 8 (numeric,
+    reversed), then 9, 10 (numeric), 11 and 12 (numeric, reversed).
+    Both arguments must be real files with a .name; both are rewound
+    and flushed before sorting and rewound again afterwards.  sort
+    keeps its scratch files in the current directory (-T .).  A failing
+    shell command trips an assert.
+    """
+    # (x_scaf_uid, x_start, x_length, y_scaf_uid, y_start, y_length)
+    # Author's notes: use -u to remove the palindromes; use
+    # -k 7nr -k 11nr to remove abutting contained matches.
+    sortKeys = '-k 1,1 -k 2,2 -k 5,5 -k 6n -k 7nr -k 8nr -k 9,9 -k 10n -k 11nr -k 12nr'
+
+    def runChecked(command):
+        # Run a shell command and insist on a zero exit status.
+        status = os.system(command)
+        assert(status == 0)
+
+    for handle in (inpfile, outfile):
+        handle.seek(0)
+    for handle in (inpfile, outfile):
+        handle.flush()
+
+    runChecked("sync;sync;sync")
+    runChecked("sort -T . %s %s > %s" % (sortKeys, inpfile.name, outfile.name))
+    runChecked("sync;sync;sync")
+
+    for handle in (inpfile, outfile):
+        handle.seek(0)
+    return
+# end def
+
+def sortInYorderAP( inpfile, outfile):
+    """Sort the rows of inpfile into outfile with sort(1), y axis first.
+
+    Key columns in order: 1, 2, 9, 10 (numeric), 11 and 12 (numeric,
+    reversed), then 5, 6 (numeric), 7 and 8 (numeric, reversed).
+    Both arguments must be real files with a .name; both are rewound
+    and flushed before sorting and rewound again afterwards.  sort
+    keeps its scratch files in the current directory (-T .).  A failing
+    shell command trips an assert.
+    """
+    # (x_scaf_uid, x_start, x_length, y_scaf_uid, y_start, y_length)
+    # Author's notes: use -u to remove the palindromes; use
+    # -k 7nr -k 11nr to remove abutting contained matches.
+    sortKeys = '-k 1,1 -k 2,2 -k 9,9 -k 10n -k 11nr -k 12nr -k 5,5 -k 6n -k 7nr -k 8nr'
+
+    def runChecked(command):
+        # Run a shell command and insist on a zero exit status.
+        status = os.system(command)
+        assert(status == 0)
+
+    for handle in (inpfile, outfile):
+        handle.seek(0)
+    for handle in (inpfile, outfile):
+        handle.flush()
+
+    runChecked("sync;sync;sync")
+    runChecked("sort -T . %s %s > %s" % (sortKeys, inpfile.name, outfile.name))
+    runChecked("sync;sync;sync")
+
+    for handle in (inpfile, outfile):
+        handle.seek(0)
+    return
+# end def
+
+def sortInXorderPP( inpname, outfile):
+    """Placeholder: does nothing and returns None."""
+    # Intended key order, per the author's note:
+    # (x_win, ywin, x_scaf_uid, y_scaf_uid, x_start, y_start, x_length, y_length)
+    pass
+# end def
+
+def sortInYorderPP( inpname, outfile):
+    """Placeholder: does nothing and returns None."""
+    # Intended key order, per the author's note:
+    # (y_win, x_win, y_scaf_uid, x_scaf_uid, y_start, x_start, y_length, x_length)
+    pass
+# end def
diff --git a/atac-driver/chainer/python/MyFile.py b/atac-driver/chainer/python/MyFile.py
new file mode 100755
index 0000000..680a83a
--- /dev/null
+++ b/atac-driver/chainer/python/MyFile.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+import sys, os, copy, tempfile, cStringIO
+
+# from __future__ import generators # Necessary before Python 2.3
+
+class myfile(file):
+    """A temporary file, opened "w+", that is deleted when the object dies."""
+    def __init__(self):
+        # mkstemp creates the file atomically, unlike mktemp, which only
+        # returns a name that another process could claim first.
+        handle, filename = tempfile.mkstemp()
+        os.close(handle)
+        file.__init__(self,filename,"w+")
+    def __del__(self):
+        self.close()
+        try:
+            os.remove(self.name)
+        except OSError:
+            # Already gone; "rm -f" ignored that case too.
+            pass
+    def link(self,othername):
+        """Flush, then hard-link the temporary file to othername ("ln -f")."""
+        self.flush()
+        os.system("ln -f %s %s" % (self.name, othername))
+
+class ListLikeFileIter:
+    """Iterate over the lines of a named file.
+
+    Works both as an iterator (__iter__/next) and through the older
+    __getitem__ protocol, where the index is ignored and each call
+    simply returns the next line.
+    """
+    # See http://www.python.org/peps/pep-0234.html
+    # for file iterators.
+    def __init__(self,filename):
+        self._filename = filename
+        self._fileptr = open(self._filename,"r")
+        # readline() returns "" at end of file, which ends the iterator.
+        self._fileIter = iter(self._fileptr.readline,"")
+    def __del__(self):
+        self._fileptr.close()
+    def __iter__(self):
+        return self
+    def next(self):
+        """Return the next line; raise StopIteration at end of file."""
+        return self._fileIter.next()
+    def __getitem__(self,ii):
+        """Return the next line; raise IndexError at end of file."""
+        # For files, the list location ii is ignored.
+        try:
+            return self._fileIter.next()
+        except StopIteration:
+            # The sequence protocol signals exhaustion with IndexError.
+            raise IndexError
+    # end def
+
+class ListLikeFile:
+    """In-memory write-only text buffer that can be iterated line by line.
+
+    Each iter() call works on a snapshot of what has been written so
+    far; later writes do not show up in an iterator that already exists.
+    """
+    # See Mark Lutz, Programming Python, edition 1, page 18 and page 128.
+    def __init__(self):
+        self._fileptr = cStringIO.StringIO()
+    def __del__(self):
+        self._fileptr.close()
+    def __iter__(self):
+        self._fileptr.flush()
+        # Iterate over a copy so the write position is left undisturbed.
+        return iter(cStringIO.StringIO(self._fileptr.getvalue()))
+    def write(self,x):
+        self._fileptr.write(x)
+    # end def
+# end class
+
+def tester():
+    """Print what ListLikeFile iterators yield as more lines are written."""
+    def dump(title, lines):
+        # Print a heading, then whatever the iterator still has to give.
+        print title
+        for item in lines: print item,
+
+    x = ListLikeFile()
+    print >>x, 4
+    print >>x, 5
+
+    xi = iter(x)
+    dump("test 1i", xi)
+
+    print >>x, 6
+    print >>x, 7
+    dump("test 2i", xi)
+
+    xj = iter(x)
+    dump("test 3j", xj)
+
+    print >>x, 8
+    print >>x, 9
+
+    dump("test 3j", xj)
+    dump("test 3i", xi)
+
+    xk = iter(x)
+    dump("test 3k", xk)
+
+    x = None
+
+if __name__ == '__main__':
+ tester()
diff --git a/atac-driver/chainer/python/PerfectRuns.py b/atac-driver/chainer/python/PerfectRuns.py
new file mode 100755
index 0000000..dfbf416
--- /dev/null
+++ b/atac-driver/chainer/python/PerfectRuns.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+# Looking in /usr/local/ir/bin on the Compaqs for the correct Python interpreter.
+# export PYTHONPATH=${PYTHONPATH}:$WORK/cds/IR/COMPASS/src/AtacPipeline
+
+"""
+Extensive documentation for the Python language is available at
+http://www.python.org.
+"""
+
+import sys
+import MyFile
+import MatchRecord
+
+def cvm(f,x,y):
+    """Conditional value: x when f is true, otherwise y (C's f ? x : y)."""
+    # Same result as (y,x)[f] for a boolean f.
+    if not f:
+        return y
+    return x
+# end def
+
+def createSignedEnumeration(inpfile):
+    """Number the 'M' rows of inpfile and store the number as extend['srank'].
+
+    Rows are counted from 1 in input order.  The rank is positive when
+    the x and y orientations are equal and negative otherwise.  Rows
+    that do not start with 'M' are dropped.  Returns a new temporary
+    MyFile.myfile holding the annotated rows.
+    """
+    outfile = MyFile.myfile()
+    inpfile.seek(0)
+    rank = 0
+    for line in inpfile:
+        if line[0] != 'M':
+            continue
+        rec = MatchRecord.MatchRecord(line)
+        rank += 1
+        if rec.x_orientation == rec.y_orientation:
+            rec.extend['srank'] = rank
+        else:
+            rec.extend['srank'] = -rank
+        print >>outfile, rec
+    return outfile
+# end def
+
+def findPerfectRuns ( inpfile, maxJump, runIdPrefix ):
+    """Group consecutive 'M' rows into runs and stamp each with a run id.
+
+    Every 'M' row must carry extend['srank'], which is removed before
+    the row is written.  The run counter starts at 1 and is advanced
+    whenever a row, compared with the previous 'M' row,
+      * lies on a different x or y scaffold, or
+      * is separated by more than maxJump on the x or y axis, or
+      * has a signed rank that is not the previous rank + 1.
+    Each row is written with runid = runIdPrefix + counter.  Rows not
+    starting with 'M' are dropped.  Returns a new temporary
+    MyFile.myfile.
+
+    Sorting and containment problems are reported on stderr; rows
+    sorted on neither axis trip an assert.
+    """
+    outfile = MyFile.myfile()
+    left = None
+    runid = 1
+    inpfile.seek(0)
+    for line in inpfile:
+        if(line[0] == 'M'):
+            right = MatchRecord.MatchRecord(line)
+            pr = int(right.extend['srank'])
+            del(right.extend['srank'])
+            if(left != None):
+                maxGapInXandY = 0
+                if(left.x_scaf_uid == right.x_scaf_uid
+                   and
+                   left.y_scaf_uid == right.y_scaf_uid ):
+                    # Find the maximum of the gap in x and y axis.
+
+                    x_rs = right.x_start
+                    x_re = x_rs + right.x_length
+                    x_ls = left.x_start
+                    x_le = x_ls + left.x_length
+                    # All matches are positive length.
+                    assert(x_rs < x_re)
+                    assert(x_ls < x_le)
+                    x_gapLeftBeforeRight = x_rs - x_le
+                    x_gapRightBeforeLeft = x_ls - x_re
+                    assert(not(x_gapLeftBeforeRight>0 and x_gapRightBeforeLeft>0))
+                    # x_gap == 0 is abutting, x_gap < 0 is overlapping
+                    x_gap = max(x_gapLeftBeforeRight,x_gapRightBeforeLeft)
+
+                    y_rs = right.y_start
+                    y_re = y_rs + right.y_length
+                    y_ls = left.y_start
+                    y_le = y_ls + left.y_length
+                    assert(y_rs < y_re)
+                    assert(y_ls < y_le)
+                    y_gapLeftBeforeRight = y_rs - y_le
+                    y_gapRightBeforeLeft = y_ls - y_re
+                    assert(not(y_gapLeftBeforeRight>0 and y_gapRightBeforeLeft>0))
+                    # y_gap == 0 is abutting, y_gap < 0 is overlapping
+                    y_gap = max(y_gapLeftBeforeRight,y_gapRightBeforeLeft)
+
+                    maxGapInXandY = max(x_gap,y_gap)
+
+                    # Check the sorting of the matches.
+                    sorted_by_x = (x_ls <= x_rs)
+                    sorted_by_y = (y_ls <= y_rs)
+
+                    if(not(sorted_by_x or sorted_by_y)):
+                        print >>sys.stderr, "bad sorting in findPerfectRuns"
+                        print >>sys.stderr, left
+                        print >>sys.stderr, right
+                    assert(sorted_by_x or sorted_by_y)
+                    dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
+                    # Compare the END of left with the end of right, as
+                    # on the x axis; testing y_ls here could never flag
+                    # a contained match.
+                    dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
+                    if(sorted_by_x and not(dovetail_in_x)):
+                        print >>sys.stderr, "contained in x in findPerfectRuns"
+                        print >>sys.stderr, left
+                        print >>sys.stderr, right
+                    if(sorted_by_y and not(dovetail_in_y)):
+                        print >>sys.stderr, "contained in y in findPerfectRuns"
+                        print >>sys.stderr, left
+                        print >>sys.stderr, right
+                # end if
+                if(
+                    (left.x_scaf_uid != right.x_scaf_uid) or # check first axis id
+                    (left.y_scaf_uid != right.y_scaf_uid) or # check second axis id
+                    (maxGapInXandY > maxJump) or
+                    (pr != lastpr + 1) # Using the signed rank NOT the run id !!!!
+                    ):
+                    runid += 1
+                # end if
+            # end if
+            lastpr = pr
+            # Assign the run id in the same slot as the signed rank.
+            right.runid = "%s%d" % (runIdPrefix,runid,)
+            print >>outfile, right
+            left = right
+        # end if
+    # end for
+    return outfile
+# end def
+
+def formPerfectRuns ( inpfile, firstSort, secondSort, maxJump, runIdPrefix ):
+    """Run the four-stage perfect-run pipeline and return its output file.
+
+    Stages: firstSort, createSignedEnumeration, secondSort,
+    findPerfectRuns.  firstSort and secondSort are callables taking
+    (inpfile, outfile).  A step message goes to stderr before each
+    stage.
+    """
+    def announce(stage):
+        print >>sys.stderr, 'formPerfectRuns step=' + str(stage)
+
+    inpfile.seek(0)
+
+    announce(0)
+    scratch = MyFile.myfile()
+    firstSort( inpfile, scratch)
+
+    announce(1)
+    result = createSignedEnumeration(scratch)
+
+    announce(2)
+    # Rebinding the names drops the previous temporary files as soon as
+    # they are no longer needed.
+    scratch = MyFile.myfile()
+    secondSort( result, scratch)
+
+    announce(3)
+    result = findPerfectRuns( scratch, maxJump, runIdPrefix)
+
+    return result
+# end def
+
+def runsAsMatches(inpfile):
+    """Collapse each group of consecutive 'M' rows sharing a runid into one row.
+
+    The emitted row is the last row of the group, rewritten so that
+    subtype is 'r', matchid is the old runid, runid is ".", and the x/y
+    start and length span the first and last rows of the group.  The
+    summed x_length of the group is stored on the record as runFill.
+    Rows not starting with 'M' are ignored.  Returns a new temporary
+    MyFile.myfile.
+    """
+    def emitRun(sink, head, tail, fill):
+        # Compute the extent of head and tail first, then overwrite
+        # tail in place and print it.
+        startX = min(head.x_start, tail.x_start)
+        endX = max(head.x_start + head.x_length, tail.x_start + tail.x_length)
+        startY = min(head.y_start, tail.y_start)
+        endY = max(head.y_start + head.y_length, tail.y_start + tail.y_length)
+        tail.subtype = 'r'
+        tail.matchid = tail.runid
+        tail.runid = "." # the agreed NULL value
+        tail.x_start = startX
+        tail.y_start = startY
+        tail.x_length = endX - startX
+        tail.y_length = endY - startY
+        tail.runFill = fill
+        print >>sink, tail
+
+    outfile = MyFile.myfile()
+    lastF = None
+    firstF = None
+    runFill = 0
+    inpfile.seek(0)
+    for line in inpfile:
+        if line[0] != 'M':
+            continue
+        curF = MatchRecord.MatchRecord(line)
+        if lastF is None or curF.runid != lastF.runid:
+            if lastF is not None:
+                # A run must stay on a single pair of scaffolds.
+                if firstF.x_scaf_uid != lastF.x_scaf_uid:
+                    print >>sys.stderr, firstF
+                    print >>sys.stderr, lastF
+                assert(firstF.x_scaf_uid == lastF.x_scaf_uid)
+                assert(firstF.y_scaf_uid == lastF.y_scaf_uid)
+                emitRun(outfile, firstF, lastF, runFill)
+            firstF = curF
+            runFill = 0
+        runFill += curF.x_length
+        lastF = curF
+
+    # Flush the final run, if there was any input.
+    if lastF is not None:
+        emitRun(outfile, firstF, lastF, runFill)
+    return outfile
+# end def
+
+def main(inpname, outname, maxJump, runIdPrefix):
+    """Form perfect runs from inpname; hard-link the results into /tmp.
+
+    Produces /tmp/<outname>.matches (rows with run ids) and
+    /tmp/<outname>.runs (one row per run).
+    """
+    print >>sys.stderr, "Beware /tmp!\n"
+
+    source = open(inpname)
+    matches = formPerfectRuns(source,
+                              MatchRecord.sortInXorderAP,
+                              MatchRecord.sortInYorderAP,
+                              int(maxJump),
+                              runIdPrefix
+                              )
+    runs = runsAsMatches( matches)
+
+    # All the work happens in temporary files, so the final results are
+    # kept by hard-linking them to their output names.
+    for handle, suffix in ((matches, ".matches"), (runs, ".runs")):
+        handle.link("/tmp/"+outname+suffix)
+
+
+if __name__ == '__main__':
+    # Usage: PerfectRuns.py inpname outname maxJump runIdPrefix
+    # The author's note gives 100000 and r as typical values for the
+    # last two arguments.
+    inpname, outname, maxJump, runIdPrefix = (
+        sys.argv[1], sys.argv[2], int(sys.argv[3]), sys.argv[4])
+
+    main(inpname, outname, maxJump, runIdPrefix)
diff --git a/atac-driver/chainer/python/TrimMatchOverlaps.py b/atac-driver/chainer/python/TrimMatchOverlaps.py
new file mode 100644
index 0000000..b90bb9f
--- /dev/null
+++ b/atac-driver/chainer/python/TrimMatchOverlaps.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python
+
+import sys
+import MyFile
+import MatchRecord
+
+def cvm(f,x,y):
+    """Conditional value: x when f is true, otherwise y (C's f ? x : y)."""
+    # Same result as (y,x)[f] for a boolean f.
+    if not f:
+        return y
+    return x
+# end def
+
+def coalesceMatches ( inpfile, outfile, needs_to_share_diagonal ):
+ "Coalesce overlapping and abutting matches within the same run."
+ # Reads inpfile from the start and prints merged 'M' rows to outfile;
+ # lines that do not start with 'M' are not copied to outfile.
+ # firstF is the first record of the group being accumulated and is
+ # the record that gets printed, after its start/length are replaced
+ # by the bounding box (lowHitPX..hghHitPX, lowHitPY..hghHitPY).
+ # The printed subtype is 'u' when needs_to_share_diagonal is true
+ # and 'g' when it is false.
+ # NOTE(review): indentation was lost in this rendering of the file;
+ # confirm the nesting of the statements against the upstream source.
+
+ firstF = None
+ lastF = None
+
+ # State of the previous record, used for the on_diagonal test.
+ lastLX = -3
+ lastLY = -4
+ lastForward = 0
+
+ # Bounding box of the group being accumulated.
+ lowHitPX = None
+ lowHitPY = None
+ hghHitPX = None
+ hghHitPY = None
+ inpfile.seek(0)
+ outfile.seek(0)
+ for line in inpfile:
+ if(line[0] == 'M'):
+ curF = MatchRecord.MatchRecord(line)
+ px = curF.x_start
+ nx = curF.x_length
+ py = curF.y_start
+ ny = curF.y_length
+ assert(px >= 0)
+ assert(nx >= 0)
+ assert(py >= 0)
+ assert(ny >= 0)
+ # Report a record whose x and y lengths differ when a shared
+ # diagonal is required.
+ if (not (not needs_to_share_diagonal or nx == ny)):
+ print >>sys.stderr, 'Bombed on:'
+ print >>sys.stderr, str(curF)
+ print >>sys.stderr, 'needs_to_share_diagonal=' + str(needs_to_share_diagonal)
+ print >>sys.stderr, 'nx=' + str(nx) + ' ny=' + str(ny)
+ # end if
+ assert((hghHitPX == None or (not needs_to_share_diagonal) or nx == ny))
+ forward = (curF.x_orientation == curF.y_orientation)
+ # (lx,ly) and (rx,ry) are the two end points of the match; for a
+ # reversed match the y coordinates of the two ends are swapped.
+ lx = px
+ ly = cvm( forward, py, py + ny)
+ rx = px + nx
+ ry = cvm( forward, py + ny, py)
+
+ # True when either end point of the current match falls inside
+ # the bounding box of the group, on the same pair of scaffolds.
+ overlapping = ((lastF != None) and
+ (curF.x_scaf_uid == lastF.x_scaf_uid) and
+ (curF.y_scaf_uid == lastF.y_scaf_uid) and
+ (((lx >= lowHitPX and lx <= hghHitPX) and
+ (ly >= lowHitPY and ly <= hghHitPY)) or
+ ((rx >= lowHitPX and rx <= hghHitPX) and
+ (ry >= lowHitPY and ry <= hghHitPY))))
+ # True when the orientation equals the previous record's and the
+ # x and y displacements from the previous record agree.
+ on_diagonal = ((forward == lastForward) and
+ ((lx - lastLX) == ((ly - lastLY) * cvm(forward, 1, -1))))
+ # print >>sys.stdout, lastF, curF
+ # print >>sys.stdout, lx,rx,ly,ry
+ # print >>sys.stdout, lowHitPX,hghHitPX,lowHitPY,hghHitPY
+ # print >>sys.stdout, "overlapping=",overlapping
+ # print >>sys.stdout, "on_diagonal=",on_diagonal
+
+ lowMerPX = px
+ lowMerPY = py
+ hghMerPX = px + nx
+ hghMerPY = py + ny
+ # The current match cannot join the group: print the finished
+ # group, if any, and start a new one from the current match.
+ if (not (overlapping and (not needs_to_share_diagonal or on_diagonal))):
+ if (firstF != None):
+ # if (lastF == None or firstF.runid != lastF.runid):
+ # end if
+ firstF.subtype = ('g','u')[needs_to_share_diagonal]
+ firstF.x_start = lowHitPX
+ firstF.y_start = lowHitPY
+ firstF.x_length = hghHitPX - lowHitPX
+ firstF.y_length = hghHitPY - lowHitPY
+ print >>outfile, firstF
+ # end if
+ firstF = curF
+ lowHitPX = lowMerPX
+ lowHitPY = lowMerPY
+ hghHitPX = hghMerPX
+ hghHitPY = hghMerPY
+ # end if
+ # Grow the bounding box to include the current match.
+ lowHitPX = cvm(lowHitPX < lowMerPX, lowHitPX, lowMerPX)
+ lowHitPY = cvm(lowHitPY < lowMerPY, lowHitPY, lowMerPY)
+ hghHitPX = cvm(hghHitPX > hghMerPX, hghHitPX, hghMerPX)
+ hghHitPY = cvm(hghHitPY > hghMerPY, hghHitPY, hghMerPY)
+
+ lastLX = lx
+ lastLY = ly
+ lastForward = forward
+ lastF = curF
+ # end if
+ # end for
+
+
+ # Print the last group, if there was any input.
+ if (firstF != None):
+ firstF.subtype = ('g','u')[needs_to_share_diagonal]
+ firstF.x_start = lowHitPX
+ firstF.y_start = lowHitPY
+ firstF.x_length = hghHitPX - lowHitPX
+ firstF.y_length = hghHitPY - lowHitPY
+ print >>outfile, firstF
+
+ return
+# end def
+
+def trimMatchOverlapsInX(inpfile,outfile, trim_subtype):
+ "Trim the match overlaps with respect to the X assembly."
+ # Reads inpfile from the start.  Lines that do not start with 'M',
+ # and 'M' rows whose subtype differs from trim_subtype, are written
+ # to outfile unchanged.  Rows of subtype trim_subtype are trimmed
+ # against the picket (see below) or removed when nothing is left.
+ # The five counters are printed to stderr at the end.
+ # NOTE(review): indentation was lost in this rendering of the file;
+ # confirm the nesting of the statements against the upstream source.
+ overlaps=0
+ abuts=0
+ posgaps=0
+ contained = 0
+ trimmed = 0
+ left = None
+
+ picket = 0
+ # For each genomic axis we scan left to right using this picket
+ # position to annihilate any part of the current match to the
+ # left of this picket.
+
+ inpfile.seek(0)
+ for line in iter(inpfile):
+ if(line[0] == 'M'):
+ right = MatchRecord.MatchRecord(line)
+ if( right.subtype != trim_subtype):
+ print >>outfile, line,
+ continue
+ # A new x scaffold (or the first row) restarts the picket at 0.
+ if( left == None or
+ #left.x_scaf_uid < right.x_scaf_uid):
+ left.x_scaf_uid != right.x_scaf_uid):
+ picket = 0
+ else:
+ assert(left != None)
+ assert(right != None)
+ if(left.x_scaf_uid > right.x_scaf_uid):
+ print >>sys.stderr, "sequence ids out of x sorted order"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert(left.subtype == right.subtype)
+ assert(left.x_scaf_uid == right.x_scaf_uid)
+ if(not(left.x_start <= right.x_start)):
+ print >>sys.stderr, "trimMatchOverlapsInX: Woops not sorted anymore!"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ #assert(0)
+
+ thisbgn = right.x_start
+ thisend = right.x_start + right.x_length
+ if(picket < thisend):
+ # gaplen > 0: gap before this match; == 0: abutting;
+ # < 0: this match starts left of the picket by -gaplen.
+ gaplen = thisbgn - picket
+ if(gaplen > 0):
+ posgaps += 1
+ if(gaplen == 0):
+ abuts += 1
+ if(gaplen < 0):
+ overlaps += 1
+ trimmed -= gaplen
+ # Move the x start up to the picket and shorten both lengths
+ # by the same amount (gaplen is negative here).
+ right.x_start -= gaplen # modify the match
+ right.x_length += gaplen
+ right.y_length += gaplen
+ # The y start only moves when x and y have equal orientation.
+ if(right.x_orientation == right.y_orientation):
+ right.y_start -= gaplen # modify the match
+ else:
+ # picketed region contains right.
+ #print >>sys.stderr, "trimMatchOverlapsInX: Contained"
+ #print >>sys.stderr, left
+ #print >>sys.stderr, right
+ contained += 1
+ right = None # remove this match
+ if(right != None):
+ print >>outfile, right
+ # The picket advances to the end of the match just written.
+ newpicket = right.x_start + right.x_length
+ assert(picket < newpicket)
+ picket = newpicket
+ left = right
+ else:
+ print >>outfile, line,
+ print >>sys.stderr, "trimMatchOverlapsInX:\n",
+ print >>sys.stderr, "#posgaps, #abuts, #overlaps, #contained, bp_trimmed= %d %d %d %d %d\n" \
+ % (posgaps, abuts, overlaps, contained, trimmed, )
+ return
+
+def trimMatchOverlapsInY(inpfile,outfile, trim_subtype):
+ "Trim the match overlaps with respect to the Y assembly."
+ # Reads inpfile from the start.  Lines that do not start with 'M',
+ # and 'M' rows whose subtype differs from trim_subtype, are written
+ # to outfile unchanged.  Rows of subtype trim_subtype are trimmed
+ # against the picket (see below) or removed when nothing is left.
+ # The five counters are printed to stderr at the end.
+ # NOTE(review): indentation was lost in this rendering of the file;
+ # confirm the nesting of the statements against the upstream source.
+ overlaps=0
+ abuts=0
+ posgaps=0
+ contained = 0
+ trimmed = 0
+ left = None
+
+ picket = 0
+ # For each genomic axis we scan left to right using this picket
+ # position to annihilate any part of the current match to the
+ # left of this picket.
+
+ inpfile.seek(0)
+ for line in iter(inpfile):
+ if(line[0] == 'M'):
+ right = MatchRecord.MatchRecord(line)
+ if( right.subtype != trim_subtype):
+ print >>outfile, line,
+ continue
+ # A new y scaffold (or the first row) restarts the picket at 0.
+ if( left == None or
+ #left.y_scaf_uid < right.y_scaf_uid):
+ left.y_scaf_uid != right.y_scaf_uid):
+ picket = 0
+ else:
+ assert(left != None)
+ assert(right != None)
+ if(left.y_scaf_uid > right.y_scaf_uid):
+ print >>sys.stderr, "sequence ids out of y sorted order"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert(left.subtype == right.subtype)
+ assert(left.y_scaf_uid == right.y_scaf_uid)
+ if(not(left.y_start <= right.y_start)):
+ print >>sys.stderr, "trimMatchOverlapsInY: Woops not sorted anymore!"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ #assert(0)
+
+ thisbgn = right.y_start
+ thisend = right.y_start + right.y_length
+ if(picket < thisend):
+ # gaplen > 0: gap before this match; == 0: abutting;
+ # < 0: this match starts left of the picket by -gaplen.
+ gaplen = thisbgn - picket
+ if(gaplen > 0):
+ posgaps += 1
+ if(gaplen == 0):
+ abuts += 1
+ if(gaplen < 0):
+ overlaps += 1
+ trimmed -= gaplen
+ # Move the y start up to the picket and shorten both lengths
+ # by the same amount (gaplen is negative here).
+ right.y_start -= gaplen # modify the match
+ right.y_length += gaplen
+ right.x_length += gaplen
+ # The x start only moves when x and y have equal orientation.
+ if(right.x_orientation == right.y_orientation):
+ right.x_start -= gaplen # modify the match
+ else:
+ # picketed region contains right.
+ #print >>sys.stderr, "trimMatchOverlapsInY: Contained"
+ #print >>sys.stderr, left
+ #print >>sys.stderr, right
+ contained += 1
+ right = None # remove this match
+ if(right != None):
+ print >>outfile, right
+ # The picket advances to the end of the match just written.
+ newpicket = right.y_start + right.y_length
+ assert(picket < newpicket)
+ picket = newpicket
+ left = right
+ else:
+ print >>outfile, line,
+ print >>sys.stderr, "trimMatchOverlapsInY:\n",
+ print >>sys.stderr, "#posgaps, #abuts, #overlaps, #contained, bp_trimmed= %d %d %d %d %d\n" \
+ % (posgaps, abuts, overlaps, contained, trimmed, )
+ return
+
+
+def trimMatchOverlapsInBoth(inpfile,outfile,trim_subtype):
+    """Sort by x, coalesce, trim in x, sort by y, then trim in y into outfile.
+
+    Coalescing requires a shared diagonal when trim_subtype is 'x' or
+    'u'.  Intermediate results live in temporary MyFile.myfile objects.
+    """
+    shareDiagonal = trim_subtype in ('x', 'u')
+
+    stageA = MyFile.myfile()
+    MatchRecord.sortInXorderAP(inpfile,stageA)
+    # The following coalescing assumes perfect runs.
+    stageB = MyFile.myfile()
+    coalesceMatches( stageA, stageB, shareDiagonal )
+    # Rebinding a name drops the temporary file it previously held.
+    stageA = MyFile.myfile()
+    trimMatchOverlapsInX(stageB,stageA,trim_subtype)
+    stageB = MyFile.myfile()
+    MatchRecord.sortInYorderAP(stageA,stageB)
+    trimMatchOverlapsInY(stageB,outfile,trim_subtype)
+    return
+
+
+def main(inpname, outname, trim_subtype):
+    """Trim overlaps among the trim_subtype matches of inpname into outname."""
+    inpfile = open(inpname)
+    try:
+        outfile = open(outname,"w")
+        try:
+            trimMatchOverlapsInBoth(inpfile,outfile,trim_subtype)
+        finally:
+            # Close explicitly so buffered rows reach disk even when
+            # trimming fails part-way.
+            outfile.close()
+    finally:
+        inpfile.close()
+
+
+if __name__ == '__main__':
+    # Usage: TrimMatchOverlaps.py inpname outname trim_subtype
+    inpname, outname, trim_subtype = sys.argv[1], sys.argv[2], sys.argv[3]
+    main(inpname, outname, trim_subtype)
diff --git a/atac-driver/chainer/python/UniqueFilter.py b/atac-driver/chainer/python/UniqueFilter.py
new file mode 100755
index 0000000..c35ce58
--- /dev/null
+++ b/atac-driver/chainer/python/UniqueFilter.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python
+
+import os, sys, time, tempfile
+import MyFile
+import MatchRecord
+import AtacFile
+
+def xorIntervals( inpname, outname):
+ # not tested yet
+ leftPicket = 0
+ rghtPicket = 0
+ inpfile = open(inpname,"r")
+ outfile = open(outname,"w")
+ for line in inpfile:
+ fields = line.split()
+ newstart = int(fields[0])
+ newend = int(fields[1])
+ assert(leftPicket <= newstart)
+ rghtSide = min(newstart,rghtPicket)
+ if rghtSide > leftPicket:
+ # interval has positive length
+ print >>outfile, leftPicket, rghtSide
+ leftPicket = max(newstart, min(rightPicket,newend))
+ rghtPicket = max(leftPicket,max(rightPicket,newend))
+
+
+def findUniformCoverageIntervals(inpfile,outfile): # Turn coverage-change events into intervals of constant coverage.
+ # The input records are ("E", id, position, coverage_increment).
+ # The output records are ("C", id, start_position, length, coverage_level).
+ inpfile.seek(0)
+ outfile.seek(0)
+ oldaxis = None; oldposition = 0; cov = 0 # cov is the running sum of the increments read so far
+ for line in inpfile:
+ ( recordtype, newaxis, newposition, newchange) = line.split() # every line must split into exactly four fields
+ if(recordtype == "E"):
+ newposition = int(newposition)
+ newchange = int(newchange)
+ if(newaxis != oldaxis and cov != 0): print >>sys.stderr, "Woops" # coverage is expected to be zero when the id changes
+ len = newposition - oldposition # NOTE(review): this local shadows the builtin len()
+ if(cov>0 and len>0):
+ print >>outfile, "C", oldaxis, oldposition, len, cov;
+ cov += newchange;
+ assert(cov >= 0)
+ oldaxis = newaxis; oldposition = newposition;
+ assert(cov == 0) # the increments must cancel out by the end of the input
+ outfile.flush()
+
+
+def findCoverageIntervals( inpfile, outfile, processFirstAxis):
+ # The input file is an ATAC matches file.
+ # The output file is an ATAC coverage intervals file.
+ inpfile.seek(0)
+ outfile.seek(0)
+ t0 = time.time()
+ tmpfile3 = MyFile.myfile()
+ for line in inpfile:
+ if(line[0]=="M"):
+ fields = line.split()
+ if(fields[1]=="u" or fields[1]=="x"):
+ if(processFirstAxis):
+ axis = fields[4]
+ bgn = int(fields[5])
+ end = bgn+int(fields[6])
+ else:
+ axis = fields[8]
+ bgn = int(fields[9])
+ end = bgn+int(fields[10])
+ print >>tmpfile3, "E", axis,bgn,1
+ print >>tmpfile3, "E", axis,end,-1
+ tmpfile3.close()
+ tmpname = tempfile.mktemp()
+ cmd = "sort -T . -k 1,1 -k 2,2 -k 3n -k 4nr %s > %s" % (tmpfile3.name, tmpname)
+ print >>sys.stderr, cmd
+ iret = os.system(cmd); assert(iret==0)
+ print >>sys.stderr,"time elapsed is ", (time.time() - t0)
+ tmpfile4 = open(tmpname)
+ t0 = time.time()
+ findUniformCoverageIntervals( tmpfile4, outfile)
+ print >>sys.stderr,"time elapsed is ", (time.time() - t0)
+ tmpfile4.close()
+ os.system("rm -f " + tmpname)
+ outfile.seek(0)
+
+
+def applyOneKeepMask( inpfile, outfile, keepMaskFile, processFirstAxis): # Clip each match to the coverage-1 mask intervals it overlaps on one axis.
+ # Note that the following merge-like control structure is
+ # influenced by the function property of keep intervals to matches.
+
+ debug = 0
+ inpfile.seek(0)
+ outfile.seek(0)
+ keepMaskFile.seek(0)
+
+ # Put the first valid match record into FM. Each input ATAC match
+ # record produces zero, one or more output ATAC matches.
+ FM = None;
+ ma = None; ms = None; me = None
+ qa = None; qs = None; ql = None;
+
+ # the set of masking intervals, using the q variables and iline
+ maskiter = iter(keepMaskFile)
+
+ # the set of masked matches using the m variables and mline
+ inpiter = iter(inpfile)
+
+ iline = None
+ mline = None
+
+ last_matchid = None; subcount = 0
+
+ try: # StopIteration exception from either iterator gets us out
+ while 1:
+ if(iline == None): # need a fresh mask interval
+ iline = maskiter.next()
+ (subtype, qa, qs, ql, cov, ) = iline.split()
+ assert(subtype=='C')
+ cov= int(cov)
+ if(cov != 1): # only intervals with coverage exactly 1 are kept as mask
+ iline = None
+ continue
+ qs = int(qs)
+ ql = int(ql)
+ qe = qs + ql
+
+ if(mline == None): # need a fresh match record
+ mline = inpiter.next()
+ if(mline[0] != 'M'):
+ # not a match record, so just pass it through
+ print >>outfile, mline,
+ mline = None
+ continue
+ FM = MatchRecord.MatchRecord(mline)
+ assert(FM.subtype == "u" or FM.subtype == "x")
+ if(processFirstAxis):
+ ma = FM.x_scaf_uid
+ ms = FM.x_start # match start
+ me = ms + FM.x_length # match end
+ else:
+ ma = FM.y_scaf_uid
+ ms = FM.y_start # match start
+ me = ms + FM.y_length # match end
+
+ # holding valid iline and mline data now
+
+ if not(ma==qa):
+ # not on same axis, need to get a new one
+ if(ma < qa): # NOTE(review): presumably both inputs are sorted by axis id in this same ordering - confirm
+ mline = None
+ else:
+ iline = None
+
+ elif not( (ms < qe) and (qs < me) ):
+ # we are not overlapping, need to get a new one of them
+ if(ms < qs):
+ mline = None
+ else:
+ iline = None
+
+ else:
+ # processing for overlaps
+ FT = FM.copy()
+ mx = max(ms,qs) # start of the intersection
+ mn = min(me,qe) # end of the intersection
+ trimFromStart = mx - ms
+ trimFromEnd = me - mn
+ trimmedLength = mn - mx
+ if( FT.x_orientation == FT.y_orientation):
+ FT.x_start += trimFromStart
+ FT.y_start += trimFromStart
+ else: # opposite orientations: what is cut from the start of one axis is cut from the end of the other
+ if(processFirstAxis):
+ FT.x_start += trimFromStart
+ FT.y_start += trimFromEnd
+ else:
+ FT.y_start += trimFromStart
+ FT.x_start += trimFromEnd
+ FT.x_length = trimmedLength
+ FT.y_length = trimmedLength
+ if debug:
+ print >>sys.stdout, "# trimmed "
+ print >>sys.stdout, FT
+
+ # We must ensure that the match identifier is still unique.
+ if last_matchid == FM.matchid :
+ subcount += 1
+ else:
+ subcount = 0
+ # print >>sys.stderr, last_matchid, FM.matchid, subcount
+ last_matchid = FM.matchid
+
+ if(subcount > 0): # second and later pieces of one match get an "x<n>" or "y<n>" suffix
+ if processFirstAxis :
+ FT.matchid = FT.matchid + "x" + str(subcount)
+ else:
+ FT.matchid = FT.matchid + "y" + str(subcount)
+
+ print >>outfile, FT
+ # we need to get a new one
+ if(qe < me): # the interval ends first, so the match may still overlap the next interval
+ iline = None
+ else:
+ mline = None
+
+ except StopIteration:
+ # If there are any left over non-match lines, then output them!
+ for mline in inpiter: # remaining match lines are not written
+ if(mline[0] != "M"):
+ print >>outfile, mline,
+
+def applyBothKeepMasks( inpfile, outfile ): # Apply the keep mask on the first axis, then on the second, writing to outfile.
+
+ # Maybe we can think of a masking implementation where each ATAC match
+ # is treated atomically. Assume that the keep mask intervals are sorted
+ # by start position. Assume that the ATAC matches are sorted by start
+ # position. Assert that all keep mask intervals are non-overlapping and
+ # were cut from only one ATAC match. Thus the mapping from keep mask
+ # intervals is a function. Note that this requires that we do not
+ # coalesce abutting keep mask intervals that originate from multiple
+ # matches. Note this still allows an ATAC match to overlap more than
+ # one keep mask interval. Ignore all keep mask intervals with zero
+ # length; their creation has tie breaking problems. See notes on 2003
+ # Jul 29.
+
+ debug = 0 # when true, intermediate files are dumped to debugfile.<n>
+ debugnum = 0
+ inpfile.seek(0)
+ outfile.seek(0)
+
+
+ # Apply the keepMask for the first axis.
+ # Make the sorted keep mask intervals for the first axis.
+ processFirstAxis = 1
+ keepMaskFile = MyFile.myfile()
+ tmpfile2 = inpfile
+ tmpfile3 = MyFile.myfile()
+ tmpfile4 = MyFile.myfile()
+
+ findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis)
+ if debug:
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in keepMaskFile: print >>debugfile, line,
+
+ MatchRecord.sortInXorderAP(tmpfile2,tmpfile3)
+ if debug:
+ #tmpfile2.seek(0)
+ #debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ #for line in tmpfile2: print >>debugfile, line,
+ tmpfile3.seek(0)
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in tmpfile3: print >>debugfile, line,
+
+ applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
+ if debug:
+ tmpfile4.seek(0)
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in tmpfile4: print >>debugfile, line,
+
+ # Apply the keepMask for the second axis.
+ # Make the sorted keep mask intervals for the second axis.
+ processFirstAxis = 0
+ keepMaskFile = MyFile.myfile()
+ tmpfile2 = tmpfile4 # the first-axis result is the input of the second pass
+ tmpfile3 = MyFile.myfile()
+ tmpfile4 = outfile # the second pass writes straight to the caller's file
+
+ findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis) # NOTE(review): coverage comes from the original inpfile, not from tmpfile2 - confirm this is intended
+ if debug:
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in keepMaskFile: print >>debugfile, line,
+
+
+ MatchRecord.sortInYorderAP(tmpfile2,tmpfile3)
+ if debug:
+ #tmpfile2.seek(0)
+ #debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ #for line in tmpfile2: print >>debugfile, line,
+ tmpfile3.seek(0)
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in tmpfile3: print >>debugfile, line,
+
+ applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
+ if debug:
+ tmpfile4.seek(0)
+ debugnum += 1; debugfile = open("debugfile.%d" % debugnum, "w")
+ for line in tmpfile4: print >>debugfile, line,
+
+
+def main( inpfile, outfile): # Thin wrapper: both arguments are open file objects handed to applyBothKeepMasks.
+ applyBothKeepMasks( inpfile, outfile)
+
+ # Open question: should we check whether the first and last characters
+ # of the masked matches are matching?
+
+ # Open question: should we compute the percent identity in this module?
+
+
+
+# Allow each module to have its own main for testing.
+if __name__ == '__main__':
+ inpname = sys.argv[1]
+ outname = sys.argv[2]
+ inpfile = open(inpname)
+ outfile = open(outname,"w")
+ main(inpfile, outfile)
+# end if
diff --git a/atac-driver/chainer/python/dedashMatches.py b/atac-driver/chainer/python/dedashMatches.py
new file mode 100755
index 0000000..db45eed
--- /dev/null
+++ b/atac-driver/chainer/python/dedashMatches.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+# dedashMatches.py /prod/IR02/synteny/mus-vs-rat/mouse_celera_R13_chr_20030210-vs-rat_celera_R1_chr_20030507-V3.atac.t20.l100.br.squeezed.filled.coalesced mus-vs-rat.out /prod/IR05/GENOMES/mouse_celera_R13_chr_20030210 /prod/IR05/GENOMES/rat_celera_R1_chr_20030507 MR13 RR1
+
+# dedashMatches.py mouse_celera_R13_chr_20030210-vs-rat_celera_R1_chr_20030507-V3.atac.t20.l100.br.squeezed.filled.coalesced mus-vs-rat.out mouse_celera_R13_chr_20030210 rat_celera_R1_chr_20030507 MR13 RR1
+
+
+import sys
+import string
+import time
+import MatchRecord
+import IdxStore
+import halign
+
+#import shelve
+
+class dedasher: # placeholder class; not used by the code visible in this file
+ def __init__(self,xstr,ystr): # both arguments are accepted and ignored
+ pass
+ def __iter__(self): # always yields the single value 1
+ return iter([1])
+
+x = 3
+def suba():
+ global x
+ x = 7
+def subb():
+ global x
+ x -= 1
+ return (x,None)[x == 0]
+def subc():
+ suba()
+ it = iter(subb,None)
+ for y in it:
+ print y
+
+def main( inpfile, outfile, xIdx, yIdx):
+ inpfile.seek(0)
+ outfile.seek(0)
+ lineCount = 0
+ t0 = time.time()
+ for line in inpfile:
+ lineCount += 1
+ if((lineCount % 10000)==0):
+ print >>sys.stderr, "lineCount=",lineCount," time=",time.time()-t0
+ if(line[0] == 'M'):
+ FM = MatchRecord.MatchRecord(line)
+ if(FM.subtype == 'g'):
+ parentid = FM.matchid
+ parent_x_forward = (FM.x_orientation == 1)
+ parent_y_forward = (FM.y_orientation == 1)
+ parent_x_start = FM.x_start
+ parent_y_start = FM.y_start
+ parent_x_length = FM.x_length
+ parent_y_length = FM.y_length
+
+ # Why two orientations and not just a flipped flag?
+ # Because we want the resulting matches to come out in
+ # the same sorted order as the input matches.
+
+ x_substring = string.upper(
+ xIdx.getStringFromFasta( parent_x_forward,
+ FM.x_scaf_uid, FM.x_start, FM.x_length));
+ y_substring = string.upper(
+ yIdx.getStringFromFasta( parent_y_forward,
+ FM.y_scaf_uid, FM.y_start, FM.y_length));
+ ii = 0
+ # Here we call the dedasher.
+ halign.halignStart(x_substring,
+ y_substring)
+ for segment in iter(halign.halignDedash,None):
+ #print >>outfile, segment
+ (bgn1,bgn2,len1,len2,nmat) = segment
+ # Filter by a minimum length? say four bp.
+ ii += 1
+ FM.subtype = 'u'
+ FM.matchid = parentid + 'u' + str(ii)
+ # FM.runid = parentid
+ FM.x_start = parent_x_start + (parent_x_length-bgn1-len1,bgn1)[parent_x_forward]
+ FM.y_start = parent_y_start + (parent_y_length-bgn2-len2,bgn2)[parent_y_forward]
+ FM.x_length = len1
+ FM.y_length = len2
+ assert(len1 == len2)
+ mismatches = 0
+ for ic in range(len1):
+ if(x_seq[bgn1+ic] != y_seq[bgn2+ic]):
+ mismatches += 1
+ FM.extend['mm'] = str(mismatches)
+ FM.identifier = "" # BEWARE
+ print >>outfile, FM
+ else:
+ print >>outfile, line,
+ else:
+ print >>outfile, line,
+
+
+def oldmain():
+ inpname = sys.argv[1]
+ outname = sys.argv[2]
+ xIndexName = sys.argv[3]
+ yIndexName = sys.argv[4]
+ assemblyId1 = sys.argv[5]
+ assemblyId2 = sys.argv[6]
+
+ # mismatches = checkExactMatches( x, y, inpfile)
+ # sys.stderr.write("mismatches = %d\n" % mismatches)
+
+ inpfile = open(inpname)
+ outfile = open(outname,"w")
+ xIdx = IdxStore.IdxStore(xIndexName,assemblyId1)
+ yIdx = IdxStore.IdxStore(yIndexName,assemblyId2)
+
+ main( inpfile, outfile, xIdx, yIdx)
+ outfile.close()
+
+import AtacFile
+import MyFile
+
+def newmain():
+ inpname = sys.argv[1]
+ outname = sys.argv[2]
+
+ obj = AtacFile.AtacFile(inpname)
+ xname = obj.globals["assemblyFilePrefix1"]
+ yname = obj.globals["assemblyFilePrefix1"]
+ assemblyId1 = obj.globals["assemblyId1"]
+ assemblyId2 = obj.globals["assemblyId2"]
+
+
+ xIdx = IdxStore.IdxStore(xname,assemblyId1)
+ yIdx = IdxStore.IdxStore(yname,assemblyId2)
+
+ inpfile = obj.matches
+ outfile = MyFile.myfile()
+ main( inpfile, outfile, xIdx, yIdx)
+ obj.matches = outfile
+ obj.checkpoint(outname)
+ outfile.close()
+
+
+# Allow each module to have its own main for testing.
+if __name__ == '__main__': # command-line entry point
+ newmain() # oldmain() is kept in the file but is not called here
+# end if
diff --git a/atac-driver/chainer/python/fillIntraRunGaps.py b/atac-driver/chainer/python/fillIntraRunGaps.py
new file mode 100644
index 0000000..68b66ae
--- /dev/null
+++ b/atac-driver/chainer/python/fillIntraRunGaps.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+
+import sys
+import string
+import MatchRecord
+import IdxStore
+import localAlignerInterface
+import halign
+#import shelve
+
+# True=1
+False=0
+
+def analyzeGap(x,y,left,right,outfile,maxgap,erate,margin): # Align the gap between two matches of one run and print the matches found to outfile.
+ inter_run_gap_count = 0 # set to 1 when left and right are not in the same run
+
+ x_pos = 0
+ x_len = 0
+ y_pos = 0
+ y_len = 0
+ if( (left.x_scaf_uid == right.x_scaf_uid) and
+ (left.y_scaf_uid == right.y_scaf_uid) and
+ (left.runid == right.runid) ):
+ # (left.sindex + 1 == right.sindex) ): # This is obsolete in Russell's file format.
+ # sys.stderr.write("Intra-run gap\n")
+ left_forward = (left.x_orientation == left.y_orientation) # true when both axes have the same orientation
+ right_forward = (right.x_orientation == right.y_orientation)
+ if( left_forward != right_forward): sys.stderr.write("Bad orientations\n")
+ assert(left_forward == right_forward)
+
+ sorted_by_x = (left.x_start <= right.x_start) and \
+ (left.x_start+left.x_length <= right.x_start+right.x_length)
+ sorted_by_y = (left.y_start <= right.y_start) and \
+ (left.y_start+left.y_length <= right.y_start+right.y_length)
+ if(not(sorted_by_x or sorted_by_y)):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert(sorted_by_x or sorted_by_y)
+ # This concept of sorted allows neggaps but not containment in both axes.
+
+ if(not((not left_forward) or (sorted_by_x and sorted_by_y))):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ if(not((left_forward) or (not(sorted_by_x and sorted_by_y)))):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert((not left_forward) or (sorted_by_x and sorted_by_y)) # forward runs must be sorted on both axes
+ assert((left_forward) or (not(sorted_by_x and sorted_by_y))) # reversed runs must not be sorted on both axes
+
+ if(sorted_by_x): # Sorted by x positions.
+ x_pos = left.x_start + left.x_length # Start of the intra-run gap.
+ x_len = right.x_start - x_pos # Length of the intra-run gap.
+ if(left_forward):
+ y_pos = left.y_start + left.y_length
+ y_len = right.y_start - y_pos
+ else:
+ y_pos = right.y_start + right.y_length
+ y_len = left.y_start - y_pos
+ # end if
+ else: # Assume sorted by y positions
+ y_pos = left.y_start + left.y_length
+ y_len = right.y_start - y_pos
+ if(left_forward):
+ x_pos = left.x_start + left.x_length
+ x_len = right.x_start - x_pos
+ else:
+ x_pos = right.x_start + right.x_length
+ x_len = left.x_start - x_pos
+ # end if
+ # end if
+ # print "Left %d,%d Right %d,%d Width %d,%d" % (x_pos,y_pos,x_pos+x_len,y_pos+y_len,x_len,y_len)
+
+ assert(left.x_start >= 0)
+ assert(left.x_length > 0)
+ assert(left.y_start >= 0)
+ assert(left.y_length > 0)
+ assert(right.x_start >= 0)
+ assert(right.x_length > 0)
+ assert(right.y_start >= 0)
+ assert(right.y_length > 0)
+
+
+ if( 1
+ and 0 < x_len and 0 < y_len
+ and x_len < maxgap and y_len < maxgap
+ ): # only gaps that are non-empty and shorter than maxgap on both axes are aligned
+
+ if 0:
+ sys.stderr.write("About to call local aligner with %d margins\n" % margin);
+ sys.stderr.write("# left = %s\n" % str(left))
+ sys.stderr.write("# right= %s\n" % str(right))
+ sys.stderr.write("x_len=%d y_len=%d\n" % (x_len, y_len) );
+
+ # Why two orientation flags? We want the output matches
+ # to be in the same sorted order as the left and right
+ # matches.
+
+ parent_x_start = x_pos - margin # the aligned window is the gap widened by margin on each side
+ parent_y_start = y_pos - margin
+ parent_x_length = x_len + 2*margin
+ parent_y_length = y_len + 2*margin
+
+ if 0:
+ print >>sys.stderr, "parent_x_start=%d" % parent_x_start
+ print >>sys.stderr, "parent_y_start=%d" % parent_y_start
+ print >>sys.stderr, "parent_x_length=%d" % parent_x_length
+ print >>sys.stderr, "parent_y_length=%d" % parent_y_length
+
+ x_seq = ""
+ if(x_len > 0):
+ x_seq = string.upper(
+ x.getStringFromFasta( sorted_by_x, left.x_scaf_uid,
+ parent_x_start, parent_x_length));
+ # end if
+ y_seq = ""
+ if(y_len > 0):
+ y_seq = string.upper(
+ y.getStringFromFasta( sorted_by_y, left.y_scaf_uid,
+ parent_y_start, parent_y_length));
+ # end if
+
+ if 0:
+ print >>outfile, "# STARTED localAlignerInterface.syntenicSegments"
+ print >>outfile, "# left = %s" % str(left)
+ print >>outfile, "# right= %s" % str(right)
+ print >>sys.stderr, "x_seq="+x_seq
+ print >>sys.stderr, "len(x_seq)=",len(x_seq)
+ print >>sys.stderr, "y_seq="+y_seq
+ print >>sys.stderr, "len(y_seq)=",len(y_seq)
+ outfile.flush()
+
+ try:
+ localAlignerInterface.syntenicSegments(outfile,
+ x_seq, 0, parent_x_length,
+ y_seq, 0, parent_y_length,
+ erate)
+
+ FM = left # NOTE: an alias, not a copy - the assignments to FM below modify left itself
+ parent_id = FM.matchid
+ #FM.x_orientation = sorted_by_x
+ #FM.y_orientation = sorted_by_y
+
+ # Why two orientations and not just a flipped flag?
+ # Because we want the resulting matches to come out in
+ # the same sorted order as the input matches.
+
+ ii = 0 # counter used to build the ids of the new matches
+ for segment in iter(localAlignerInterface.iterateSegments,None):
+ #print >>outfile, segment
+ (bgn1,bgn2,len1,len2,fid) = segment
+ assert(len1 >= 0)
+ assert(len2 >= 0)
+ assert(bgn1 >= 0)
+ assert(bgn2 >= 0)
+ if(not(bgn1 + len1 <= parent_x_length)): # segment runs past the x window: warn and clip it
+ print >>sys.stdout,"# warn(not(bgn1 + len1 <= parent_x_length))"
+ print >>sys.stdout,"# bgn1=%d len1=%d parent_x_length=%d" % (bgn1,len1,parent_x_length)
+ print >>sys.stdout,"# left = %s" % str(left)
+ print >>sys.stdout,"# right= %s" % str(right)
+ print >>sys.stdout,"# bgn1,bgn2,len1,len2=", bgn1,bgn2,len1,len2
+ #print >>sys.stdout,"# xseq=%s" % x_seq[bgn1:bgn1+len1]
+ #print >>sys.stdout,"# yseq=%s" % y_seq[bgn2:bgn2+len2]
+ len1 = parent_x_length - bgn1
+ print >>sys.stdout, "# Change len1 = %d" % len1
+ if(not(bgn2 + len2 <= parent_y_length)): # segment runs past the y window: warn and clip it
+ print >>sys.stdout,"# warn(not(bgn2 + len2 <= parent_y_length))"
+ print >>sys.stdout,"# bgn2=%d len2=%d parent_y_length=%d" % (bgn2,len2,parent_y_length)
+ print >>sys.stdout,"# left = %s" % str(left)
+ print >>sys.stdout,"# right= %s" % str(right)
+ print >>sys.stdout,"# bgn1,bgn2,len1,len2=", bgn1,bgn2,len1,len2
+ #print >>sys.stdout,"# xseq=%s" % x_seq[bgn1:bgn1+len1]
+ #print >>sys.stdout,"# yseq=%s" % y_seq[bgn2:bgn2+len2]
+ len2 = parent_y_length - bgn2
+ print >>sys.stdout,"# Change len2 = %d" % len2
+ if (len1 == 0): # empty segments are reported and skipped
+ print >>sys.stdout,"# warn(len1 == 0)"
+ print >>sys.stdout,"# bgn1,bgn2,len1,len2=", bgn1,bgn2,len1,len2
+ print >>sys.stdout,"# bgn1=%d len1=%d parent_x_length=%d" % (bgn1,len1,parent_x_length)
+ continue
+ if (len2 == 0):
+ print >>sys.stdout,"# warn(len2 == 0)"
+ print >>sys.stdout,"# bgn1,bgn2,len1,len2=", bgn1,bgn2,len1,len2
+ print >>sys.stdout,"# bgn2=%d len2=%d parent_y_length=%d" % (bgn2,len2,parent_y_length)
+ continue
+ assert(bgn1 >= 0)
+ assert(bgn2 >= 0)
+ assert(len1 > 0)
+ assert(len2 > 0)
+ assert(bgn1 + len1 <= parent_x_length);
+ assert(bgn2 + len2 <= parent_y_length);
+ # Filter by a minimum length? say four bp.
+ ii += 1
+ FM.subtype = 'l'
+ FM.matchid = parent_id + 'l' + str(ii)
+ # FM.runid = parent_id
+ child_x_start = parent_x_start + (parent_x_length-bgn1-len1,bgn1)[sorted_by_x]
+ child_y_start = parent_y_start + (parent_y_length-bgn2-len2,bgn2)[sorted_by_y]
+ child_x_length = len1
+ child_y_length = len2
+
+ #FM.identifier = " %f" % fid # CMM BEWARE
+ FM.x_start = child_x_start
+ FM.y_start = child_y_start
+ FM.x_length = child_x_length
+ FM.y_length = child_y_length
+ #FM.extend['fid'] = str(fid)
+ #print >>outfile, FM
+
+ # Here we call the dedasher.
+ #assert(len1 > 0)
+ #assert(len2 > 0)
+ #assert(bgn1 >= 0)
+ #assert(bgn2 >= 0)
+ #assert(bgn1+len1 <= parent_x_length)
+ #assert(bgn2+len2 <= parent_y_length)
+
+ if 0:
+ print >>sys.stderr, "# x_seq=%s" % x_seq
+ print >>sys.stderr, "# y_seq=%s" % y_seq
+ print >>sys.stderr, "# bgn1,bgn2,len1,len2=", bgn1,bgn2,len1,len2
+ print >>sys.stderr, "# xseq=%s" % x_seq[bgn1:bgn1+len1]
+ print >>sys.stderr, "# yseq=%s" % y_seq[bgn2:bgn2+len2]
+
+ halign.halignStart(x_seq[bgn1:bgn1+len1], y_seq[bgn2:bgn2+len2])
+ outfile.flush()
+ for hsegment in iter(halign.halignDedash,None):
+ #print >>outfile, segment
+ (bgn1h,bgn2h,len1h,len2h,nmat) = hsegment
+ # Filter by a minimum length? say four bp.
+ ii += 1
+ FM.subtype = 'u'
+ FM.matchid = parent_id + 'a' + str(ii)
+ # FM.runid = parent_id
+ FM.x_start = child_x_start + (child_x_length-bgn1h-len1h,bgn1h)[sorted_by_x]
+ FM.y_start = child_y_start + (child_y_length-bgn2h-len2h,bgn2h)[sorted_by_y]
+ FM.x_length = len1h
+ FM.y_length = len2h
+
+ assert(len1h == len2h)
+ mismatches = 0
+ for ic in range(len1h): # count the positions where the two aligned strings differ
+ if(x_seq[bgn1+bgn1h+ic] != y_seq[bgn2+bgn2h+ic]):
+ mismatches += 1
+ FM.extend['mm'] = str(mismatches)
+ #FM.identifier = "" # BEWARE
+ print >>outfile, FM
+
+ # localAlignerInterface.free()
+ # print >>outfile,"# FINISHED localAlignerInterface.syntenicSegments"
+ except RuntimeError: # a failed alignment is reported and the gap is left unfilled
+ print >>outfile, "# NOTE syntenicSegments failed between these records"
+ print >>outfile, "# STARTED localAlignerInterface.syntenicSegments"
+ print >>outfile, "# left = %s" % str(left)
+ print >>outfile, "# right= %s" % str(right)
+ print >>sys.stderr, "NOTE syntenicSegments failed in fillIntraRunGaps for:"
+ print >>sys.stderr, "x_seq="+x_seq
+ print >>sys.stderr, "len(x_seq)=",len(x_seq)
+ print >>sys.stderr, "y_seq="+y_seq
+ print >>sys.stderr, "len(y_seq)=",len(y_seq)
+ # end if
+
+ else:
+ # sys.stderr.write("Inter-run gap\n")
+ inter_run_gap_count += 1
+ # sys.stderr.write("done\n")
+ return (inter_run_gap_count,) # a one-element tuple
+# end def
+
+
+def mainLoop( inpfile, outfile, xIdx, yIdx, maxgap, erate): # Walk the M records in order, calling analyzeGap on consecutive pairs and printing the records.
+
+ margin = 20 # This should be set by an ATAC global.
+ countLines = 0 # number of match records printed
+ inter_run_gap_count_total = 0
+ closed_gap_count_total = 0 # this and the following totals are never updated in this function
+ squeezed_total = 0
+ x_len_total = 0
+ y_len_total = 0
+ x_nonACGT_total = 0
+ y_nonACGT_total = 0
+
+ inpfile.seek(0)
+ inpfileIter = iter(inpfile)
+
+ sys.stderr.write("begin\n")
+
+ left = None # NOTE(review): stays None when the input has no M record
+ for line in inpfileIter: # first loop: find and print the first M record
+ if(line[0] == 'M'):
+ left = MatchRecord.MatchRecord(line)
+ print >>outfile, left
+ countLines += 1
+ break;
+
+ sys.stderr.write("countLines=%d\n" % countLines)
+
+ for line in inpfileIter: # second loop: continues on the same iterator, after the first M record
+ if(line[0] == 'M'):
+ newRight = MatchRecord.MatchRecord(line)
+ if( newRight.subtype == 'u' ):
+ right = newRight # NOTE(review): right is bound only for subtype 'u'; how far this branch extends cannot be confirmed from the flattened diff
+
+ #if( countLines % 10000 == 0):
+ # sys.stderr.write("countLines=%d\n" % countLines)
+
+ (inter_run_gap_count,) = analyzeGap( xIdx, yIdx, left,right, outfile, maxgap, erate, margin)
+
+ inter_run_gap_count_total += inter_run_gap_count
+
+ # Output the record which was possibly trimmed.
+ print >>outfile, right
+ countLines += 1
+ left = right
+ # end if
+ # end for
+
+ sys.stderr.write("countLines %d inter_run_gap_count %d \n" % (countLines,inter_run_gap_count_total))
+# end def
+
+import AtacFile
+import MyFile
+
+def main( inpname, outname):
+ obj = AtacFile.AtacFile(inpname)
+ assemblyId1 = obj.globals['assemblyId1']
+ assemblyId2 = obj.globals['assemblyId2']
+ assemblyFilePrefix1 = obj.globals['assemblyFilePrefix1']
+ assemblyFilePrefix2 = obj.globals['assemblyFilePrefix2']
+
+ if(not obj.globals.has_key('fillIntraRunGapsErate')):
+ obj.globals['fillIntraRunGapsErate'] = 0.10
+ if(not obj.globals.has_key('fillIntraRunGapsMaxGap')):
+ obj.globals['fillIntraRunGapsMaxGap'] = 100000
+ fillIntraRunGapsErate = float(obj.globals['fillIntraRunGapsErate'])
+ fillIntraRunGapsMaxGap = int(obj.globals['fillIntraRunGapsMaxGap'])
+
+ # mismatches = checkExactMatches( x, y, inpfile)
+ # sys.stderr.write("mismatches = %d\n" % mismatches)
+
+ xIdx = IdxStore.IdxStore(assemblyFilePrefix1,assemblyId1)
+ yIdx = IdxStore.IdxStore(assemblyFilePrefix2,assemblyId2)
+
+ tempfile = MyFile.myfile()
+ mainLoop( obj.matches, tempfile, xIdx, yIdx,
+ fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
+ obj.matches = tempfile
+ obj.checkpoint(outname)
+
+
+# Allow each module to have its own main for testing.
+if __name__ == '__main__': # command-line entry point; needs two positional arguments
+ inpname = sys.argv[1] # handed to AtacFile.AtacFile by main()
+ outname = sys.argv[2] # handed to obj.checkpoint by main()
+ main( inpname, outname)
+# end if
+
+
+
+
diff --git a/atac-driver/chainer/python/mkstats.py b/atac-driver/chainer/python/mkstats.py
new file mode 100755
index 0000000..daf7cc6
--- /dev/null
+++ b/atac-driver/chainer/python/mkstats.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# Must look in /usr/local/ir/bin on the Compaqs for the correct Python interpreter.
+# export PYTHONPATH=${PYTHONPATH}:$WORK/cds/IR/COMPASS/src/AtacPipeline
+
+"""
+Extensive documentation for the Python language is available at
+http://www.python.org.
+"""
+
+import os, sys, re, tempfile
+
+def main(glist): # For each file name in glist, extract match lines into scratch files and run celagram on them.
+ for inpname in glist:
+
+ if(0): # disabled: lengths of 'g' and 'l' matches
+ inpfile = open(inpname,'r')
+ tmpname = tempfile.mktemp(".tmp")
+ tmpfile = open(tmpname,'w')
+ pattern = re.compile(r"^M [gl] ")
+ for line in inpfile:
+ if(pattern.search(line)):
+ print >>tmpfile, line,
+ tmpfile.close()
+ os.system("celagram -c 7 -t 'gapped match lengths' %s" % (tmpname,))
+
+ if(0): # disabled: lengths of 'x' matches
+ inpfile = open(inpname,'r')
+ tmpname = tempfile.mktemp(".tmp")
+ tmpfile = open(tmpname,'w')
+ pattern = re.compile(r"^M x ")
+ for line in inpfile:
+ if(pattern.search(line)):
+ print >>tmpfile, line,
+ tmpfile.close()
+ os.system("celagram -c 7 -t 'exact match lengths' %s" % (tmpname,))
+
+ if(0): # disabled: lengths of 'u' matches
+ inpfile = open(inpname,'r')
+ tmpname = tempfile.mktemp(".tmp")
+ tmpfile = open(tmpname,'w')
+ pattern = re.compile(r"^M u ")
+ for line in inpfile:
+ if(pattern.search(line)):
+ print >>tmpfile, line,
+ tmpfile.close()
+ os.system("celagram -c 7 -t 'ungapped match lengths' %s" % (tmpname,))
+
+ inpfile = open(inpname,'r') # NOTE(review): the nesting is inferred - presumably the code from here on runs for every input file
+ tmpname = tempfile.mktemp(".tmp") # NOTE(review): mktemp is race-prone, and the scratch file is never removed
+ tmpfile = open(tmpname,'w')
+ # pattern = re.compile(r"^M\s*[xu]\s")
+ pattern = re.compile(r"^M [xu] ")
+ for line in inpfile:
+ if(pattern.search(line)):
+ print >>tmpfile, line,
+ tmpfile.close()
+ os.system("celagram -c 7 -t '%s ungapped match lengths' %s" % (inpname,tmpname)) # inpname is pasted unquoted into a shell command
+
+
+ inpfile = open(inpname,'r') # second pass over the same input, this time for 'r' records
+ tmpname = tempfile.mktemp(".tmp")
+ tmpfile = open(tmpname,'w')
+ pattern = re.compile(r"^M\s*r\s")
+ for line in inpfile:
+ if(pattern.search(line)):
+ print >>tmpfile, line,
+ tmpfile.close()
+ os.system("celagram -c 7 -t '%s spans in 1st assembly' %s" % (inpname,tmpname))
+ os.system("celagram -c 11 -t '%s spans in 2nd assembly' %s" % (inpname,tmpname))
+
+
+
+if __name__ == '__main__': # command-line entry point
+ #glist = [ "humR27vsB31-V2.atac", "humB31vsVAN-V1.atac", "humB31vsSC-V3.atac", ]
+
+ main(sys.argv[1:]) # every command-line argument is treated as an input file name
diff --git a/atac-driver/chainer/python/squeezeIntraRunGaps.py b/atac-driver/chainer/python/squeezeIntraRunGaps.py
new file mode 100644
index 0000000..81719c2
--- /dev/null
+++ b/atac-driver/chainer/python/squeezeIntraRunGaps.py
@@ -0,0 +1,512 @@
+#!/usr/bin/env python
+
+import sys
+import string
+import MatchRecord
+#import MyFile
+import IdxStore
+#import localAlignerInterface
+#import shelve
+
+# True=1
+False=0
+
+
+theIsolatedSNPcount = 0
+completefillednotXY = 0
+completefilledXnotY = 0
+completefilledYnotX = 0
+completefilledXandY = 0
+
+def analyzeGap(x,y,left,right,outfile,maxgap,margin):
+ # Examine the gap between two consecutive match records, left and right.
+ #
+ # When both records share x_scaf_uid, y_scaf_uid and runid the gap is an
+ # intra-run gap: the gap sequence is fetched from x and y with
+ # getStringFromFasta(), the gap is shrunk from both ends wherever the two
+ # sequences agree, and the newly covered stretches are printed to outfile
+ # as extra records (copies of left/right with subtype "L"/"R" and "L"/"R"
+ # appended to matchid). Otherwise the gap is only counted as inter-run.
+ #
+ # x, y -- sequence sources; only getStringFromFasta() is called on them.
+ # left, right -- match records; the code reads *_scaf_uid, runid,
+ # *_orientation, *_start, *_length, matchid and calls copy().
+ # outfile -- file-like object receiving the fill records.
+ # maxgap, margin -- accepted but never read in this function.
+ #
+ # Returns (inter_run_gap_count, squeezed, x_len-squeezed, y_len-squeezed,
+ # x_notACGT, y_notACGT) with squeezed = lp+rp. For an inter-run gap this
+ # is (1, 0, 0, 0, 0, 0). x_notACGT and y_notACGT are only updated inside a
+ # disabled "if(0)" diagnostics block, so they are always 0 as written.
+ # Side effect: the module-level counters declared global below are updated.
+ global theIsolatedSNPcount
+ global completefillednotXY
+ global completefilledXnotY
+ global completefilledYnotX
+ global completefilledXandY
+ solidThreshold=20
+ inter_run_gap_count = 0
+ x_chCount = {}
+ y_chCount = {}
+ x_notACGT = 0
+ y_notACGT = 0
+
+ # lp / rp: number of gap characters absorbed at the left / right end.
+ lp = 0 # We should modify the match instead!
+ rp = 0
+ x_pos = 0
+ x_len = 0
+ y_pos = 0
+ y_len = 0
+ if( (left.x_scaf_uid == right.x_scaf_uid) and
+ (left.y_scaf_uid == right.y_scaf_uid) and
+ (left.runid == right.runid) ):
+ # (left.sindex + 1 == right.sindex) ): # This is obsolete in Russell's file format.
+ # sys.stderr.write("Intra-run gap\n")
+ left_forward = (left.x_orientation == left.y_orientation)
+ right_forward = (right.x_orientation == right.y_orientation)
+ if( left_forward != right_forward): sys.stderr.write("Bad orientations in run\n")
+ assert(left_forward == right_forward)
+
+ sorted_by_x = (left.x_start <= right.x_start)
+ dovetail_in_x = sorted_by_x and (left.x_start+left.x_length <= right.x_start+right.x_length)
+ sorted_by_y = (left.y_start <= right.y_start)
+ dovetail_in_y = sorted_by_y and (left.y_start+left.y_length <= right.y_start+right.y_length)
+ if(not(sorted_by_x or sorted_by_y)):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert(sorted_by_x or sorted_by_y)
+ # This concept of sorted allows neggaps but not containment in both axes.
+
+ if(sorted_by_x and not dovetail_in_x):
+ print >>sys.stderr, "sorted_by_x and not dovetail_in_x"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+
+ if(sorted_by_y and not dovetail_in_y):
+ print >>sys.stderr, "sorted_by_y and not dovetail_in_y"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+
+ # Forward runs must be sorted in both axes; reversed runs must not be.
+ if(not((not left_forward) or (sorted_by_x and sorted_by_y))):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ if(not((left_forward) or (not(sorted_by_x and sorted_by_y)))):
+ print >>sys.stderr, "bad sorting in runs"
+ print >>sys.stderr, left
+ print >>sys.stderr, right
+ assert((not left_forward) or (sorted_by_x and sorted_by_y))
+ assert((left_forward) or (not(sorted_by_x and sorted_by_y)))
+
+ if(sorted_by_x): # Sorted by x positions.
+ #print "Sorted by X"
+ x_pos = left.x_start + left.x_length # Start of the intra-run gap.
+ x_len = right.x_start - x_pos # Length of the intra-run gap.
+ if(left_forward):
+ y_pos = left.y_start + left.y_length
+ y_len = right.y_start - y_pos
+ else:
+ y_pos = right.y_start + right.y_length
+ y_len = left.y_start - y_pos
+ # end if
+ else: # Assume sorted by y positions
+ #print "Sorted by Y"
+ y_pos = left.y_start + left.y_length
+ y_len = right.y_start - y_pos
+ if(left_forward):
+ x_pos = left.x_start + left.x_length
+ x_len = right.x_start - x_pos
+ else:
+ x_pos = right.x_start + right.x_length
+ x_len = left.x_start - x_pos
+ # end if
+ # end if
+ # print "Left %d,%d Right %d,%d Width %d,%d" % (x_pos,y_pos,x_pos+x_len,y_pos+y_len,x_len,y_len)
+
+ assert(left.x_start >= 0)
+ assert(left.x_length > 0)
+ assert(left.y_start >= 0)
+ assert(left.y_length > 0)
+ assert(right.x_start >= 0)
+ assert(right.x_length > 0)
+ assert(right.y_start >= 0)
+ assert(right.y_length > 0)
+
+ # Trim the intra-run neggaps to become proper gaps.
+ # NOTE: the leading "0 and" makes this condition always false, so the
+ # trimming code up to the matching "# end if" never runs.
+ if(0 and (x_len < 0 or y_len < 0)):
+ sys.stderr.write("neggap x_uid= %s x_pos= %d x_len= %d y_uid= %s y_pos= %d y_len= %d\n" %
+ (left.x_scaf_uid,x_pos,x_len,left.y_scaf_uid,y_pos,y_len))
+ trim_len = max( -x_len, -y_len)
+ # Increase the intra-run gap length:
+ x_len += trim_len; y_len += trim_len
+ # Decrease the right-hand match length:
+ right.x_length -= trim_len
+ right.y_length -= trim_len
+ # Adjust the right-hand gap ending position:
+ if(left_forward):
+ right.x_start += trim_len
+ right.y_start += trim_len
+ else:
+ if(sorted_by_x):
+ right.x_start += trim_len
+ y_pos -= trim_len
+ else: # assume sorted_by_y
+ right.y_start += trim_len
+ x_pos -= trim_len
+ # end if
+ #end if
+ sys.stderr.write("newgap x_uid= %s x_pos= %d x_len= %d y_uid= %s y_pos= %d y_len= %d\n" %
+ (left.x_scaf_uid,x_pos,x_len,left.y_scaf_uid,y_pos,y_len))
+ # end if
+ assert(right.x_length > 0)
+ assert(right.y_length > 0)
+ # We now have a proper intra-run gap segment between two match segments.
+
+ # Fetch the upper-cased gap sequence on each axis; a non-positive gap
+ # length leaves the corresponding string empty.
+ x_substring = ""
+ if(x_len > 0):
+ x_substring = string.upper(
+ x.getStringFromFasta( sorted_by_x, left.x_scaf_uid, x_pos, x_len));
+ # end if
+ y_substring = ""
+ if(y_len > 0):
+ y_substring = string.upper(
+ y.getStringFromFasta( sorted_by_y, left.y_scaf_uid, y_pos, y_len));
+ # end if
+ if(x_len > 0 and not(x_len == len(x_substring))):
+ sys.stderr.write("x string lengths mismatch asked=%d got=%d\n" % (x_len,len(x_substring)))
+ sys.stderr.write("x_uid= %s x_pos= %d x_len= %d y_uid= %s y_pos= %d y_len= %d\n" %
+ (left.x_scaf_uid,x_pos,x_len,left.y_scaf_uid,y_pos,y_len))
+ print >>sys.stderr, "left match"
+ print >>sys.stderr, left
+ print >>sys.stderr, "right match"
+ print >>sys.stderr, right
+ # end if
+ if(y_len > 0 and not(y_len == len(y_substring))):
+ sys.stderr.write("y string lengths mismatch asked=%d got=%d\n" % (y_len,len(y_substring)))
+ sys.stderr.write("x_uid= %s x_pos= %d x_len= %d y_uid= %s y_pos= %d y_len= %d\n" %
+ (left.x_scaf_uid,x_pos,x_len,left.y_scaf_uid,y_pos,y_len))
+ print >>sys.stderr, "left match"
+ print >>sys.stderr, left
+ print >>sys.stderr, "right match"
+ print >>sys.stderr, right
+ # end if
+
+ assert(x_len < 0 or x_len == len(x_substring))
+ assert(y_len < 0 or y_len == len(y_substring))
+ assert(lp == 0)
+ assert(rp == 0)
+
+ # Next we extend the raw matches to squeeze the intra-run gaps
+ # with exactly matching sequence.
+ if( lp+rp < x_len and lp+rp < y_len ):
+ while(lp+rp < x_len and lp+rp < y_len):
+ # modify lp
+ x_ch = x_substring[lp]; y_ch = y_substring[lp];
+ is_a_match = (x_ch==y_ch) and \
+ (x_ch=="A" or x_ch=="C" or x_ch=="G" or x_ch=="T")
+ if(is_a_match):
+ lp += 1
+ else:
+ break
+ # end if
+ # end while
+ while(lp+rp < x_len and lp+rp < y_len):
+ # modify rp
+ x_ch = x_substring[-1-rp]; y_ch = y_substring[-1-rp];
+ is_a_match = (x_ch==y_ch) and \
+ (x_ch=="A" or x_ch=="C" or x_ch=="G" or x_ch=="T")
+ if(is_a_match):
+ rp += 1
+ else:
+ break
+ # end if
+ # end while
+ # end if
+
+ # Next we extend the raw matches to squeeze the intra-run gaps.
+ # Each mismatch character must be padded on both sides by
+ # "solidThreshold" characters from {A,C,G,T}.
+ # lq (and later rq) scans ahead of the committed position lp (rp); a
+ # mismatch is held in tentativeSNPCount and only added to
+ # theIsolatedSNPcount once the scan position is committed.
+ if( x_len > lp+rp and y_len > lp+rp ):
+ lq = lp; solid = solidThreshold; tentativeSNPCount = 0
+ while(lq+rp < x_len and lq+rp < y_len):
+ x_ch = x_substring[lq]; y_ch = y_substring[lq];
+ is_a_match = (x_ch==y_ch) and \
+ (x_ch=="A" or x_ch=="C" or x_ch=="G" or x_ch=="T")
+ if(solid >= solidThreshold):
+ lp = lq
+ theIsolatedSNPcount += tentativeSNPCount
+ tentativeSNPCount = 0
+ if(is_a_match):
+ solid += 1
+ lp += 1
+ else:
+ solid = 0
+ tentativeSNPCount = 1
+ tentativeSNPposition = lq
+ # end if
+ else:
+ if(is_a_match):
+ solid += 1
+ else: # a second mismatch within 20 bp
+ break
+ # end if
+ # end if
+ lq += 1
+ # end while
+ if(lq+rp == x_len and lq+rp == y_len):
+ lp = lq
+ theIsolatedSNPcount += tentativeSNPCount
+ tentativeSNPCount = 0
+ # end if
+ rq = rp; solid = solidThreshold; tentativeSNPCount = 0
+ while( lp+rq < x_len and lp+rq < y_len ):
+ x_ch = x_substring[-1-rq]; y_ch = y_substring[-1-rq];
+ is_a_match = (x_ch==y_ch) and \
+ (x_ch=="A" or x_ch=="C" or x_ch=="G" or x_ch=="T")
+ if(solid >= solidThreshold):
+ rp = rq
+ theIsolatedSNPcount += tentativeSNPCount
+ tentativeSNPCount = 0
+ if(is_a_match):
+ solid += 1
+ rp += 1
+ else:
+ solid = 0
+ tentativeSNPCount = 1
+ # end if
+ else:
+ if(is_a_match):
+ solid += 1
+ else: # a second mismatch within 20 bp
+ break
+ # end if
+ # end if
+ rq += 1
+ # end while
+ if( lp+rq == x_len and lp+rq == y_len ):
+ rp = rq
+ theIsolatedSNPcount += tentativeSNPCount
+ tentativeSNPCount = 0
+ # end if
+ # end if
+
+ # Next we close any remaining intra-run gaps that can form
+ # ungapped alignments of a specified high quality.
+ # Currently we have hard coded that there must be
+ # 5 or less mismatches or
+ # better than 95% identity
+ # in the intrarun gap remaining after the previous gap closing.
+ assert(lp >= 0)
+ assert(rp >= 0)
+ if( x_len == y_len and x_len > lp+rp):
+ lq = lp; mismatchCount = 0
+ while(lq+rp < x_len and lq+rp < y_len):
+ x_ch = x_substring[lq]; y_ch = y_substring[lq];
+ is_a_match = (x_ch==y_ch) and \
+ (x_ch=="A" or x_ch=="C" or x_ch=="G" or x_ch=="T")
+ if(not is_a_match): mismatchCount += 1
+ lq += 1
+ # end while
+ if(mismatchCount <= 5 or
+ mismatchCount <= 0.05*(x_len-lp-rp) ):
+ lp=lq
+ # sys.stderr.write("# Closed gap by jumping\n");
+ # end if
+ # end if
+
+ # Disabled debugging dump of the raw and squeezed gap strings.
+ if(0):
+ sys.stderr.write( "rawX: %s\n" % x_substring)
+ sys.stderr.write( "rawY: %s\n" % y_substring)
+ sys.stderr.write( "squX: %s\n" % x_substring[:lp]+ \
+ string.lower(x_substring[lp:x_len-rp]) + \
+ x_substring[x_len-rp:x_len])
+ sys.stderr.write( "squY: %s\n" % y_substring[:lp]+ \
+ string.lower(y_substring[lp:y_len-rp]) + \
+ y_substring[y_len-rp:y_len])
+ sys.stderr.write( "sqeX: %s\n" % x_substring[lp:x_len-rp])
+ sys.stderr.write( "sqeY: %s\n" % y_substring[lp:y_len-rp])
+ sys.stderr.write( "x_seg=(%s %s %d %d)" % \
+ (left.x_orientation, left.x_scaf_uid, x_pos+lp, x_len-rp-lp))
+ sys.stderr.write( "y_seg=(%s %s %d %d)\n" % \
+ (left.y_orientation, left.y_scaf_uid, y_pos+lp, y_len-rp-lp))
+
+ # Report gaps where more characters were absorbed than the gap holds.
+ if(lp+rp > x_len and x_len >= 0):
+ sys.stderr.write("overfilledX ")
+ sys.stderr.write( "x_seg=(%s %s %d %d) " % \
+ (left.x_orientation, left.x_scaf_uid, x_pos+lp, x_len-rp-lp))
+ sys.stderr.write( "y_seg=(%s %s %d %d)\n" % \
+ (left.y_orientation, left.y_scaf_uid, y_pos+lp, y_len-rp-lp))
+ if(lp+rp > y_len and y_len >= 0):
+ sys.stderr.write("overfilledY ")
+ sys.stderr.write( "x_seg=(%s %s %d %d) " % \
+ (left.x_orientation, left.x_scaf_uid, x_pos+lp, x_len-rp-lp))
+ sys.stderr.write( "y_seg=(%s %s %d %d)\n" % \
+ (left.y_orientation, left.y_scaf_uid, y_pos+lp, y_len-rp-lp))
+ # Classify the outcome: gap still open on both axes, exactly filled on
+ # x only, exactly filled on y only, or exactly filled on both.
+ if(lp+rp < x_len and lp+rp < y_len): completefillednotXY += 1
+ if(lp+rp == x_len and x_len < y_len): completefilledXnotY += 1
+ if(lp+rp == y_len and y_len < x_len): completefilledYnotX += 1
+ if(lp+rp == x_len and x_len == y_len): completefilledXandY += 1
+
+ # Print out abutting intervals to fill gaps.
+ if(lp>0):
+ left_fill = left.copy()
+ if(left_forward):
+ left_fill.subtype = "L"
+ left_fill.x_start = x_pos
+ left_fill.x_length = lp
+ left_fill.y_start = y_pos
+ left_fill.y_length = lp
+ left_fill.matchid = left_fill.matchid + "L"
+
+ else:
+ if(sorted_by_x):
+ left_fill.subtype = "L"
+ left_fill.x_start = x_pos
+ left_fill.x_length = lp
+ left_fill.y_start = y_pos+y_len-lp
+ left_fill.y_length = lp
+ left_fill.matchid = left_fill.matchid + "L"
+ else: # assume sorted_by_y
+ left_fill.subtype = "L"
+ left_fill.x_start = x_pos+x_len-lp
+ left_fill.x_length = lp
+ left_fill.y_start = y_pos
+ left_fill.y_length = lp
+ left_fill.matchid = left_fill.matchid + "L"
+ # end if
+ # end if
+ # outfile.write(str(left_fill))
+ print >>outfile, left_fill
+ # end if
+ if(rp>0):
+ right_fill = right.copy()
+ if(left_forward):
+ right_fill.subtype = "R"
+ right_fill.x_start = x_pos+x_len-rp
+ right_fill.x_length = rp
+ right_fill.y_start = y_pos+y_len-rp
+ right_fill.y_length = rp
+ right_fill.matchid = right_fill.matchid + "R"
+ else:
+ if(sorted_by_x):
+ right_fill.subtype = "R"
+ right_fill.x_start = x_pos+x_len-rp
+ right_fill.x_length = rp
+ right_fill.y_start = y_pos
+ right_fill.y_length = rp
+ right_fill.matchid = right_fill.matchid + "R"
+ else: # assume sorted_by_y
+ right_fill.subtype = "R"
+ right_fill.x_start = x_pos
+ right_fill.x_length = rp
+ right_fill.y_start = y_pos+y_len-rp
+ right_fill.y_length = rp
+ right_fill.matchid = right_fill.matchid + "R"
+ # end if
+ # end if
+ #outfile.write(str(right_fill))
+ print >>outfile, right_fill
+ # end if
+
+ if(0): # Start gap composition diagnostics.
+ if( (x_len > lp+rp) or (y_len > lp+rp) ):
+ for ch in x_substring[lp:x_len-rp]:
+ if(not(ch=='A' or ch=='C' or ch=='G' or ch=='T')):
+ x_notACGT += 1
+ try:
+ x_chCount[ch] += 1
+ except KeyError:
+ x_chCount[ch] = 1
+ for ch in y_substring[lp:y_len-rp]:
+ if(not(ch=='A' or ch=='C' or ch=='G' or ch=='T')):
+ y_notACGT += 1
+ try:
+ y_chCount[ch] += 1
+ except KeyError:
+ y_chCount[ch] = 1
+ if(1 or x_notACGT > 0 or y_notACGT > 0):
+ sys.stderr.write("Ncounts %d %d\n" % (x_notACGT,y_notACGT))
+ sys.stderr.write("x_gap_len= %d y_gap_len= %d\n" % (x_len-lp-rp,y_len-lp-rp))
+ sys.stderr.write("x_seg=(%s %s %d %d)\n" % \
+ (left.x_orientation, left.x_scaf_uid, x_pos+lp, x_len-lp-rp))
+ sys.stderr.write("y_seg=(%s %s %d %d)\n" % \
+ (left.y_orientation, left.y_scaf_uid, y_pos+lp, y_len-lp-rp))
+ # sys.stderr.write("x_chCount= ")
+ # sys.stderr.write(x_chCount)
+ # sys.stderr.write("y_chCount= ")
+ # sys.stderr.write( y_chCount);
+ else:
+ # sys.stderr.write("Inter-run gap\n")
+ inter_run_gap_count += 1
+ # sys.stderr.write("done\n")
+ # squeezed is the total number of gap characters absorbed from both ends.
+ squeezed = lp+rp
+ return (inter_run_gap_count,
+ squeezed,x_len-squeezed,y_len-squeezed,x_notACGT,y_notACGT)
+# end def
+
+def mainLoop( inpfile, outfile, xIdx, yIdx):
+ # Stream the match records of inpfile to outfile, calling analyzeGap() on
+ # every pair of consecutive 'M' lines so that its fill records are written
+ # between them, then write summary totals to stderr.
+ #
+ # inpfile -- seekable, iterable file object; it is rewound with seek(0).
+ # outfile -- file-like object receiving the original and fill records.
+ # xIdx, yIdx -- passed straight through to analyzeGap() as x and y.
+ #
+ # Lines whose first character is not 'M' are neither parsed nor copied.
+
+ maxgap = 100000 # This should be set by an ATAC global.
+ margin = 20 # This should be set by an ATAC global.
+
+ countLines = 0
+ inter_run_gap_count_total = 0
+ closed_gap_count_total = 0
+ squeezed_total = 0
+ x_len_total = 0
+ y_len_total = 0
+ x_nonACGT_total = 0
+ y_nonACGT_total = 0
+
+ inpfile.seek(0)
+ inpfileIter = iter(inpfile)
+
+ # First pass over the shared iterator: find and emit the first match record.
+ left = None
+ for line in inpfileIter:
+ if(line[0] == 'M'):
+ left = MatchRecord.MatchRecord(line)
+ # outfile.write(str(left))
+ print >>outfile, left
+ countLines += 1
+ break;
+
+ # Second loop resumes the same iterator after the first match record.
+ for line in inpfileIter:
+ if(line[0] == 'M'):
+ right = MatchRecord.MatchRecord(line)
+
+ #if( countLines % 10000 == 0):
+ # sys.stderr.write("countLines=%d\n" % countLines)
+
+ (inter_run_gap_count,squeezed,x_len,y_len,x_notACGT,y_notACGT) \
+ = analyzeGap(xIdx,yIdx,left,right, outfile, maxgap, margin)
+ inter_run_gap_count_total += inter_run_gap_count
+ squeezed_total += squeezed
+ x_len_total += x_len
+ y_len_total += y_len
+ x_nonACGT_total += x_notACGT
+ y_nonACGT_total += y_notACGT
+ # NOTE(review): analyzeGap() also returns 0,0 here for inter-run gaps, so
+ # those appear to be counted as closed gaps too -- confirm this is intended.
+ if(x_len == 0 and y_len == 0): closed_gap_count_total += 1
+
+ # Output the record which was possibly trimmed.
+ #outfile.write(str(right))
+ print >>outfile, right
+ countLines += 1
+ left = right
+ # end if
+ # end for
+
+ sys.stderr.write(
+ "countLines %d inter_run_gap_count %d closed_gap_count %d squeezed %d x_len %d y_len %d x_nonACGT %d y_nonACGT %d\n" %
+ (countLines,inter_run_gap_count_total,closed_gap_count_total,
+ squeezed_total,x_len_total,y_len_total,x_nonACGT_total,y_nonACGT_total))
+
+ # Module-level counters maintained by analyzeGap().
+ sys.stderr.write("theIsolatedSNPcount = %d\n" % theIsolatedSNPcount)
+ sys.stderr.write("completefillednotXY = %d\n" % completefillednotXY)
+ sys.stderr.write("completefilledXnotY = %d\n" % completefilledXnotY)
+ sys.stderr.write("completefilledYnotX = %d\n" % completefilledYnotX)
+ sys.stderr.write("completefilledXandY = %d\n" % completefilledXandY)
+
+# end def
+
+# Allow each module to have its own main for testing.
+if __name__ == '__main__':
+
+ inpname = sys.argv[1]
+ outname = sys.argv[2]
+ xname = sys.argv[3]
+ yname = sys.argv[4]
+ assemblyId1 = sys.argv[5]
+ assemblyId2 = sys.argv[6]
+
+ # mismatches = checkExactMatches( x, y, inpfile)
+ # sys.stderr.write("mismatches = %d\n" % mismatches)
+
+ xIdx = IdxStore.IdxStore(xname,assemblyId1)
+ yIdx = IdxStore.IdxStore(yname,assemblyId2)
+
+ inpfile = open(inpname)
+ outfile = open(outname,"w")
+ mainLoop( inpfile, outfile, xIdx, yIdx)
+ outfile.close()
+# end if
diff --git a/atac-driver/chimera/Make.include b/atac-driver/chimera/Make.include
new file mode 100644
index 0000000..74cdd19
--- /dev/null
+++ b/atac-driver/chimera/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+#
+# Build description for the happy-clones-span-clumps executable.
+# NOTE(review): "$/" is presumably the path prefix of this directory,
+# supplied by the makefile that includes this fragment -- confirm there.
+
+# Absolute locations (with trailing slash) of the libraries used below.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# The C++ executable of this directory and the source it is built from.
+$/.CXX_EXES := $/happy-clones-span-clumps
+$/.CXX_SRCS := $/happy-clones-span-clumps.C
+
+# File patterns listed for cleaning.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# The executable depends on its object file and on the four static libraries.
+$/happy-clones-span-clumps: $/happy-clones-span-clumps.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for this directory's
+# .d and .o targets only.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/chimera/happy-clones-span-clumps.C b/atac-driver/chimera/happy-clones-span-clumps.C
new file mode 100644
index 0000000..0cbcd2b
--- /dev/null
+++ b/atac-driver/chimera/happy-clones-span-clumps.C
@@ -0,0 +1,484 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+#include <map>
+
+#include "util++.H"
+
+// Reads a clump-annotated atac file, builds a search tree of all the
+// matches in those clumps. Then reads a list of happy clones mapped
+// to the sequence, figures out what clump each read in the clone is
+// in, and reports whenever the clone spans a clump.
+
+
+// Contains a list of intervalLists, one for each clump. The
+// intervalList stores the positions of the matches in this clump.
+//
+class atacClumpCoordTreeScaffold {
+public:
+  // Allocates room for clumpsMax clumps and a zeroed clumpsMax x clumpsMax
+  // confirmation matrix.  The scratch buffer handed to
+  // intervalList::overlapping() starts out empty.
+  atacClumpCoordTreeScaffold() {
+    clumpsLen    = 0;
+    clumpsMax    = 64;
+    clumpID      = new uint32 [clumpsMax];
+    clumps       = new intervalList<uint64> * [clumpsMax];
+    clumpmin     = new uint32 [clumpsMax];
+    clumpmax     = new uint32 [clumpsMax];
+
+    clumpconfirm = new uint32 [clumpsMax * clumpsMax];
+
+    for (uint32 i=0; i<clumpsMax * clumpsMax; i++)
+      clumpconfirm[i] = 0;
+
+    intervalsLen = 0;
+    intervalsMax = 0;
+    intervals    = 0L;
+  };
+
+  // Releases every per-clump interval list and all of the parallel arrays.
+  ~atacClumpCoordTreeScaffold() {
+    for (uint32 i=0; i<clumpsLen; i++)
+      delete clumps[i];
+    delete [] clumpID;
+    delete [] clumps;
+    delete [] clumpmin;
+    delete [] clumpmax;
+    delete [] clumpconfirm;
+    delete [] intervals;
+  };
+
+  // Add a match to some clump.
+  //
+  // A negative clumpid means "not in a clump" and is ignored.  The match is
+  // added to the clump's interval list and the clump extent
+  // [clumpmin, clumpmax] is widened to cover it.  A previously unseen
+  // clumpid creates a new clump; the program exits when clumpsMax clumps
+  // already exist.
+  //
+  void  addMatch(int32 clumpid, uint32 begin, uint32 length) {
+
+    // Not in a clump, get the heck outta here!
+    //
+    if (clumpid < 0)
+      return;
+
+    // Linear search through the clumps to find the correct id, we
+    // don't expect to have many clumps per scaffold.
+    //
+    for (uint32 i=0; i<clumpsLen; i++) {
+      if (clumpID[i] == (uint32)clumpid) {
+        clumps[i]->add(begin, length);
+        if (clumpmin[i] > begin)
+          clumpmin[i] = begin;
+        if (clumpmax[i] < begin + length)
+          clumpmax[i] = begin + length;
+        return;
+      }
+    }
+
+    if (clumpsLen == clumpsMax) {
+      fprintf(stderr, "ERROR: increase clumpsMax!\n");
+      exit(1);
+    }
+
+    // Didn't add to an existing clump, so must be a new clump.
+    //
+    clumpID[clumpsLen]  = clumpid;
+    clumps[clumpsLen]   = new intervalList<uint64>;
+    clumps[clumpsLen]->add(begin, length);
+    clumpmin[clumpsLen] = begin;
+    clumpmax[clumpsLen] = begin + length;
+    clumpsLen++;
+  };
+
+
+  // Returns the ID of the clump whose matches overlap [begin, end]:
+  //   0            when no clump overlaps,
+  //   the clump ID when exactly one clump overlaps,
+  //   ~uint32ZERO  when more than one clump overlaps.
+  //
+  // NOTE(review): addMatch() accepts clumpid 0, which cannot be told apart
+  // from "no clump" in this return value.
+  //
+  uint32  getClumpID(uint32 begin, uint32 end) {
+    uint32  clumpid = 0;
+    uint32  numhits = 0;
+
+    // We can make this much quicker if we remember the extent of
+    // each interval list.
+    //
+    // We want to allow partial matches, so check that the end is
+    // above the min, and the begin is before the max.
+    //
+    //       b-------e        b-----e
+    //           -------clump------
+    //
+    for (uint32 i=0; i<clumpsLen; i++) {
+      if ((clumpmin[i] <= end) && (begin <= clumpmax[i])) {
+        if (clumps[i]->overlapping(begin, end, intervals, intervalsLen, intervalsMax) > 0) {
+          clumpid = clumpID[i];
+          numhits++;
+        }
+      } else {
+        // If you really want to check....
+        //if (clumps[i]->overlapping(begin, end, intervals, intervalsLen, intervalsMax) > 0)
+        //  fprintf(stderr, "WARNING: Found overlapping clump outside extent!\n");
+      }
+    }
+
+    if (numhits == 0)
+      return(0);
+    if (numhits == 1)
+      return(clumpid);
+
+    //fprintf(stderr, "FOUND MORE THAN ONE CLUMP MATCHING!\n");
+    return(~uint32ZERO);
+  };
+
+
+  // Orders the clumps by increasing clumpmin, moving the four parallel
+  // arrays together.  The confirmation matrix is NOT reordered, so call this
+  // before any confirm().
+  //
+  void  sortClumps(void) {
+    uint32                 ciid;
+    intervalList<uint64>  *cptr;
+    uint32                 cmin;
+    uint32                 cmax;
+
+    uint32  i = 0;
+    uint32  j = 0;
+
+    // an insertion sort
+
+    for (i=clumpsLen; i--; ) {
+      ciid = clumpID[i];
+      cptr = clumps[i];
+      cmin = clumpmin[i];
+      cmax = clumpmax[i];
+
+      for (j=i+1; (j < clumpsLen) && (cmin > clumpmin[j]); j++) {
+        clumpID[j-1]  = clumpID[j];
+        clumps[j-1]   = clumps[j];
+        clumpmin[j-1] = clumpmin[j];
+        clumpmax[j-1] = clumpmax[j];
+      }
+
+      clumpID[j-1]  = ciid;
+      clumps[j-1]   = cptr;
+      clumpmin[j-1] = cmin;
+      clumpmax[j-1] = cmax;
+    }
+  };
+
+
+  // Records one more confirmation for the clump pair (ca, cb).
+  //
+  // Both IDs must belong to this scaffold.  If either is unknown the call
+  // is ignored; previously an unknown ID silently fell back to index 0 and
+  // credited the confirmation to the wrong clump.
+  //
+  void  confirm(uint32 ca, uint32 cb) {
+    uint32  caidx = clumpsLen;   //  clumpsLen means "not found"
+    uint32  cbidx = clumpsLen;
+
+    for (uint32 i=0; i<clumpsLen; i++) {
+      if (ca == clumpID[i])
+        caidx = i;
+      if (cb == clumpID[i])
+        cbidx = i;
+    }
+
+    if ((caidx == clumpsLen) || (cbidx == clumpsLen))
+      return;
+
+    clumpconfirm[caidx * clumpsMax + cbidx]++;
+  };
+
+
+  uint32                  clumpsLen;      //  number of clumps in use
+  uint32                  clumpsMax;      //  allocated number of clumps
+  uint32                 *clumpID;        //  ID of each clump
+  intervalList<uint64>  **clumps;         //  match intervals of each clump
+  uint32                 *clumpmin;       //  smallest match begin per clump
+  uint32                 *clumpmax;       //  largest match end per clump
+
+  uint32                 *clumpconfirm;   //  clumpsMax x clumpsMax counts, row-major
+
+  //  Scratch space passed to intervalList::overlapping().
+  uint32                  intervalsLen;
+  uint32                  intervalsMax;
+  uint32                 *intervals;
+};
+
+
+// Maps a scaffold id to the clumps found on that scaffold.  Scaffold
+// entries are created lazily by addMatch(); ids must be below scaffoldsMax.
+//
+class atacClumpCoordTree {
+public:
+  atacClumpCoordTree() {
+    scaffoldsMax = 262144;
+    scaffolds    = new atacClumpCoordTreeScaffold * [scaffoldsMax];
+    for (uint32 i=0; i<scaffoldsMax; i++)
+      scaffolds[i] = 0L;
+  };
+  ~atacClumpCoordTree() {
+    for (uint32 i=0; i<scaffoldsMax; i++)
+      if (scaffolds[i])
+        delete scaffolds[i];
+    delete [] scaffolds;
+  };
+
+
+  // Adds one match to the given clump of the given scaffold, creating the
+  // scaffold entry on first use.  Exits when scaffoldid is out of range.
+  //
+  void  addMatch(uint32 scaffoldid, int32 clumpid, uint32 begin, uint32 length) {
+    if (scaffoldid >= scaffoldsMax) {
+      fprintf(stderr, "ERROR: increase scaffoldsMax " uint32FMT "\n", scaffoldid);
+      exit(1);
+    }
+
+    if (scaffolds[scaffoldid] == 0L)
+      scaffolds[scaffoldid] = new atacClumpCoordTreeScaffold;
+
+    scaffolds[scaffoldid]->addMatch(clumpid, begin, length);
+  };
+
+
+  // Drops every scaffold that has fewer than two clumps and sorts the
+  // clumps of each scaffold that is kept.  Reports both counts on stderr.
+  //
+  void  removeSingleClumpScaffolds(void) {
+    uint32  deleted = 0;
+    uint32  remain  = 0;
+
+    for (uint32 i=0; i<scaffoldsMax; i++) {
+      if ((scaffolds[i]) && (scaffolds[i]->clumpsLen < 2)) {
+        delete scaffolds[i];
+        scaffolds[i] = 0L;
+        deleted++;
+      }
+      if (scaffolds[i]) {
+        scaffolds[i]->sortClumps();
+        remain++;
+      }
+    }
+
+    //  The scaffolds kept have clumpsLen >= 2, so say "at least 2".
+    fprintf(stderr, "Deleted " uint32FMT " scaffolds with less than 2 clumps.\n", deleted);
+    fprintf(stderr, "Remain " uint32FMT " scaffolds with at least 2 clumps.\n", remain);
+  };
+
+
+  // For every scaffold with two or more clumps, prints to stdout the
+  // extent of each clump, whether it overlaps the next clump in array
+  // order, and every non-zero confirmation count.
+  //
+  void  showMultipleClumpScaffolds(void) {
+
+    for (uint32 i=0; i<scaffoldsMax; i++) {
+      if ((scaffolds[i]) && (scaffolds[i]->clumpsLen >= 2)) {
+
+        fprintf(stdout, "\n");
+
+        for (uint32 j=0; j<scaffolds[i]->clumpsLen; j++) {
+          bool  overlap = false;
+
+          if ((j+1 < scaffolds[i]->clumpsLen) &&
+              (scaffolds[i]->clumpmax[j] > scaffolds[i]->clumpmin[j+1]))
+            overlap = true;
+
+          fprintf(stdout, "scaffold " uint32FMT " clump " uint32FMT " begin " uint32FMT " end " uint32FMT "\n",
+                  i,
+                  scaffolds[i]->clumpID[j],
+                  scaffolds[i]->clumpmin[j],
+                  scaffolds[i]->clumpmax[j]);
+
+          if (overlap)
+            fprintf(stdout, "scaffold " uint32FMT " clump " uint32FMT " and clump " uint32FMT " OVERLAP\n",
+                    i,
+                    scaffolds[i]->clumpID[j],
+                    scaffolds[i]->clumpID[j+1]);
+
+          for (uint32 b=0; b<scaffolds[i]->clumpsLen; b++) {
+            uint32  cc = j * scaffolds[i]->clumpsMax + b;
+            if (scaffolds[i]->clumpconfirm[cc]) {
+              fprintf(stdout, "scaffold " uint32FMT " clump " uint32FMT " and " uint32FMT " confirmed by " uint32FMT " clones.\n",
+                      i,
+                      scaffolds[i]->clumpID[j],
+                      scaffolds[i]->clumpID[b],
+                      scaffolds[i]->clumpconfirm[cc]);
+
+            }
+          }
+        }
+      }
+    }
+  };
+
+
+  // Forwards to the scaffold's getClumpID().  Returns 0 when the scaffold
+  // id is out of range or no entry exists for it.
+  //
+  uint32  getClumpID(uint32 scaffoldid, uint32 begin, uint32 end) {
+    if ((scaffoldid < scaffoldsMax) && (scaffolds[scaffoldid]))
+      return(scaffolds[scaffoldid]->getClumpID(begin, end));
+    return(0);
+  };
+
+  // Forwards to the scaffold's confirm().  Does nothing when the scaffold
+  // id is out of range or no entry exists for it.
+  //
+  void    confirmClump(uint32 scaffoldid, uint32 ca, uint32 cb) {
+    if ((scaffoldid < scaffoldsMax) && (scaffolds[scaffoldid])) {
+      scaffolds[scaffoldid]->confirm(ca, cb);
+    }
+  };
+
+
+  uint32                        scaffoldsMax;   //  size of scaffolds[]
+  atacClumpCoordTreeScaffold  **scaffolds;      //  indexed by scaffold id; 0L when unused
+};
+
+
+
+
+
+// Reads a clump-annotated atac file and returns a newly allocated tree
+// holding every "M u" match that carries a clump id; the caller owns the
+// result.  Exits if the file cannot be opened.
+//
+// An empty file now yields an empty tree.  The original returned 0L in that
+// case, leaking the tree and the FILE, and main() dereferenced the result.
+//
+// Fields used from each record (0-based words): 8 = "name:scaffoldid",
+// 9 = begin, 10 = length, 12 = "#", 13 = clump id ("-1" when not in a clump).
+//
+atacClumpCoordTree*
+buildCoordTree(char *clumpFile) {
+  atacClumpCoordTree  *ct = new atacClumpCoordTree;
+  char                 inl[1024];
+
+  // We can't use the built-in atac reader, because it strips out
+  // clump information.  Bummer.
+
+  //  Test the returned pointer; errno is only meaningful after a failure.
+  FILE  *inf = fopen(clumpFile, "r");
+  if (inf == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", clumpFile, strerror(errno)), exit(1);
+
+  //  Loop on fgets() itself so that a last line without a newline is still
+  //  processed and a failed read never re-uses stale buffer contents.
+  while (fgets(inl, sizeof(inl), inf) != NULL) {
+
+    //  Only ungapped matches; inl[1] is tested first so that inl[2] is never
+    //  read past the terminator of a one-character line.
+    if ((inl[0] != 'M') || (inl[1] == 0) || (inl[2] != 'u'))
+      continue;
+
+    splitToWords  S(inl);
+
+    //fprintf(stderr, "%s", inl);
+
+    if (S[12][0] != '#')
+      fprintf(stderr, "no clump for '%s'\n", inl);
+
+    if (S[13][0] != '-') {
+      //  The scaffold id follows the ':' in "name:id"; stop at the end of the
+      //  word instead of running off it when there is no ':'.
+      char *scfid = S[8];
+      while ((*scfid != 0) && (*scfid != ':'))
+        scfid++;
+
+      if (*scfid != ':') {
+        fprintf(stderr, "no scaffold id in '%s'\n", S[8]);
+        continue;
+      }
+
+      ct->addMatch(atoi(scfid + 1),
+                   atoi(S[13]),
+                   atoi(S[9]),
+                   atoi(S[10]));
+    }
+  }
+
+  fclose(inf);
+
+  return(ct);
+}
+
+
+
+
+
+
+
+
+// Usage: happy-clones-span-clumps -clumps file.atac -happy clones
+//
+// Builds the clump tree from the -clumps file, then reads the -happy file
+// two lines at a time (one line per read of a clone), looks up the clump
+// each read falls in, and reports clones whose two reads land in different
+// clumps.
+//
+int
+main(int argc, char **argv) {
+  char  *clumpFile = 0L;
+  char  *happyFile = 0L;
+
+  //  argv[argc] is a null pointer, so an option given without its value
+  //  leaves the file name unset and is reported below.
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-clumps") == 0) {
+      clumpFile = argv[++arg];
+    } else if (strcmp(argv[arg], "-happy") == 0) {
+      happyFile = argv[++arg];
+    } else {
+      err++;
+    }
+
+    arg++;
+  }
+
+  if (clumpFile == 0L)
+    fprintf(stderr, "No -clumps supplied!\n"), err++;
+  if (happyFile == 0L)
+    fprintf(stderr, "No -happy clones supplied!\n"), err++;
+  if (err)
+    fprintf(stderr, "usage: %s ...\n", argv[0]), exit(1);
+
+  atacClumpCoordTree  *ct = buildCoordTree(clumpFile);
+
+  //  buildCoordTree() may return 0L (it does so for an empty file).
+  if (ct == 0L)
+    fprintf(stderr, "No clumps loaded from '%s'\n", clumpFile), exit(1);
+
+  ct->removeSingleClumpScaffolds();
+  ct->showMultipleClumpScaffolds();
+
+  ////////////////////////////////////////
+  //
+  //  ugly hack -- read in the map from HUREF6A UID to scaffold
+  //
+  map<uint64,uint32>  UIDtoIID;
+
+  {
+    //  String literals are const; binding one to 'char *' is ill-formed in C++11.
+    const char *uidmapName = "/project/huref6/assembly/fasta/HUREF6A.info";
+
+    FILE *F = fopen(uidmapName, "r");
+    if (F == NULL)
+      fprintf(stderr, "Failed to open '%s': %s\n", uidmapName, strerror(errno)), exit(1);
+
+    char  L[1024];
+
+    while (fgets(L, 1024, F) != NULL) {
+      if (L[0] == 'G') {
+        splitToWords  S(L);
+
+        //  Word 13 is the UID preceded by one character, word 10 the IID.
+        UIDtoIID[strtouint64(S[13]+1, 0L)] = strtouint32(S[10], 0L);
+      }
+    }
+
+    fclose(F);   //  was leaked
+  }
+  //
+  ////////////////////////////////////////
+
+  FILE         *inf;
+  char          ina[1024];
+  char          inb[1024];
+
+  speedCounter  S("%9.0f clones (%6.1f clones/sec)\r", 1, 4096, true);
+
+  inf = fopen(happyFile, "r");
+  if (inf == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", happyFile, strerror(errno)), exit(1);
+
+  //  Each clone is two consecutive lines.  Looping on fgets() processes a
+  //  final pair that lacks a trailing newline and stops cleanly on a
+  //  dangling odd line.
+  while ((fgets(ina, 1024, inf) != NULL) &&
+         (fgets(inb, 1024, inf) != NULL)) {
+    chomp(ina);
+    chomp(inb);
+
+    splitToWords  A(ina);
+    splitToWords  B(inb);
+
+    //  Some sanity checking.
+    if (strcmp(A[2], B[2]) != 0) {
+      fprintf(stderr, "ERROR:  Different clone!\n%s\n%s\n", ina, inb);
+    }
+
+    //  NOTE(review): operator[] yields scaffold 0 for a UID that is not in
+    //  the map -- confirm that is acceptable.
+    uint32  scfa = UIDtoIID[strtouint64(A[7], 0L)];
+    uint32  scfb = UIDtoIID[strtouint64(B[7], 0L)];
+
+    uint32  cla = ct->getClumpID(scfa,
+                                 atoi(A[8]),
+                                 atoi(A[9]));
+    uint32  clb = ct->getClumpID(scfb,
+                                 atoi(B[8]),
+                                 atoi(B[9]));
+
+    //  ~uint32ZERO means the read overlaps more than one clump.
+    if (cla == ~uint32ZERO) {
+      fprintf(stdout, "%s spans clump in scaffold %s," uint32FMT "\n", ina, A[7], scfa);
+      cla = 0;
+    }
+    if (clb == ~uint32ZERO) {
+      fprintf(stdout, "%s spans clump in scaffold %s," uint32FMT " \n", inb, B[7], scfb);
+      clb = 0;
+    }
+
+    //  NOTE(review): clb was looked up in scfb but the confirmation is
+    //  recorded on scfa -- confirm this is intended when scfa != scfb.
+    if ((cla != 0) &&
+        (clb != 0) &&
+        (cla != clb)) {
+      ct->confirmClump(scfa,
+                       (cla < clb) ? cla : clb,
+                       (cla < clb) ? clb : cla);
+
+      fprintf(stdout, "scaffold %s," uint32FMT " clump " uint32FMT " " uint32FMT " confirmed by %s\n",
+              A[7], scfa,
+              (cla < clb) ? cla : clb,
+              (cla < clb) ? clb : cla,
+              A[2]);
+    }
+
+    S.tick();
+  }
+
+  fclose(inf);
+
+  S.finish();
+
+  ct->showMultipleClumpScaffolds();
+
+  delete ct;
+
+  return(0);
+}
diff --git a/atac-driver/chimera/use-clumps-to-detect-chimera.pl b/atac-driver/chimera/use-clumps-to-detect-chimera.pl
new file mode 100644
index 0000000..0e19f2b
--- /dev/null
+++ b/atac-driver/chimera/use-clumps-to-detect-chimera.pl
@@ -0,0 +1,138 @@
+#!/usr/bin/perl
+
+# Takes a path to a properly formatted atac file, uses that file to
+# detect potential chimeric scaffolds.
+#
+# A scaffold (column 8 of an "M u" line) is reported when its matches
+# carry more than one clump id (column 13).  For each reported
+# scaffold/clump pair the output line holds: number of matches, summed
+# match length, scaffold, scaffold UID, clump id and the column-4
+# sequence of the last such match.  Output is sorted on the third field.
+
+use strict;
+
+my $atacFile   = undef;
+my $reference  = "A";
+my $noiseLevel = 1;
+
+while (scalar(@ARGV)) {
+    my $arg = shift @ARGV;
+
+    if      ($arg eq "-A") {
+        $reference = "A";
+    } elsif ($arg eq "-B") {
+        $reference = "B";
+    } elsif ($arg eq "-n") {
+        $noiseLevel = shift @ARGV;
+    } elsif (-e $arg) {
+        $atacFile = $arg;
+    } else {
+        print STDERR "Unknown option (or input file) '$arg'\n";
+    }
+}
+
+#  Check definedness first so that a missing file name is not passed to -e.
+if (!defined($atacFile) || (! -e $atacFile)) {
+    print STDERR "usage: $0 [-A | -B] [-n noiseLevel] file.atac\n";
+    print STDERR "  -A    use the first assembly as the reference (default)\n";
+    print STDERR "  -B    use the second assembly as the reference\n";
+    print STDERR "  -n N  report only scaffold/clump pairs with more than N matches (default 1)\n";
+    exit(1);
+}
+
+#  Three-argument open keeps characters in the file name from being
+#  interpreted as an open mode.
+open(my $atacFH, "<", $atacFile) or die "Failed to open '$atacFile': $!\n";
+my @ATAC = <$atacFH>;
+chomp @ATAC;
+close($atacFH);
+
+#  Map from the ATAC sequence name to its UID, for the non-reference assembly.
+my %ATACtoUID;
+
+foreach my $line (@ATAC) {
+    if ($line =~ m/assemblyFile(\d)=(.*)$/) {
+        my ($which, $file) = ($1, $2);
+
+        my $sequenceFile;
+
+        if (($which == 1) && ($reference eq "B")) {
+            $sequenceFile = $file;
+        }
+        if (($which == 2) && ($reference eq "A")) {
+            $sequenceFile = $file;
+        }
+
+        #  If not defined, we don't need to read in these ID's.
+
+        if (defined($sequenceFile)) {
+            #  The dot is escaped so that only a literal ".fasta" is
+            #  rewritten; the original pattern matched any character there.
+            $sequenceFile =~ s/\.fasta/.info/;
+
+            die "Failed to find info on '$sequenceFile'\n" if (! -e $sequenceFile);
+
+            print STDERR "Reading ATAC to UID map for '$sequenceFile'\n";
+
+            open(my $infoFH, "<", $sequenceFile) or die "Failed to open '$sequenceFile': $!\n";
+            while (<$infoFH>) {
+                if (m/^G/) {
+                    my @vals = split /\s+/, $_;
+                    $ATACtoUID{$vals[2]} = $vals[13];
+                }
+            }
+            close($infoFH);
+        }
+    }
+}
+
+
+#  0 1 2           3  4       5     6  7 8            9   10 11 12 13
+#  M u H4467431a11 r1 B35LC:0 56097 66 1 HUREF4:36734 812 66 -1 # 10867
+#  M u H4467431a10 r1 B35LC:0 56163 29 1 HUREF4:36734 782 29 -1 # 10867
+
+#  Note that our match below does not match the non-clump marker "-1"
+
+
+#  Find the scaffolds with errors
+#
+#  Save the clump id for the first instance of every scaffold.  If
+#  we've seen the scaffold before, and the clump id is now different,
+#  remember this scaffold.
+
+my %scaffold;
+my %errors;
+
+foreach (@ATAC) {
+    if (m/^M\su\s.*\s#\s(\d+)$/) {
+        my @v = split /\s+/, $_;
+
+        if (!defined($scaffold{$v[8]})) {
+            $scaffold{$v[8]} = $v[13];
+        } elsif ($scaffold{$v[8]} ne $v[13]) {
+            $errors{$v[8]}++;
+        }
+    }
+}
+
+#  Print them
+#
+#  Go through the map again, remembering the number of times we see a
+#  scaffold/clump pair.  It's also useful to remember the sum of the
+#  lengths for this pair, and the chromosome it maps to.
+
+my %counts;
+my %length;
+my %chrid;
+
+foreach (@ATAC) {
+    if (m/^M\su\s.*\s#\s(\d+)$/) {
+        my @v = split /\s+/, $_;
+
+        if (defined($errors{$v[8]})) {
+            my $string = "$v[8]\t$ATACtoUID{$v[8]}\t$v[13]";
+            $counts{$string}++;
+            $length{$string} += $v[10];
+            $chrid{$string}   = $v[4];
+        }
+    }
+}
+
+#  We could provide a raw dump of this data, but we'd like
+#  to first denoise it.  A very simple denoiser works - just
+#  don't report anything with one match.
+
+open(my $sortFH, "|-", "sort -k3,3") or die "Failed to start 'sort': $!\n";
+foreach my $s (keys %counts) {
+    if ($counts{$s} > $noiseLevel) {
+        print $sortFH "$counts{$s}\t$length{$s}\t$s\t$chrid{$s}\n";
+    }
+}
+#  Closing a pipe waits for the child; report it if sort did not succeed.
+close($sortFH) or warn "'sort' did not finish cleanly\n";
diff --git a/atac-driver/clumpMaker/Make.include b/atac-driver/clumpMaker/Make.include
new file mode 100644
index 0000000..c02e7d6
--- /dev/null
+++ b/atac-driver/clumpMaker/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+#
+# Build description for the clumpMaker executable.
+# NOTE(review): "$/" is presumably the path prefix of this directory,
+# supplied by the makefile that includes this fragment -- confirm there.
+
+# Absolute locations (with trailing slash) of the libraries used below.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# The C++ executable of this directory and the source it is built from.
+$/.CXX_EXES := $/clumpMaker
+$/.CXX_SRCS := $/clumpMaker.C
+
+# File patterns listed for cleaning.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# The executable depends on its object file and on the four static libraries.
+$/clumpMaker: $/clumpMaker.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for this directory's
+# .d and .o targets only.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/clumpMaker/clumpMaker.C b/atac-driver/clumpMaker/clumpMaker.C
new file mode 100644
index 0000000..5fbafdf
--- /dev/null
+++ b/atac-driver/clumpMaker/clumpMaker.C
@@ -0,0 +1,385 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+#include "atac.H"
+
+// Aaron Halpern's clumpMaker algorithm.
+//
+// To reproduce the original clumpMaker exactly, assuming that your
+// atac mapping is for a QUERYvsREFERENCE:
+//
+// in=VISD6vsB35LC/VISD6vsB35LC
+// cut -d' ' -f 1-12 < $in.atac.ckpLast | grep "^M u" | sort -k5,5 -k6n > tmp.a.clumpMaker
+// $clumpMaker -S -c 50000 -2 -f tmp.a.clumpMaker > $in.50000.clumps
+//
+// That is, use only the first 12 columns of info, only ungapped
+// matches (the original also allows gapped matches, but we don't
+// have any of those), then sort by the QUERY iid and position. Yes,
+// the sort is supposed to be alphanumeric.
+//
+// Then, run clumpMaker DISABLING its sort (which sorts iids
+// numerically), using the second sequence as the reference.
+//
+
+// A single atac match viewed as a reference/query hit, together with
+// the per-hit scoring state that is filled in while hits are chained
+// into clumps.
+//
+class tClumpHit {
+public:
+
+  // Copy match m into this hit.  When seq1IsRef is true, sequence 1 of
+  // the match supplies the reference coordinates and sequence 2 the
+  // query coordinates; otherwise the roles are swapped.  The
+  // orientation always comes from fwd2.  Scores, back pointers and the
+  // clump assignment are reset.
+  //
+  void set(atacMatch *m, bool seq1IsRef) {
+    match    = *m;
+    matchIID = m->matchiid;
+
+    refIID = seq1IsRef ? m->iid1 : m->iid2;
+    refBeg = seq1IsRef ? m->pos1 : m->pos2;
+    refEnd = seq1IsRef ? (m->pos1 + m->len1) : (m->pos2 + m->len2);
+
+    qryIID = seq1IsRef ? m->iid2 : m->iid1;
+    qryBeg = seq1IsRef ? m->pos2 : m->pos1;
+    qryEnd = seq1IsRef ? (m->pos2 + m->len2) : (m->pos1 + m->len1);
+
+    if (m->fwd2)
+      ori = 1;
+    else
+      ori = -1;
+
+    scoreStart  = 0;
+    scoreExtend = 0;
+    bestStart   = -1;
+    bestExtend  = -1;
+    clump       = -1;
+  }
+
+  // The larger of the "start a clump here" and "extend a clump to
+  // here" scores.
+  //
+  int64 get_bestScore() const {
+    return((scoreStart > scoreExtend) ? scoreStart : scoreExtend);
+  }
+
+  // Copy of the original match, used when writing output.
+  atacMatch match;
+
+  uint32 matchIID;
+
+  // Reference sequence and the interval [refBeg, refEnd) on it.
+  uint32 refIID;
+  int32 refBeg;
+  int32 refEnd;
+
+  // Query sequence and the interval [qryBeg, qryEnd) on it.
+  uint32 qryIID;
+  int32 qryBeg;
+  int32 qryEnd;
+
+  // +1 for a forward match, -1 for a reverse match.
+  int32 ori;
+
+  // Scores and back pointers (indices into the hit array, -1 for
+  // none), and the clump this hit was assigned to (-1 for none).
+  int32 scoreStart;
+  int32 bestStart;
+  int32 scoreExtend;
+  int32 bestExtend;
+  int32 clump;
+};
+
+
+
+// qsort() comparator: orders hits by query IID, query begin, query
+// end, then reference IID, reference begin, reference end, all
+// ascending.
+//
+int
+clumpHitCompareQry(const void *A, const void *B) {
+  const tClumpHit *lhs = (const tClumpHit *)A;
+  const tClumpHit *rhs = (const tClumpHit *)B;
+
+  if (lhs->qryIID != rhs->qryIID)  return((lhs->qryIID > rhs->qryIID) ? 1 : -1);
+  if (lhs->qryBeg != rhs->qryBeg)  return((lhs->qryBeg > rhs->qryBeg) ? 1 : -1);
+  if (lhs->qryEnd != rhs->qryEnd)  return((lhs->qryEnd > rhs->qryEnd) ? 1 : -1);
+
+  if (lhs->refIID != rhs->refIID)  return((lhs->refIID > rhs->refIID) ? 1 : -1);
+  if (lhs->refBeg != rhs->refBeg)  return((lhs->refBeg > rhs->refBeg) ? 1 : -1);
+  if (lhs->refEnd != rhs->refEnd)  return((lhs->refEnd > rhs->refEnd) ? 1 : -1);
+
+  return(0);
+}
+
+// qsort() comparator: orders hits by ascending match IID.
+//
+int
+clumpHitCompareIID(const void *A, const void *B) {
+  const tClumpHit *lhs = (const tClumpHit *)A;
+  const tClumpHit *rhs = (const tClumpHit *)B;
+
+  if (lhs->matchIID == rhs->matchIID)
+    return(0);
+
+  return((lhs->matchIID > rhs->matchIID) ? 1 : -1);
+}
+
+
+// Decide whether hit b may follow hit a in the same clump.  The hits
+// are expected to be sorted by query, so a precedes b there.
+//
+bool
+chainable(tClumpHit *a, tClumpHit *b, int32 maxjump) {
+
+  // Hits are to different sequences.
+  if (a->refIID != b->refIID)  return(false);
+  if (a->qryIID != b->qryIID)  return(false);
+
+  // Hits are not similarly oriented.
+  if (a->ori != b->ori)  return(false);
+
+  // Hits are too far apart on the query axis.
+  if (b->qryBeg - a->qryEnd > maxjump)  return(false);
+
+  // Hits are too far apart on the reference axis.
+  if (a->ori * (b->refBeg - a->refEnd) > maxjump)  return(false);
+
+  // Hits are "out of order" on the reference axis.
+  if (a->ori * (b->refBeg - a->refBeg) < 0)  return(false);
+
+  return(true);
+}
+
+
+// Score every hit, in array order, for the clump-chaining dynamic
+// program.  For hit i two alternatives are recorded:
+//
+//   scoreStart / bestStart   - hit i begins a new clump; bestStart is
+//                              the index of the hit the previous clump
+//                              ended on (-1 if none).
+//   scoreExtend / bestExtend - hit i extends a clump through an earlier
+//                              chainable() hit; bestExtend is that
+//                              hit's index (-1 if none was chainable).
+//
+// A hit contributes its query length (qryEnd - qryBeg); beginning a
+// clump costs clumpcost.  maxjump bounds how far back on the query
+// axis earlier hits are considered, and is passed to chainable().
+//
+// Returns the index of the best scoring hit seen in the last query
+// sequence, or -1 when num_hits is zero.
+//
+// NOTE(review): the furthest_back scan and chainable() presume hits[]
+// is ordered by query IID and position -- main() sorts with
+// clumpHitCompareQry before calling this.
+//
+int32
+score_all_hits(tClumpHit *hits,
+ int32 clumpcost,
+ int32 maxjump,
+ uint32 num_hits){
+
+ // location of best score so far (to which we point whenever starting a new clump)
+ int32 bestEnd = -1;
+
+
+ // best scores so far internal to a reference unit (scaffold, chromosome, etc)
+ int32 bestEndThis = -1;
+ int32 bestScoreThis = -clumpcost;
+
+ // furthest back still accessible ...
+ uint32 furthest_back=0;
+
+ for(uint32 i=0; i<num_hits; i++) {
+
+ // First hit of a new query sequence: remember where the previous
+ // sequence's best chain ended and reset the per-sequence state.
+ // NOTE(review): for i == 0 the unsigned i-1 wraps around; storing it
+ // in an int32 is presumably relied on to give -1 -- confirm.
+ if ((i==0) || (hits[i].qryIID != hits[i-1].qryIID)) {
+ bestEnd = bestEndThis; // best of previous query unit
+ bestEndThis = i-1;
+ bestScoreThis = -clumpcost;
+ }
+
+ // find best way of using this as start of a new clump
+ if ((bestEndThis >= 0) &&
+ (bestScoreThis >= 0)) {
+ // start new clump that is not the first for this reference unit
+ hits[i].scoreStart = hits[i].qryEnd - hits[i].qryBeg + bestScoreThis - clumpcost;
+ hits[i].bestStart = bestEndThis;
+ } else {
+ // clump would be first (to be used) for this reference unit
+ hits[i].scoreStart = hits[i].qryEnd - hits[i].qryBeg - clumpcost;
+ hits[i].bestStart = bestEnd;
+ }
+
+ // find best way of extending a clump, if any
+ // Advance furthest_back past hits on another query sequence and
+ // past hits ending more than maxjump before this hit begins.
+ if (furthest_back < i) {
+ int32 cutoff = hits[i].qryBeg - maxjump;
+
+ while ((hits[furthest_back].qryIID != hits[i].qryIID) ||
+ (hits[furthest_back].qryEnd < cutoff))
+ furthest_back++;
+ }
+
+ // Among the hits still in range, pick the chainable predecessor
+ // that gives the highest score; -clumpcost / -1 mean "none found".
+ int32 extendScore = -clumpcost;
+ int32 extendprev = -1;
+ for (uint32 j=furthest_back; j<i; j++) {
+ if (chainable(hits+j, hits+i, maxjump)) {
+ int32 tmpscore = hits[j].get_bestScore() + hits[i].qryEnd - hits[i].qryBeg;
+ if(extendScore < tmpscore){
+ extendScore=tmpscore;
+ extendprev=j;
+ }
+ }
+ }
+ hits[i].scoreExtend = extendScore;
+ hits[i].bestExtend = extendprev;
+
+ //figure out whether this is a new best ...
+ int32 tmpscore = hits[i].get_bestScore();
+ if (tmpscore > bestScoreThis) {
+ bestScoreThis = tmpscore;
+ bestEndThis = i;
+ }
+ }
+
+ return(bestEndThis);
+}
+
+
+
+
+
+
+
+
+// clumpMaker driver.
+//
+// Loads the atac matches in the file given by -f, scores them with
+// score_all_hits(), walks the back pointers to assign clump numbers,
+// and writes the header, one 'c' record per clump (spanning the
+// min/max extent of its hits on both sequences) and every match with
+// its parent set to its clump (or "." if it is in no clump) to stdout.
+// Progress messages go to stderr.
+//
+//   -c x   penalty for starting a clump
+//   -j x   maximum jump between consistent hits in a clump
+//   -1/-2  which sequence is the reference
+//   -S     trust the input order and skip the sort by query IID and
+//          position
+//
+// Returns 0 on success; exits with status 1 on a usage error.
+//
+int
+main(int argc, char **argv) {
+  int32   clumpcost = 50000;
+  int32   maxjump   = 200000;
+  bool    seq1IsRef = false;
+  char   *filename  = 0L;
+  bool    isSorted  = false;
+
+  int arg=1;
+  while (arg < argc) {
+
+    // -c, -j and -f consume the following argument.  Refuse to read
+    // past the end of argv when that argument is missing.
+    bool needsValue = ((strcmp(argv[arg], "-c") == 0) ||
+                       (strcmp(argv[arg], "-j") == 0) ||
+                       (strcmp(argv[arg], "-f") == 0));
+
+    if (needsValue && (arg + 1 >= argc)) {
+      fprintf(stderr, "Option '%s' requires a value\n", argv[arg]);
+      exit(1);
+    }
+
+    if        (strcmp(argv[arg], "-c") == 0) {
+      clumpcost = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-j") == 0) {
+      maxjump = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-1") == 0) {
+      seq1IsRef = true;
+    } else if (strcmp(argv[arg], "-2") == 0) {
+      seq1IsRef = false;
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      filename = argv[++arg];
+    } else if (strcmp(argv[arg], "-S") == 0) {
+      isSorted = true;
+    } else {
+      fprintf(stderr, "Unknown argument '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if (filename == 0L) {
+    fprintf(stderr, "usage: %s [] -f filename\n", argv[0]);
+    fprintf(stderr, " -c x penalty for clump start, default 50000\n");
+    fprintf(stderr, " -j x max jump between consistent hits in a clump, default 200000\n");
+    fprintf(stderr, " -1 the reference assembly is the first one.\n");
+    fprintf(stderr, " -2 the reference assembly is the second one (default).\n");
+    fprintf(stderr, " -S assume the input is already sorted by the query IID, position.\n");
+    fprintf(stderr, " this will also make the output sorted by queryIID, queryPosition\n");
+    exit(1);
+  }
+
+
+  fprintf(stderr, "1 load the matches\n");
+
+  atacFile   *AF      = new atacFile(filename);
+  uint32      hitsLen = AF->matches()->numberOfMatches();
+  tClumpHit  *hits    = new tClumpHit [hitsLen];
+
+  for (uint32 i=0; i<hitsLen; i++)
+    hits[i].set(AF->matches()->getMatch(i), seq1IsRef);
+
+
+  // Honor -S: the flag used to be parsed and then ignored, so the
+  // input order the caller asked us to keep was always destroyed.
+  //
+  if (isSorted == false) {
+    fprintf(stderr, "2 sort the matches\n");
+    qsort(hits, hitsLen, sizeof(tClumpHit), clumpHitCompareQry);
+  } else {
+    fprintf(stderr, "2 matches assumed sorted, not sorting\n");
+  }
+
+
+  fprintf(stderr, "3 score the matches\n");
+  int32 bestEnd = score_all_hits(hits,
+                                 clumpcost,
+                                 maxjump,
+                                 hitsLen);
+
+  // Mark the clumps.  Follow the back pointers from the best final
+  // hit; each time a hit was the start of its clump, move on to the
+  // next clump number.
+  //
+  fprintf(stderr, "4 mark clumps\n");
+  uint32 clump = 0;
+
+  while (bestEnd >= 0) {
+    hits[bestEnd].clump = clump;
+
+    if (hits[bestEnd].scoreExtend > hits[bestEnd].scoreStart) {
+      bestEnd = hits[bestEnd].bestExtend;
+    } else {
+      bestEnd = hits[bestEnd].bestStart;
+      clump++;
+    }
+  }
+
+  // Sort the hits by iid, then merge into the output
+  //
+  fprintf(stderr, "5 sort the matches\n");
+  qsort(hits, hitsLen, sizeof(tClumpHit), clumpHitCompareIID);
+
+
+  // For each clump, find the min/max extent in both sequences.  We
+  // use this to output the clump match record.
+  //
+  int32 *clumpLoRef = new int32 [clump];
+  int32 *clumpHiRef = new int32 [clump];
+  int32 *clumpLoQry = new int32 [clump];
+  int32 *clumpHiQry = new int32 [clump];
+  bool  *clumpOut   = new bool  [clump];
+
+  for (uint32 xx=0; xx<clump; xx++) {
+    clumpLoRef[xx] = 1000000000;
+    clumpHiRef[xx] = 0;
+    clumpLoQry[xx] = 1000000000;
+    clumpHiQry[xx] = 0;
+    clumpOut[xx]   = false;
+  }
+
+  for (uint32 xx=0; xx<hitsLen; xx++) {
+    int32 cc = hits[xx].clump;
+    if (cc >= 0) {
+      if (hits[xx].refBeg < clumpLoRef[cc])  clumpLoRef[cc] = hits[xx].refBeg;
+      if (hits[xx].refEnd > clumpHiRef[cc])  clumpHiRef[cc] = hits[xx].refEnd;
+
+      if (hits[xx].qryBeg < clumpLoQry[cc])  clumpLoQry[cc] = hits[xx].qryBeg;
+      if (hits[xx].qryEnd > clumpHiQry[cc])  clumpHiQry[cc] = hits[xx].qryEnd;
+    }
+  }
+
+  // Dump the clumps
+  //
+
+  fprintf(stderr, "6 output matches with clumps\n");
+
+  AF->writeHeader(stdout);
+
+  for (uint32 mm=0; mm<hitsLen; mm++) {
+    int32 cc = hits[mm].clump;
+
+    // The first time a clump is seen, emit its clump record ahead of
+    // the match.
+    if ((cc >= 0) &&
+        (clumpOut[cc] == false)) {
+      atacMatch C;
+      sprintf(C.matchuid, "clump" int32FMTW(06), cc);
+      sprintf(C.parentuid, ".");
+      C.matchiid = 0;
+      C.type[0] = 'c';
+      C.type[1] = 0;
+
+      C.iid1 = hits[mm].match.iid1;
+      C.iid2 = hits[mm].match.iid2;
+
+      // Set the position and length based on the correct reference
+      // -- in particular, since we get the IID and orientation from
+      // the copy of the match we don't need to listen to the
+      // seq1IsRef flag for those.
+
+      if (seq1IsRef) {
+        C.pos1 = clumpLoRef[cc];
+        C.len1 = clumpHiRef[cc] - clumpLoRef[cc];
+        C.pos2 = clumpLoQry[cc];
+        C.len2 = clumpHiQry[cc] - clumpLoQry[cc];
+      } else {
+        C.pos1 = clumpLoQry[cc];
+        C.len1 = clumpHiQry[cc] - clumpLoQry[cc];
+        C.pos2 = clumpLoRef[cc];
+        C.len2 = clumpHiRef[cc] - clumpLoRef[cc];
+      }
+
+      C.fwd1 = hits[mm].match.fwd1;
+      C.fwd2 = hits[mm].match.fwd2;
+
+      C.print(stdout, AF->labelA(), AF->labelB());
+
+      clumpOut[cc] = true;
+    }
+
+
+    if (cc >= 0)
+      sprintf(hits[mm].match.parentuid, "clump" int32FMTW(06), cc);
+    else
+      sprintf(hits[mm].match.parentuid, ".");
+
+    hits[mm].match.print(stdout, AF->labelA(), AF->labelB());
+  }
+
+  delete [] clumpLoRef;
+  delete [] clumpHiRef;
+  delete [] clumpLoQry;
+  delete [] clumpHiQry;
+  delete [] clumpOut;
+
+  return(0);
+}
diff --git a/atac-driver/config.py b/atac-driver/config.py
new file mode 100644
index 0000000..9f9bea4
--- /dev/null
+++ b/atac-driver/config.py
@@ -0,0 +1,11 @@
+#!/usr/local/packages/python-2.7.3/bin/python2.7
+
+import sys
+import os
+import getopt
+from distutils import sysconfig
+
+print sysconfig.get_python_inc()
+
+# flags = ['-I' + ,
+# '-I' + sysconfig.get_python_inc(plat_specific=True)]
diff --git a/atac-driver/gapShifter/Make.include b/atac-driver/gapShifter/Make.include
new file mode 100644
index 0000000..86e8788
--- /dev/null
+++ b/atac-driver/gapShifter/Make.include
@@ -0,0 +1,30 @@
+# -*- makefile -*-
+
+# Build fragment for the gapShifter tools: eight C++ executables, each
+# built from the single source file of the same name and linked against
+# the libatac, libseq, libbio and libutil static libraries.
+#
+# NOTE(review): '$/' looks like the path prefix of this directory,
+# supplied by whichever makefile includes this fragment -- confirm.
+
+# Absolute locations of the libraries these programs depend on.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# Sources and the executables built from them, in matching order.
+$/.CXX_SRCS := $/gapShifter.C $/extractSequence.C $/extractUnmapped.C $/coalesceMatches.C $/correctGaps.C $/testAtac.C $/cleanAtac.C $/projectFeatures.C
+$/.CXX_EXES := $/gapShifter $/extractSequence $/extractUnmapped $/coalesceMatches $/correctGaps $/testAtac $/cleanAtac $/projectFeatures
+
+# Files removed when cleaning this directory.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# Link dependencies, one rule per executable.
+$/gapShifter: $/gapShifter.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/extractUnmapped: $/extractUnmapped.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/extractSequence: $/extractSequence.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/coalesceMatches: $/coalesceMatches.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/correctGaps: $/correctGaps.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/testAtac: $/testAtac.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/cleanAtac: $/cleanAtac.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/projectFeatures: $/projectFeatures.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for the object and
+# dependency files of this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/gapShifter/alignUnmapped.C b/atac-driver/gapShifter/alignUnmapped.C
new file mode 100644
index 0000000..dfd47dd
--- /dev/null
+++ b/atac-driver/gapShifter/alignUnmapped.C
@@ -0,0 +1,176 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Attempts to align unmapped regions.
+//
+// For each unmapped region, we extract the corresponding sequences,
+// sim4db them together, parse the output to make atac-format
+// matches, but of a lower confidence.
+//
+// IDX1 ------------------------------------------
+// |||||||| ||||||||||
+// IDX2 -A------------\ /--------------B-
+// \ /
+//
+// The nasty case is that IDX1 could be doubly mapped, once by A and
+// once by B. So we also need to label those regions that are mapped
+// multiple times as an even lower confidence.
+//
+// We probably should bias the alignment towards the anchored edge,
+// implying I should use something other than sim4db here.
+//
+// We end up with three confidence classes:
+// 1) mapped by atac itself, 1-to-1 matches
+// 2) mapped by sim4db above, with no conflict, between anchors
+// 3) same as 2, but conflicting
+//
+// Why sim4db? Its splicing model might introduce some noise on the
+// ends (which we'll clean up), but more importantly, the splicing
+// allows us to skip over large blocks of whatever (rearrangement,
+// tandem repeat, etc). And it's also in my source tree and I know
+// how to use it.
+
+
+this is unfinished crap
+
+
+
+// The below is the main from writing unmatched regions
+
+
+
+
+// Unfinished driver (this file carries a deliberate non-compiling
+// marker line ahead of this function).  As written it parses -a, -b
+// and -m, loads the matches file twice and sorts one copy by each
+// axis, then -- in code carried over from the unmatched-region
+// extractor -- writes the sequence lying between consecutive matches
+// on the same sequence to the -a and -b output files.
+//
+// NOTE(review): the two extraction loops use ML, W and S, none of
+// which is declared in this function; ML1, ML2, W1 and W2 are set up
+// but never used.
+// NOTE(review): the usage text says the matches come from stdin
+// ("< matches") while the code requires -m.
+//
+int
+main(int argc, char *argv[]) {
+ FILE *Aoutput = 0L;
+ FILE *Boutput = 0L;
+ char *matchesFile = 0L;
+
+ int arg=1;
+ while (arg < argc) {
+ // NOTE(review): fopen() success is judged from errno rather than
+ // from the returned pointer, and argv[++arg] is not checked against
+ // argc.
+ if (strcmp(argv[arg], "-a") == 0) {
+ errno = 0;
+ Aoutput = fopen(argv[++arg], "w");
+ if (errno)
+ fprintf(stderr, "Failed to open '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+ } else if (strcmp(argv[arg], "-b") == 0) {
+ errno = 0;
+ Boutput = fopen(argv[++arg], "w");
+ if (errno)
+ fprintf(stderr, "Failed to open '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+ } else if (strcmp(argv[arg], "-m") == 0) {
+ matchesFile = argv[++arg];
+ } else {
+ fprintf(stderr, "usage: %s -a Aunmatched.fasta -b B.unmatched.fasta < matches\n", argv[0]);
+ exit(1);
+ }
+ arg++;
+ }
+
+ // All three options are required.
+ if ((Aoutput == 0L) || (Boutput == 0L) || (matchesFile == 0L)) {
+ fprintf(stderr, "usage: %s -a Aunmatched.fasta -b B.unmatched.fasta < matches\n", argv[0]);
+ exit(1);
+ }
+
+ atacMatchList ML1(matchesFile, 'm', false);
+ atacMatchList ML2(matchesFile, 'm', false);
+
+ ML1.sort1(); // Sorted by first index
+ ML2.sort2(); // Sorted by second index
+
+ seqCache *W1 = ML1._seq1;
+ seqCache *W2 = ML1._seq2;
+
+
+ // For every match,
+ // find the match before and the match after, on both axes
+ //
+
+
+
+
+ // Extract unmapped in sequence 1
+
+ ML.sort1();
+ W = ML._seq1;
+ W->find(ML[0]->iid1);
+ S = W->getSequenceInCore();
+ for (uint32 i=1; i<ML.numMatches(); i++) {
+ atacMatch *l = ML[i-1];
+ atacMatch *r = ML[i];
+
+ // Only gaps between two matches on the same sequence are written.
+ if (l->iid1 != r->iid1)
+ continue;
+
+ // Load the sequence for this pair if it is not the one in hand.
+ if (l->iid1 != S->getIID()) {
+ delete S;
+ W->find(l->iid1);
+ S = W->getSequenceInCore();
+ }
+
+ // Extract from (l->pos1 + l->len1) to (r->pos1), if it's longer than 20bp
+ //
+ if (l->pos1 + l->len1 + 20 < r->pos1)
+ writeGaplessSequence(Aoutput,
+ S,
+ l->pos1 + l->len1,
+ r->pos1);
+ }
+
+ // Extract unmapped in sequence 2
+
+ ML.sort2();
+ W = ML._seq2;
+ W->find(ML[0]->iid2);
+ S = W->getSequenceInCore();
+ for (uint32 i=1; i<ML.numMatches(); i++) {
+ atacMatch *l = ML[i-1];
+ atacMatch *r = ML[i];
+
+ // Only gaps between two matches on the same sequence are written.
+ if (l->iid2 != r->iid2)
+ continue;
+
+ // Load the sequence for this pair if it is not the one in hand.
+ if (l->iid2 != S->getIID()) {
+ delete S;
+ W->find(l->iid2);
+ S = W->getSequenceInCore();
+ }
+
+ // Extract from (l->pos2 + l->len2) to (r->pos2), if it's longer than 20bp
+ //
+ if (l->pos2 + l->len2 + 20 < r->pos2)
+ writeGaplessSequence(Boutput,
+ S,
+ l->pos2 + l->len2,
+ r->pos2);
+ }
+
+
+ fclose(Aoutput);
+ fclose(Boutput);
+
+ return(0);
+}
diff --git a/atac-driver/gapShifter/cleanAtac.C b/atac-driver/gapShifter/cleanAtac.C
new file mode 100644
index 0000000..91b5a6c
--- /dev/null
+++ b/atac-driver/gapShifter/cleanAtac.C
@@ -0,0 +1,168 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+// Reads a set of atac matches, trims off ends that are mismatch.
+// Computes the percent identity of the resulting match.
+// Outputs the trimmed match if it is above some percent identity.
+
+// Print the command line summary to stderr.  'name' is the program
+// name to show, normally argv[0].
+//
+// The options listed are the ones main() parses: -d and -l are
+// optional, -m is required.
+//
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s [-d identity] [-l length] -m matches\n", name);
+  fprintf(stderr, " -d discard the match if it is below this percent identity\n");
+  fprintf(stderr, " -l discard the match if it is not longer than this length\n");
+}
+
+// cleanAtac driver.
+//
+// For every match in the file given by -m: trim mismatching bases off
+// both ends, count the identical bases in what is left, and print the
+// trimmed match to stdout if its identity is above the -d threshold
+// and its length is above the -l threshold.  Rejected matches are
+// reported on stderr, with both sequences shown when the match is
+// shorter than 200 bases.  Matches trimmed to nothing are dropped
+// silently.
+//
+// A -d value above 1.0 is taken as a percentage and divided by 100.
+//
+// Returns 0 on success; exits with status 1 on a usage error.
+//
+int
+main(int argc, char *argv[]) {
+  char    *matchesFile      = 0L;
+  double   discardThreshold = 0.0;
+  uint32   discardLength    = 0;
+
+  int arg=1;
+  while (arg < argc) {
+
+    // Every option takes a value.  Without this check a trailing -d
+    // or -l handed argv[argc], a null pointer, to atof() / atoi().
+    bool hasValue = (arg + 1 < argc);
+
+    if        ((strcmp(argv[arg], "-m") == 0) && hasValue) {
+      matchesFile = argv[++arg];
+    } else if ((strcmp(argv[arg], "-d") == 0) && hasValue) {
+      discardThreshold = atof(argv[++arg]);
+      if (discardThreshold > 1.0)
+        discardThreshold /= 100;
+    } else if ((strcmp(argv[arg], "-l") == 0) && hasValue) {
+      discardLength = atoi(argv[++arg]);
+    } else {
+      usage(argv[0]);
+      exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+  seqCache        Acache(AF.assemblyFileA(), 32, false);
+  seqCache        Bcache(AF.assemblyFileB(), 32, false);
+
+  for (uint32 i=0; i<ML.numMatches(); i++) {
+    atacMatch *m = ML.getMatch(i);
+
+    uint32 identities = 0;
+
+    // Trim the match
+    //
+    if (m->fwd2) {
+      char   *a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+      char   *b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+      uint32  p = 0;
+
+      // Leading mismatches: both sequences advance together.
+      while ((m->len1 > 0) && (toUpper[(int)a[p]] != toUpper[(int)b[p]])) {
+        m->pos1++;
+        m->pos2++;
+        m->len1--;
+        m->len2--;
+        p++;
+      }
+
+      // Trailing mismatches, measured from the trimmed start.  When
+      // len1 is zero p wraps, but the len1 test short-circuits before
+      // p is used.
+      a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+      b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+      p = m->len1-1;
+      while ((m->len1 > 0) && (toUpper[(int)a[p]] != toUpper[(int)b[p]])) {
+        m->len1--;
+        m->len2--;
+        p--;
+      }
+
+    } else {
+      char   *a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+      char   *b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+      uint32  p = 0;
+      uint32  q = m->len2 - 1;
+
+      // Reverse match: the start of A pairs with the end of B, so
+      // trimming A's start shortens B without moving pos2.
+      while ((m->len1 > 0) && (toUpper[(int)a[p]] != complementSymbol[toUpper[(int)b[q]]])) {
+        m->pos1++;
+        m->len1--;
+        m->len2--;
+        p++;
+        q--;
+      }
+
+      // ...and the end of A pairs with the start of B, so trimming
+      // A's end moves pos2 forward.
+      a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+      b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+      p = m->len1 - 1;
+      q = 0;
+      while ((m->len1 > 0) && (toUpper[(int)a[p]] != complementSymbol[toUpper[(int)b[q]]])) {
+        m->len1--;
+        m->pos2++;
+        m->len2--;
+        p--;
+        q++;
+      }
+    }
+
+    if (m->len1 > 0) {
+      char *a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+      char *b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+
+      // Count identical bases over the trimmed match.
+      // NOTE(review): the reverse case below applies toUpper after
+      // complementSymbol, the trimming above applies it before --
+      // confirm the two orders agree for every symbol.
+      if (m->fwd2) {
+        for (uint32 p=0; p<m->len1; p++) {
+          if (toUpper[(int)a[p]] == toUpper[(int)b[p]])
+            identities++;
+        }
+      } else {
+        for (uint32 p=0, q=m->len2-1; p<m->len1; p++, q--) {
+          if (toUpper[(int)a[p]] == toUpper[complementSymbol[(int)b[q]]])
+            identities++;
+        }
+      }
+
+      double myIdentity = (double)identities / m->len1;
+
+      if ((myIdentity > discardThreshold) && (m->len1 > discardLength)) {
+        m->print(stdout, AF.labelA(), AF.labelB());
+      } else {
+        fprintf(stderr, "match " uint32FMT " is only %6.2f%% identity and " uint32FMT " long: ",
+                i, 100.0 * identities / m->len1, m->len1);
+        m->print(stderr, AF.labelA(), AF.labelB());
+        if (m->len1 < 200) {
+          char tmp[1000];
+
+          strncpy(tmp, a, m->len1);
+          tmp[m->len1] = 0;
+          fprintf(stderr, " %s\n", tmp);
+
+          strncpy(tmp, b, m->len1);
+          tmp[m->len1] = 0;
+          fprintf(stderr, " %s\n", tmp);
+        }
+      }
+    }
+  }
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/coalesceMatches.C b/atac-driver/gapShifter/coalesceMatches.C
new file mode 100644
index 0000000..ad3b0f7
--- /dev/null
+++ b/atac-driver/gapShifter/coalesceMatches.C
@@ -0,0 +1,71 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+// Reads a set of matches, coalesces those on the same diagonal.
+// Does not preserve runs.
+//
+// No args, reads stdin, writes stdout.
+
+// coalesceMatches driver.  Takes no arguments: reads an atac file
+// from stdin, sorts the matches by diagonal, joins each run of
+// abutting matches into one, and writes the resulting matches to
+// stdout.  Every merge is logged to stderr (both inputs, then the
+// joined match).
+//
+// Two matches are joined when they are between the same sequences, in
+// the same orientations, of the same type and parent, and the second
+// begins exactly where the first ends on both sequences.
+//
+// 'l' is the match currently being grown.  It is written only once a
+// match that cannot be joined to it is seen, or at end of input, so
+// that a merged match is emitted exactly once, the match absorbed into
+// it is not emitted again, and the final match is not lost.
+//
+int
+main(int argc, char *argv[]) {
+  atacFile        AF("-");
+  atacMatchOrder  MO(*AF.matches());
+  atacMatch      *l = 0L;
+
+  MO.sortDiagonal();
+
+  if (MO.numMatches() > 0)
+    l = MO[0];
+
+  for (uint32 i=1; i<MO.numMatches(); i++) {
+    atacMatch *r = MO[i];
+
+    if ((l->iid1 == r->iid1) &&
+        (l->iid2 == r->iid2) &&
+        (l->fwd1 == r->fwd1) &&
+        (l->fwd2 == r->fwd2) &&
+        (l->pos1 + l->len1 == r->pos1) &&
+        (l->pos2 + l->len2 == r->pos2) &&
+        (strcmp(l->type, r->type) == 0) &&
+        (strcmp(l->parentuid, r->parentuid) == 0)) {
+
+      fprintf(stderr, "MERGE:\n");
+      l->print(stderr, AF.labelA(), AF.labelB());
+      r->print(stderr, AF.labelA(), AF.labelB());
+
+      // Absorb r into l, and keep l as the candidate for the next
+      // comparison.
+      l->len1 += r->len1;
+      l->len2 += r->len2;
+
+      l->print(stderr, AF.labelA(), AF.labelB());
+    } else {
+      // l cannot grow any further; emit it and start again from r.
+      l->print(stdout, AF.labelA(), AF.labelB());
+      l = r;
+    }
+  }
+
+  // Flush the match still being grown.
+  if (l)
+    l->print(stdout, AF.labelA(), AF.labelB());
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/correctGaps.C b/atac-driver/gapShifter/correctGaps.C
new file mode 100644
index 0000000..92a8224
--- /dev/null
+++ b/atac-driver/gapShifter/correctGaps.C
@@ -0,0 +1,301 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Write the command line summary for program 'name' to stderr.
+//
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s [] -m matches -l log\n", name);
+  fputs(" When it works, fill this in...\n", stderr);
+}
+
+
+// correctGaps driver.
+//
+// Reads the atac file given by -m, orders the matches along the A
+// axis, and repeatedly (at most ten passes, stopping early when a pass
+// closes nothing) merges neighbouring matches separated by an
+// equal-sized gap on both sequences, when the gap passes the N-content
+// and identity tests below and both matches belong to the same run.
+// Decisions are logged to the file given by -l.  The header, the
+// resulting matches and the original runs are written to stdout.
+//
+// Returns 0 on success; exits with status 1 on a usage error or if the
+// log file cannot be opened.
+//
+int
+main(int argc, char *argv[]) {
+  char *matchesFile = 0L;
+  FILE *logFile     = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+
+    // Both options take a value; never read past the end of argv.
+    bool hasValue = (arg + 1 < argc);
+
+    if        ((strcmp(argv[arg], "-m") == 0) && hasValue) {
+      matchesFile = argv[++arg];
+    } else if ((strcmp(argv[arg], "-l") == 0) && hasValue) {
+      // Judge success by the returned stream, not by errno, and pass
+      // the file name and the error text in the order the format
+      // expects them.
+      logFile = fopen(argv[++arg], "w");
+      if (logFile == 0L)
+        fprintf(stderr, "Failed to open logfile '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else {
+      usage(argv[0]);
+      exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+  if (logFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+  atacMatchOrder  MO(ML);
+
+  // Sort by either axis.
+  MO.sortA();
+
+  // We need to compute the identity of the gap; our metric (thanks to Nelson) is
+  // if ("long" and "not low identity") or ("short"), close the gap
+
+  // We could use the seqCache, but with only a handful of gaps, we
+  // just let the OS cache stuff.
+
+  seqCache *C1 = new seqCache(AF.assemblyFileA(), 2, false);
+  seqCache *C2 = new seqCache(AF.assemblyFileB(), 1024, false);
+
+  seqInCore *S1 = 0L;
+  seqInCore *S2 = 0L;
+
+  for (uint32 iter=0; iter<10; iter++) {
+    uint32 gapsize = 1000;
+    uint32 fgaps = 0;
+    uint32 rgaps = 0;
+
+    int mergeuid = 1;
+
+    for (uint32 i=1; i<MO.numMatches(); i++) {
+      atacMatch *l = MO[i-1];
+      atacMatch *r = MO[i];
+
+      bool joinMatches = false;
+
+      uint32 gap1 = 0;
+      uint32 gap2 = 0;
+
+      if ((l->iid1 == r->iid1) && // Matches are between the same sequences
+          (l->iid2 == r->iid2) &&
+          (l->fwd2 == r->fwd2)) { // Matches are the same orientation
+
+
+        if (l->fwd2 == true) {
+          if ((l->pos1 + l->len1 <= r->pos1) && // Matches are ordered correctly (should be, from the sort)
+              (l->pos2 + l->len2 <= r->pos2)) {
+
+            gap1 = r->pos1 - (l->pos1 + l->len1);
+            gap2 = r->pos2 - (l->pos2 + l->len2);
+
+            if ((gap1 == gap2) &&
+                (gap1 <= gapsize)) {
+              S1 = C1->getSequenceInCore(l->iid1);
+              S2 = C2->getSequenceInCore(l->iid2);
+
+              // The gap starts where the left match ends on each
+              // sequence; on sequence 2 that is pos2 + len2, which is
+              // what gap2 above is measured from.
+              char *s1 = S1->sequence() + l->pos1 + l->len1;
+              char *s2 = S2->sequence() + l->pos2 + l->len2;
+
+              uint32 identities = 0;
+              uint32 n1 = 0;
+              uint32 n2 = 0;
+              for (uint32 p=0; p<gap1; p++) {
+                if (toUpper[(int)s1[p]] == toUpper[(int)s2[p]])
+                  identities++;
+                if (toUpper[(int)s1[p]] == 'N')
+                  n1++;
+                if (toUpper[(int)s2[p]] == 'N')
+                  n2++;
+              }
+
+              // NOTE(review): the two identity tests below use '<', so
+              // they accept gaps whose identity is BELOW 80% / 90%,
+              // which reads as the opposite of the adjacent comments.
+              // Left unchanged -- confirm the intended direction.
+              if ((100*n1 < 20*gap1) && // gap is not N and
+                  (100*n2 < 20*gap1) && // gap is not N and
+                  ((100*identities < 80*gap1) || // (gap is high identity
+                   ((gap1 < 11) && (2*gap1 <= l->len1) && (2*gap1 <= r->len1)) || // (gap is short, and the flanks are big
+                   ((gap1 < 11) && (100*identities < 90*gap1)))) { // (gap is short and high quality
+
+                // ALSO need to check that the gap is not actually
+                // mapped in sequence 2. Not really, just make sure
+                // these two matches are in the same run.
+                //
+                if (strcmp(l->parentuid, r->parentuid) != 0) {
+                  fprintf(logFile, "HEY! F gap of size " uint32FMT " not in a run?\n", gap1);
+                  l->print(logFile, AF.labelA(), AF.labelB());
+                  r->print(logFile, AF.labelA(), AF.labelB());
+                } else {
+                  fgaps++;
+
+                  joinMatches = true;
+
+                  //fprintf(logFile, "potential f gap of size L "uint32FMTW(4)" (n1="uint32FMTW(4)" n2="uint32FMTW(4)" ident="uint32FMTW(4)"/"uint32FMTW(4)")!\n",
+                  // gap1, n1, n2, identities, gap1);
+                  //l->print(logFile, AF.labelA(), AF.labelB());
+                  //r->print(logFile, AF.labelA(), AF.labelB());
+                }
+              }
+            }
+          }
+        } // was a forward match
+
+
+
+        if (l->fwd2 == false) {
+          if ((l->pos1 + l->len1 <= r->pos1) && // Matches are ordered correctly (should be, from the sort)
+              (r->pos2 + r->len2 <= l->pos2)) {
+
+            gap1 = r->pos1 - (l->pos1 + l->len1);
+            gap2 = l->pos2 - (r->pos2 + r->len2);
+
+            if ((gap1 == gap2) &&
+                (gap1 <= gapsize)) {
+
+              S1 = C1->getSequenceInCore(l->iid1);
+              S2 = C2->getSequenceInCore(l->iid2);
+
+              // On sequence 2 the gap lies after the RIGHT match, and
+              // is read backwards (q) and complemented.
+              char *s1 = S1->sequence() + l->pos1 + l->len1;
+              char *s2 = S2->sequence() + r->pos2 + r->len2;
+
+              uint32 identities = 0;
+              uint32 n1 = 0;
+              uint32 n2 = 0;
+              for (uint32 p=0, q=gap1-1; p<gap1; p++, q--) {
+                if (toUpper[(int)s1[p]] == toUpper[complementSymbol[(int)s2[q]]])
+                  identities++;
+                if (toUpper[(int)s1[p]] == 'N')
+                  n1++;
+                if (toUpper[(int)s2[q]] == 'N')
+                  n2++;
+              }
+
+              // Gap is short, flanks are big
+              // Gap is short, flanks are short and gap is good
+
+              // NOTE(review): same '<' identity tests as the forward
+              // case -- confirm the intended direction.
+              if ((100*n1 < 20*gap1) && // gap is not N and
+                  (100*n2 < 20*gap1) && // gap is not N and
+                  ((100*identities < 80*gap1) || // (gap is high identity
+                   ((gap1 < 11) && (2*gap1 <= l->len1) && (2*gap1 <= r->len1)) || // (gap is short, and the flanks are big
+                   ((gap1 < 11) && (100*identities < 90*gap1)))) { // (gap is short and high quality
+
+                // ALSO need to check that the gap is not actually
+                // mapped in sequence 2. Not really, just make sure
+                // these two matches are in the same run.
+                //
+                if (strcmp(l->parentuid, r->parentuid) != 0) {
+                  fprintf(logFile, "HEY! R gap of size " uint32FMT " not in a run?\n", gap1);
+                  l->print(logFile, AF.labelA(), AF.labelB());
+                  r->print(logFile, AF.labelA(), AF.labelB());
+                } else {
+                  rgaps++;
+
+                  joinMatches = true;
+
+                  //fprintf(logFile, "potential r gap of size L "uint32FMTW(4)" (n1="uint32FMTW(4)" n2="uint32FMTW(4)" ident="uint32FMTW(4)"/"uint32FMTW(4)")!\n",
+                  // gap1, n1, n2, identities, gap1);
+                  //l->print(logFile, AF.labelA(), AF.labelB());
+                  //r->print(logFile, AF.labelA(), AF.labelB());
+                }
+              }
+
+
+            }
+          }
+        }
+      }
+
+      if (joinMatches) {
+        fprintf(logFile, "CLOSE " uint32FMT "----------------------------------------\n", gap1);
+        l->print(logFile, AF.labelA(), AF.labelB());
+        r->print(logFile, AF.labelA(), AF.labelB());
+
+        MO.mergeMatches(l, r, mergeuid);
+
+        l->print(logFile, AF.labelA(), AF.labelB());
+
+        // Step back so the merged match is compared against whatever
+        // now follows it.
+        mergeuid++;
+        i--;
+      }
+    }
+
+    fprintf(logFile, "At gapSize=" uint32FMT " closed " uint32FMT " f-gaps and " uint32FMT " r-gaps.\n", gapsize, fgaps, rgaps);
+
+    // Nothing closed in this pass: force the outer loop to end.
+    if (fgaps + rgaps == 0)
+      iter = 10;
+  }
+
+
+#if 0
+  // This analyzes an atac mapping, looking for a signature that indicates a bad
+  // alignment. If we have an alignment of:
+  // XXXXXXaC-YYYYYY
+  // XXXXXX-CtYYYYYY
+  // this will generate three matches, instead of one match with mismatches in it.
+  // We scan the FORWARD matches for this pattern, and report any we find.
+  //
+  // We only found 3 on huref4 vs b35. Further development here was stopped.
+
+  for (uint32 i=2; i<ML.numMatches(); i++) {
+    atacMatch *l = ML[i-2];
+    atacMatch *m = ML[i-1];
+    atacMatch *r = ML[i];
+
+    if (m->len1 < 3) { // The match in the middle is small
+
+      if ((l->iid1 == r->iid1) && // Matches are between the same sequences
+          (l->iid2 == r->iid2) &&
+          (l->fwd2 == r->fwd2)) { // Matches are the same orientation
+
+        if (l->fwd2 == true) {
+          if ((l->pos1 + l->len1 <= r->pos1) && // Matches are ordered correctly (should be, from the sort)
+              (l->pos2 + l->len2 <= r->pos2)) {
+
+            uint32 gapl1 = m->pos1 - (l->pos1 + l->len1);
+            uint32 gapl2 = m->pos2 - (l->pos2 + l->len2);
+            uint32 gapr1 = r->pos1 - (m->pos1 + m->len1);
+            uint32 gapr2 = r->pos2 - (m->pos2 + m->len2);
+
+            if ((gapl1 + gapr1 == gapl2 + gapr2) && (gapl1 + gapr1 < 5)) {
+              fprintf(logFile, "potential f fix of size L "uint32FMT" "uint32FMT" and R "uint32FMT" "uint32FMT"!\n",
+                      gapl1, gapl2, gapr1, gapr2);
+              l->print(logFile, "A", "B");
+              m->print(logFile, "A", "B");
+              r->print(logFile, "A", "B");
+            }
+          } else {
+            fprintf(logFile, "sort is forward broken.\n");
+          }
+        } // was a forward match
+      }
+    }
+  }
+#endif
+
+
+  // Write the new output to stdout -- we preserve runs here, but
+  // discard everything else.
+  //
+  AF.writeHeader(stdout);
+
+  for (uint32 i=0; i<MO.numMatches(); i++)
+    MO[i]->print(stdout, AF.labelA(), AF.labelB());
+
+  for (uint32 i=0; i<AF.runs()->numberOfMatches(); i++)
+    AF.runs()->getMatch(i)->print(stdout, AF.labelA(), AF.labelB());
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/extractSequence.C b/atac-driver/gapShifter/extractSequence.C
new file mode 100644
index 0000000..a6858fe
--- /dev/null
+++ b/atac-driver/gapShifter/extractSequence.C
@@ -0,0 +1,135 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atac.H"
+#include "bio++.H"
+#include "seqCache.H"
+
+// Reads a set of matches and outputs sequence that was mapped. Filters matches, etc.
+
+void
+extractA(seqCache *A, seqCache *B,
+ FILE *Aoutput, FILE *Boutput,
+ uint32 Aiid,
+ uint32 Biid,
+ atacMatchList &ML) {
+ // Empty stub: no statements.  main() in this file does its extraction inline and does not call this.
+}
+
+
+// Print the command-line options that main() in this file actually parses.
+// The previous text listed -ar, -br and -t, which main() rejects, and left
+// out -1 and -2 (it appears to have been copied from extractUnmapped).
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s [-a A.fasta] [-b B.fasta] [-1 iid] [-2 iid] -m matches\n", name);
+  fprintf(stderr, "  -m  read matches from this file (required)\n");
+  fprintf(stderr, "  -a  write the A sequence of each selected match to this file\n");
+  fprintf(stderr, "  -b  write the B sequence of each selected match to this file\n");
+  fprintf(stderr, "  -1  select only matches whose A sequence has this iid\n");
+  fprintf(stderr, "  -2  select only matches whose B sequence has this iid\n");
+}
+
+// Open 'name' for writing and return the stream; on failure print the reason and exit(1).
+FILE *
+openOutputFile(char *name) {
+  FILE *R = fopen(name, "w");
+  if (R == 0L)  // judge by the returned pointer: errno may be nonzero even after a successful call
+    fprintf(stderr, "Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
+  return(R);
+}
+
+// Write the A and/or B sequence covered by each selected match to the requested files.
+int
+main(int argc, char *argv[]) {
+  char    *matchesFile = 0L;
+  FILE    *Aoutput     = 0L;
+  FILE    *Boutput     = 0L;
+  uint32   Aiid        = ~uint32ZERO;  // all bits set: no -1 filter was given
+  uint32   Biid        = ~uint32ZERO;  // all bits set: no -2 filter was given
+
+  int arg=1;
+  while (arg < argc) {
+    if (arg + 1 >= argc)  // every option takes a value; without this, argv[argc] (null) was consumed
+      usage(argv[0]), exit(1);
+    if        (strcmp(argv[arg], "-m") == 0) {
+      matchesFile = argv[++arg];
+    } else if (strcmp(argv[arg], "-a") == 0) {
+      Aoutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-b") == 0) {
+      Boutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-1") == 0) {
+      Aiid = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-2") == 0) {
+      Biid = strtouint32(argv[++arg], 0L);
+    } else {
+      usage(argv[0]), exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+
+  seqCache *A = new seqCache(AF.assemblyFileA(), 0, true);
+  seqCache *B = new seqCache(AF.assemblyFileB(), 0, true);
+
+  A->loadAllSequences();
+  B->loadAllSequences();
+
+  for (uint32 x=0; x<ML.numMatches(); x++) {
+    atacMatch *m = ML[x];
+
+    if (((Aiid == ~uint32ZERO) || (Aiid == m->iid1)) &&
+        ((Biid == ~uint32ZERO) || (Biid == m->iid2))) {
+
+      if (Aoutput) {
+        seqInCore *S = A->getSequenceInCore(m->iid1);
+
+        fprintf(Aoutput, "%s extracted from iid "uint32FMT" pos "uint32FMT" "uint32FMT" match %s(%s)\n",
+                S->header(), S->getIID(),
+                m->pos1, m->pos1 + m->len1,
+                m->matchuid, m->parentuid);
+        fwrite(S->sequence() + m->pos1, sizeof(char), m->len1, Aoutput);
+        fprintf(Aoutput, "\n");
+      }
+
+      if (Boutput) {
+        seqInCore *S = B->getSequenceInCore(m->iid2);
+
+        fprintf(Boutput, "%s extracted from iid "uint32FMT" pos "uint32FMT" "uint32FMT" match %s(%s)\n",
+                S->header(), S->getIID(),
+                m->pos2, m->pos2 + m->len2,
+                m->matchuid, m->parentuid);
+        fwrite(S->sequence() + m->pos2, sizeof(char), m->len2, Boutput);
+        fprintf(Boutput, "\n");
+      }
+    }
+  }
+
+  if (Aoutput) fclose(Aoutput);
+  if (Boutput) fclose(Boutput);
+  return(0);
+}
diff --git a/atac-driver/gapShifter/extractUnmapped-sim4dbFixer.pl b/atac-driver/gapShifter/extractUnmapped-sim4dbFixer.pl
new file mode 100644
index 0000000..997cb1c
--- /dev/null
+++ b/atac-driver/gapShifter/extractUnmapped-sim4dbFixer.pl
@@ -0,0 +1,63 @@
+#!/usr/bin/perl
+
+# Fixes up the IID's and coords of the sim4db mapped regions from extractUnmapped.
+# Reads polishes from stdin, writes fixed polishes to stdout.
+
+use strict;
+use lib "/bioinfo/assembly/walenz/src/genomics/scripts";
+use sim4polish;
+$| = 1;
+
+while (!eof(STDIN)) {
+    my %p = &sim4polish::readPolish(*STDIN);
+
+    if ($p{'raw'}) {
+        my $estIID;
+        my $estBeg = 0;
+        my $estEnd = 0;
+
+        my $dbIID;
+        my $dbBeg = 0;
+        my $dbEnd = 0;
+
+        if ($p{'estDefLine'} =~ m/extracted\s+from\s+iid\s+(\d+)\s+pos\s+(\d+)\s+(\d+)\s+/) {
+            $estIID = $1;
+            $estBeg = $2;
+            $estEnd = $3;
+        }
+
+        if ($p{'dbDefLine'} =~ m/extracted\s+from\s+iid\s+(\d+)\s+pos\s+(\d+)\s+(\d+)\s+/) {
+            $dbIID = $1;
+            $dbBeg = $2;
+            $dbEnd = $3;
+        }
+
+        if (defined($estIID)) {
+            $p{'estID'} = $estIID;
+            foreach my $exon (@{$p{'exons'}}) {   # scalar element, not the one-key slice @p{'exons'}
+                $exon->{'cDNAstart'} += $estBeg;  # shift back into the coordinates of the source sequence
+                $exon->{'cDNAend'}   += $estBeg;
+            }
+        }
+        if (defined($dbIID)) {
+            $p{'dbID'} = $dbIID;
+            foreach my $exon (@{$p{'exons'}}) {
+                $exon->{'GENOMICstart'} += $dbBeg;
+                $exon->{'GENOMICend'}   += $dbBeg;
+            }
+        }
+
+        # normalize: fold dbLo into the exon coordinates, then clear dbLo/dbHi/estLen below
+        foreach my $exon (@{$p{'exons'}}) {
+            $exon->{'GENOMICstart'} += $p{'dbLo'};
+            $exon->{'GENOMICend'}   += $p{'dbLo'};
+        }
+
+        $p{'dbLo'}   = 0;
+        $p{'dbHi'}   = 0;
+        $p{'estLen'} = 0;
+
+        $p{'raw'} = &sim4polish::updatePolish(%p);
+        print $p{'raw'};
+    }
+}
diff --git a/atac-driver/gapShifter/extractUnmapped.C b/atac-driver/gapShifter/extractUnmapped.C
new file mode 100644
index 0000000..abb6fbf
--- /dev/null
+++ b/atac-driver/gapShifter/extractUnmapped.C
@@ -0,0 +1,600 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+
+//
+// Tested to work on ../../atac/B36LCvsHUREF6A/B36LCvsHUREF6A.gapsFixed.atac
+//
+
+
+
+// Reads a set of matches and outputs two sequence files containing sequence that
+// is not matched.
+
+
+void
+writeGaplessSequence(FILE *output, // stream receiving one header line plus one sequence line per block
+ seqInCore *S, // sequence the region lies in
+ uint32 beg, // first position of the region
+ uint32 end, // one past the last position of the region
+ uint32 extend, // maximum number of positions to grow the region on each side
+ atacMatch *l, // match named first in the header, or null ("none" is printed)
+ atacMatch *r) { // match named second in the header, or null ("none" is printed)
+ char *s = S->sequence();
+ // Writes the N-free blocks of S found in [beg,end): N's are trimmed from both ends, the
+ // region is then grown by up to 'extend' non-N positions per side, and each block is written.
+ // Skip any N's starting where we are currently
+ //
+ while ((beg < end) &&
+ (toUpper[(int)s[beg]] == 'N'))
+ beg++;
+
+ while ((beg < end) &&
+ (toUpper[(int)s[end-1]] == 'N'))
+ end--;
+
+ if (beg >= end) // nothing but N's (or an empty region): write nothing
+ return;
+
+ // Extend the ends up to 'extend' positions, as long as we don't
+ // hit a gap.
+ //
+ for (uint32 x=0; ((x < extend) &&
+ (beg > 0) &&
+ (toUpper[(int)s[beg-1]] != 'N')); x++)
+ beg--;
+
+ for (uint32 x=0; ((x < extend) &&
+ (end < S->sequenceLength()) &&
+ (toUpper[(int)s[end]] != 'N')); x++)
+ end++;
+
+ // Just make sure we're still in bounds!
+ if (end > S->sequenceLength())
+ end = S->sequenceLength();
+
+ // Over the whole sequence
+ //
+ while (beg < end) {
+
+ // Skip any N's starting where we are currently
+ //
+ while ((beg < end) &&
+ (toUpper[(int)s[beg]] == 'N'))
+ beg++;
+
+ // Move our current up to here
+ uint32 cur = beg;
+
+ // If we're at the end of the sequence, this block doesn't
+ // exist; it's solid N.
+ //
+ if (beg < end) {
+
+ // Move cur up to the next N
+ //
+ while ((cur < end) &&
+ (toUpper[(int)s[cur]] != 'N'))
+ cur++;
+
+ // And output whatever this block is
+ //
+ fprintf(output, "%s extracted from iid "uint32FMT" pos "uint32FMT" "uint32FMT" between match %s(%s) and %s(%s)\n",
+ S->header(), S->getIID(), beg, cur,
+ (l) ? l->matchuid : "none",
+ (l) ? l->parentuid : "none",
+ (r) ? r->matchuid : "none",
+ (r) ? r->parentuid : "none");
+
+ fwrite(S->sequence() + beg, sizeof(char), cur-beg, output); // the block is [beg,cur)
+ fprintf(output, "\n");
+
+ }
+
+ // Move to the next block.
+ beg = cur;
+ }
+}
+
+
+
+
+
+// COPIED from libatac/matchList.C, but need to dereference the atacMatch again.
+//
+// qsort() comparator for an array of atacMatch pointers.  Keys, in order:
+// iid1 ascending, pos1 ascending, len1 DEscending, then iid2 ascending,
+// pos2 ascending, len2 DEscending.  Each test below handles one key and
+// only falls through to the next when the two matches tie on it.
+// Returns -1, 0 or 1.
+static
+int
+sort1_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  if (lhs->iid1 != rhs->iid1)  return((lhs->iid1 < rhs->iid1) ? -1 : 1);
+  if (lhs->pos1 != rhs->pos1)  return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+  if (lhs->len1 != rhs->len1)  return((lhs->len1 > rhs->len1) ? -1 : 1);
+  if (lhs->iid2 != rhs->iid2)  return((lhs->iid2 < rhs->iid2) ? -1 : 1);
+  if (lhs->pos2 != rhs->pos2)  return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+  if (lhs->len2 != rhs->len2)  return((lhs->len2 > rhs->len2) ? -1 : 1);
+
+  return(0);
+}
+
+// qsort() comparator for an array of atacMatch pointers, B side first.
+// Keys, in order: iid2 ascending, pos2 ascending, len2 DEscending, then
+// iid1 ascending, pos1 ascending, len1 DEscending.  Each test below
+// handles one key and only falls through to the next when the two
+// matches tie on it.
+// Returns -1, 0 or 1.
+static
+int
+sort2_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  if (lhs->iid2 != rhs->iid2)  return((lhs->iid2 < rhs->iid2) ? -1 : 1);
+  if (lhs->pos2 != rhs->pos2)  return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+  if (lhs->len2 != rhs->len2)  return((lhs->len2 > rhs->len2) ? -1 : 1);
+  if (lhs->iid1 != rhs->iid1)  return((lhs->iid1 < rhs->iid1) ? -1 : 1);
+  if (lhs->pos1 != rhs->pos1)  return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+  if (lhs->len1 != rhs->len1)  return((lhs->len1 > rhs->len1) ? -1 : 1);
+
+  return(0);
+}
+
+
+
+// New method, uses an intervalList to find the unmapped regions for
+// each sequence.
+//
+class extractMatchList {
+  // A growable array of atacMatch pointers.  Only the pointer array is
+  // allocated and released here; the matches themselves are never deleted.
+public:
+  extractMatchList() : matches(0L), matchesLen(0), matchesMax(16) {
+    matches = new atacMatch * [matchesMax];
+  };
+  ~extractMatchList() {
+    delete [] matches;
+  };
+
+  // Unchecked access to the idx'th stored pointer.
+  atacMatch *operator[](uint32 idx) {
+    return(matches[idx]);
+  };
+
+  // Number of pointers stored so far.
+  uint32 len(void) {
+    return(matchesLen);
+  };
+
+  // Append m, doubling the capacity first when the array is full.
+  void add(atacMatch *m) {
+    if (matchesLen >= matchesMax) {
+      matchesMax *= 2;
+      atacMatch **grown = new atacMatch * [matchesMax];
+      memcpy(grown, matches, sizeof(atacMatch *) * matchesLen);
+      delete [] matches;
+      matches = grown;
+    }
+    matches[matchesLen] = m;
+    matchesLen++;
+  };
+
+  // Sort the stored pointers in place with the file's sort1_ / sort2_ comparators.
+  void sort1(void) { qsort(matches, matchesLen, sizeof(atacMatch*), sort1_); };
+  void sort2(void) { qsort(matches, matchesLen, sizeof(atacMatch*), sort2_); };
+
+private:
+  atacMatch  **matches;
+  uint32       matchesLen;
+  uint32       matchesMax;
+};
+
+
+
+
+// Write every region of A (to Aoutput) and of B (to Boutput) that lies before the
+// first match, between consecutive matches, or after the last match of a sequence.
+// Each region goes through writeGaplessSequence() with its neighbouring matches.
+// The trailing region is tested by where the last match ends and labelled with it.
+void
+extractUnmapped(seqCache *A, seqCache *B,
+                FILE *Aoutput, FILE *Boutput,
+                uint32 extend,
+                atacFile &AF,
+                atacMatchList &ML) {
+  uint32  numSeqsA = AF.fastaA()->getNumberOfSequences();
+  uint32  numSeqsB = AF.fastaB()->getNumberOfSequences();
+
+  extractMatchList  *coveredA = new extractMatchList [numSeqsA];
+  extractMatchList  *coveredB = new extractMatchList [numSeqsB];
+
+  //  Populate the intervals with the mapping
+  //
+  for (uint32 x=0; x<ML.numMatches(); x++) {
+    atacMatch *m = ML[x];
+
+    coveredA[m->iid1].add(m);
+    coveredB[m->iid2].add(m);
+  }
+
+  //  Sort the intervals, manually invert the interval -- remembering
+  //  what matches are where.
+  //
+  for (uint32 seq=0; seq<numSeqsA; seq++) {
+    coveredA[seq].sort1();
+
+    //  'cov' is this sequence's matches, in sort1_ order (by position on A).
+    extractMatchList  &cov    = coveredA[seq];
+    uint32             seqLen = AF.fastaA()->getSequenceLength(seq);
+    seqInCore         *S      = A->getSequenceInCore(seq);
+
+    if (cov.len() == 0) {
+      //  Hey!  This sequence has NO matches at all!
+      //
+      writeGaplessSequence(Aoutput,
+                           S,
+                           0,
+                           seqLen,
+                           extend,
+                           0L, 0L);
+    } else {
+      if (0 < cov[0]->pos1) {
+        writeGaplessSequence(Aoutput,
+                             S,
+                             0,
+                             cov[0]->pos1,
+                             extend,
+                             0L, cov[0]);
+      }
+
+      for (uint32 i=1; i<cov.len(); i++) {
+        if (cov[i-1]->pos1 + cov[i-1]->len1 < cov[i]->pos1) {
+          writeGaplessSequence(Aoutput,
+                               S,
+                               cov[i-1]->pos1 + cov[i-1]->len1,
+                               cov[i]->pos1,
+                               extend,
+                               cov[i-1], cov[i]);
+        }
+      }
+
+      uint32 last = cov.len()-1;
+      if (cov[last]->pos1 + cov[last]->len1 < seqLen) {  // was: if (last->pos1), which skipped a last match starting at 0
+        writeGaplessSequence(Aoutput,
+                             S,
+                             cov[last]->pos1 + cov[last]->len1,
+                             seqLen,
+                             extend,
+                             cov[last], 0L);             // was: cov[0], naming the wrong left neighbour
+      }
+    }
+  }
+
+
+  //  The same again for B (replace 1 with 2, A with B).
+  //
+  //  NOTE(review): both passes compare a match only with its predecessor in sort order,
+  //  so a match nested inside an earlier, longer one may expose sequence that is covered.
+  //
+  for (uint32 seq=0; seq<numSeqsB; seq++) {
+    coveredB[seq].sort2();
+
+    extractMatchList  &cov    = coveredB[seq];
+    uint32             seqLen = AF.fastaB()->getSequenceLength(seq);
+    seqInCore         *S      = B->getSequenceInCore(seq);
+
+    if (cov.len() == 0) {
+      //  Hey!  This sequence has NO matches at all!
+      //
+      writeGaplessSequence(Boutput,
+                           S,
+                           0,
+                           seqLen,
+                           extend,
+                           0L, 0L);
+    } else {
+      if (0 < cov[0]->pos2) {
+        writeGaplessSequence(Boutput,
+                             S,
+                             0,
+                             cov[0]->pos2,
+                             extend,
+                             0L, cov[0]);
+      }
+
+      for (uint32 i=1; i<cov.len(); i++) {
+        if (cov[i-1]->pos2 + cov[i-1]->len2 < cov[i]->pos2) {
+          writeGaplessSequence(Boutput,
+                               S,
+                               cov[i-1]->pos2 + cov[i-1]->len2,
+                               cov[i]->pos2,
+                               extend,
+                               cov[i-1], cov[i]);
+        }
+      }
+
+      uint32 last = cov.len()-1;
+      if (cov[last]->pos2 + cov[last]->len2 < seqLen) {
+        writeGaplessSequence(Boutput,
+                             S,
+                             cov[last]->pos2 + cov[last]->len2,
+                             seqLen,
+                             extend,
+                             cov[last], 0L);
+      }
+    }
+  }
+
+  //  Release the per-sequence lists (previously leaked).  Only the pointer arrays
+  //  are freed; the matches they point to are not deleted here.
+  delete [] coveredA;
+  delete [] coveredB;
+}
+
+
+void
+extractUnmappedRuns(seqCache *A, seqCache *B,
+ FILE *ARoutput, FILE *BRoutput,
+ uint32 extend,
+ atacMatchList &ML) {
+ seqInCore *S1 = 0L;
+ seqInCore *S2 = 0L;
+ // For each pair of neighbouring matches with the same parentuid, write the sequence between them.
+ atacMatchOrder MO(ML);
+ MO.sortA(); // presumably orders the matches along A -- confirm in atacMatchOrder
+
+ for (uint32 i=1; i<MO.numMatches(); i++) {
+ atacMatch *l = MO[i-1];
+ atacMatch *r = MO[i];
+
+ if (l->iid1 != r->iid1) // neighbours must be on the same A sequence ...
+ continue;
+ if (l->iid2 != r->iid2) // ... and on the same B sequence
+ continue;
+
+ // Extract from (l->pos1 + l->len1) to (r->pos1), if it's longer than 20bp
+
+ bool lengthOK = true;
+ if (l->pos1 + l->len1 + 20 >= r->pos1)
+ lengthOK = false;
+ if ((l->fwd2 == true) && (l->pos2 + l->len2 + 20 >= r->pos2)) // forward: the B gap runs from l to r
+ lengthOK = false;
+ if ((l->fwd2 == false) && (r->pos2 + r->len2 + 20 >= l->pos2)) // reverse: the B gap runs from r to l
+ lengthOK = false;
+
+ // Extract if our two matches are in the same run.
+ //
+ if ((lengthOK) &&
+ (strcmp(l->parentuid, r->parentuid) == 0)) {
+
+#if 0
+ if (l->iid1 != S1->getIID()) {
+ delete S1;
+ W1->find(l->iid1);
+ S1 = W1->getSequenceInCore();
+ }
+
+ if (l->iid2 != S2->getIID()) {
+ delete S2;
+ W2->find(l->iid2);
+ S2 = W2->getSequenceInCore();
+ }
+#else
+ S1 = A->getSequenceInCore(l->iid1);
+ S2 = B->getSequenceInCore(l->iid2);
+#endif
+
+ writeGaplessSequence(ARoutput,
+ S1,
+ l->pos1 + l->len1,
+ r->pos1,
+ extend,
+ l, r);
+
+ // Need to deal with reverse matches here! In run matches
+ // should be the same orientation, but we'll still check.
+ //
+ if (l->fwd2 != r->fwd2) {
+ fprintf(stderr, "WOAH! Matches of different orientation in a run?!?\n");
+ exit(1);
+ }
+
+ if (l->fwd2) {
+ writeGaplessSequence(BRoutput,
+ S2,
+ l->pos2 + l->len2,
+ r->pos2,
+ extend,
+ l, r);
+ } else {
+ writeGaplessSequence(BRoutput,
+ S2,
+ r->pos2 + r->len2,
+ l->pos2,
+ extend,
+ l, r);
+ }
+ }
+ }
+}
+
+
+
+
+
+
+
+
+// Print command-line help to stderr, including the -e option that main() parses.
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s [-OP output.fasta] [-e extend] [-t trfile] -m matches\n OP\n", name);
+  fprintf(stderr, " -a extract all unmapped sequence in A\n");
+  fprintf(stderr, " -b extract all unmapped sequence in B\n");
+  fprintf(stderr, " -ar extract within run unmapped sequence in A\n");
+  fprintf(stderr, " -br extract within run unmapped sequence in B\n");
+  fprintf(stderr, " BOTH -ar and -br need to be specified!\n\n");
+  fprintf(stderr, " -t mask out tandem repeats listed in trfile\n");
+  fprintf(stderr, " -e grow each extracted region by up to this many bases per side, stopping at N\n");
+}
+
+// Open 'name' for writing and return the stream; on failure print the reason and exit(1).
+FILE *
+openOutputFile(char *name) {
+  FILE *R = fopen(name, "w");
+  if (R == 0L)  // judge by the returned pointer: errno may be nonzero even after a successful call
+    fprintf(stderr, "Failed to open '%s': %s\n", name, strerror(errno)), exit(1);
+  return(R);
+}
+
+// Optionally mask tandem repeats (-t), then extract unmapped (-a/-b) and/or in-run unmapped (-ar/-br) sequence.
+int
+main(int argc, char *argv[]) {
+  char    *matchesFile = 0L;
+  FILE    *Aoutput     = 0L;
+  FILE    *Boutput     = 0L;
+  FILE    *ARoutput    = 0L;
+  FILE    *BRoutput    = 0L;
+  uint32   extend      = 0;
+  char    *trFile      = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    if (arg + 1 >= argc)  // every option takes a value; without this, argv[argc] (null) was consumed
+      usage(argv[0]), exit(1);
+    if        (strcmp(argv[arg], "-m") == 0) {
+      matchesFile = argv[++arg];
+    } else if (strcmp(argv[arg], "-a") == 0) {
+      Aoutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-b") == 0) {
+      Boutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-ar") == 0) {
+      ARoutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-br") == 0) {
+      BRoutput = openOutputFile(argv[++arg]);
+    } else if (strcmp(argv[arg], "-e") == 0) {
+      extend = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-t") == 0) {
+      trFile = argv[++arg];
+    } else {
+      usage(argv[0]), exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+
+  //  Build caches for both sequences, then modify that sequence to
+  //  mask out tandem repeats.
+  //
+  seqCache  *A = new seqCache(AF.assemblyFileA(), 0, true);
+  seqCache  *B = new seqCache(AF.assemblyFileB(), 0, true);
+
+  A->loadAllSequences();
+  B->loadAllSequences();
+
+  if (trFile) {
+    FILE *F = fopen(trFile, "r");
+    if (F == 0L)  // previously only reported, after which feof() was called on the null stream
+      fprintf(stderr, "Error opening '%s': %s\n", trFile, strerror(errno)), exit(1);
+
+    char          L[1024] = { 0 };
+    splitToWords  W(L);
+
+    fprintf(stderr, "Masking repeats in '%s'\n", trFile);
+
+    uint32  statidx  = 0;
+    uint32  stats[2] = { 0 };
+
+    while (fgets(L, 1024, F) != 0L) {  // feof() turns true only after a read has failed, so test the read
+      W.split(L);
+
+      char    source = W[0][0];
+      uint32  iid    = strtouint32(W[1], 0L);
+      uint32  pos    = strtouint32(W[2], 0L);
+      uint32  len    = strtouint32(W[3], 0L);
+      bool    fwd    = (W[4][0] != '-');
+
+      seqInCore  *S = 0L;
+      char       *s = 0L;
+
+      if (source == 'B') {           //  'B' entries are applied to assembly A
+        S = A->getSequenceInCore(iid);
+        s = A->getSequenceInCore(iid)->sequence();
+        statidx = 0;
+      } else if (source == 'H') {    //  'H' entries are applied to assembly B
+        S = B->getSequenceInCore(iid);
+        s = B->getSequenceInCore(iid)->sequence();
+        statidx = 1;
+      } else {
+        fprintf(stderr, "Unknown source '%c'\n", source);
+        exit(1);
+      }
+
+      //  For a '-' entry, pos is counted from the far end of the sequence, so the
+      //  masked interval starts at sequenceLength - pos - len.
+
+      if (fwd) {
+        s += pos;
+      } else {
+        s += S->sequenceLength() - pos - len;
+      }
+
+      for (uint32 i=0; i<len; i++) {
+        if (toUpper[(int)s[i]] != 'N')
+          stats[statidx]++;
+        s[i] = 'N';
+      }
+    }
+    fclose(F);
+
+    fprintf(stderr, "Done masking. "uint32FMT" in A, "uint32FMT" in B.\n", stats[0], stats[1]);
+  }
+
+
+  if (Aoutput && Boutput) {
+    extractUnmapped(A, B, Aoutput, Boutput, extend, AF, ML);
+    fclose(Aoutput);
+    fclose(Boutput);
+  }
+
+  if (ARoutput && BRoutput) {
+    extractUnmappedRuns(A, B, ARoutput, BRoutput, extend, ML);
+    fclose(ARoutput);
+    fclose(BRoutput);
+  }
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/gapShifter.C b/atac-driver/gapShifter/gapShifter.C
new file mode 100644
index 0000000..ab89572
--- /dev/null
+++ b/atac-driver/gapShifter/gapShifter.C
@@ -0,0 +1,787 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atac.H"
+#include "bio++.H"
+#include "seqCache.H"
+
+#define MAXPRINT 90
+
+#if 0
+#define REPORT_RESULTS
+#define REPORT_UNSHIFTABLE
+#define REPORT_SHIFTING
+#endif
+
+// Global statistics, reset on each iteration
+//
+uint32 numShifted; // valid to be shifted, and were shifted
+uint32 numNotShifted; // but valid to be shifted
+uint32 numDiffSeq; // all the rest are not valid to be shifted
+uint32 numDiffOri; // pair rejected: orientations differ
+uint32 numZeroLen; // pair rejected: a match has a zero length
+uint32 numOutOfOrder; // pair rejected: misordered on B
+uint32 numNotAdjacent; // pair rejected: a gap on both axes
+uint32 numNoGap; // pair rejected: no gap on either axis
+uint32 numGapTooBig; // pair rejected: a gap exceeds gapLimit
+uint32 numOverlapping; // pair rejected: the matches overlap on A or on B
+uint32 amountShifted[1024]; // amountShifted[n] counts gaps shifted by n bases; shifts of 1024 or more are not recorded
+
+FILE *logFile = 0L; // when non-null, shiftGap() writes one line here per gap it shifted
+
+// XXXXXX outer loop needs to skip empty matches!!
+
+// Reads a set of matches and shifts the location of all the gaps to
+// one side or the other.
+//
+// Example:
+//
+// A middle gap:
+// GGGGGGGGGGATATATATATATATATATATATATGGGGGGGGG
+// GGGGGGGGGGATAT--ATATATATATATATATATGGGGGGGGG
+//
+// A left-most gap:
+// GGGGGGGGGGATATATATATATATATATATATATGGGGGGGGG
+// GGGGGGGGGG--ATATATATATATATATATATATGGGGGGGGG
+//
+// A right-most gap:
+// GGGGGGGGGGATATATATATATATATATATATATGGGGGGGGG
+// GGGGGGGGGGATATATATATATATATATATAT--GGGGGGGGG
+//
+// Shifting is done for both assembly-axes.
+
+
+
+// Diagnostic for isPotentiallyShiftable(): print the two matches and, for each of
+// them, the matches on either side of it in the B ordering.  A neighbour is only
+// printed when it exists; indexing MOB with index-1 at the first entry, or with
+// index+1 at the last, would leave the ordering.
+//
+static
+void
+reportMatchInBetween(atacMatch *ma, atacMatch *mb, atacMatchOrder &MOB) {
+  atacMatch *pair[2] = { ma, mb };
+
+  fprintf(stderr, "iid1 "uint32FMT", iid2 "uint32FMT"\n", ma->matchiid, mb->matchiid);
+  ma->print(stderr, "A", "B");
+  mb->print(stderr, "A", "B");
+
+  for (uint32 p=0; p<2; p++) {
+    uint32 idx = MOB.index(pair[p]->matchiid);
+
+    fprintf(stderr, "before, iid"uint32FMT", after\n", p+1);
+    if (idx > 0)
+      MOB[idx-1]->print(stderr, "A", "B");
+    MOB[idx]->print(stderr, "A", "B");
+    if (idx + 1 < MOB.numMatches())
+      MOB[idx+1]->print(stderr, "A", "B");
+  }
+}
+
+
+
+// Returns true if these two matches have a potentially shiftable gap
+// between them.  Potentially shiftable means that the matches are
+// contiguous on one axis, and consecutive (no matches between) on
+// the other axis.  Most rejections increment one of the global num* counters.
+//
+// Assume the matches are sorted by the first sequence.
+//
+bool
+isPotentiallyShiftable(atacMatch *ma,
+                       atacMatch *mb,
+                       atacMatchOrder &MOB,
+                       uint32 gapLimit) {
+
+#ifdef REPORT_UNSHIFTABLE
+  fprintf(stderr, "isPotentiallyShiftable()\n");
+  ma->print(stderr, "A", "B");
+  mb->print(stderr, "A", "B");
+#endif
+
+  //  Not shiftable if on different sequences
+  //
+  if ((ma->iid1 != mb->iid1) ||
+      (ma->iid2 != mb->iid2)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE different sequences\n");
+#endif
+    numDiffSeq++;
+    return(false);
+  }
+
+  //  Not shiftable if the orientation of the two matches is
+  //  different.  This is probably not a gap we want to muck with.
+  //
+  if ((ma->fwd1 != mb->fwd1) ||
+      (ma->fwd2 != mb->fwd2)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE different orientation\n");
+#endif
+    numDiffOri++;
+    return(false);
+  }
+
+  //  Not shiftable if any length is zero.  This isn't a gap, it's a
+  //  dead match.
+  //
+  if ((ma->len1 == 0) ||
+      (ma->len2 == 0) ||
+      (mb->len1 == 0) ||
+      (mb->len2 == 0)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE zero length\n");
+#endif
+    numZeroLen++;
+    return(false);
+  }
+
+  atacMatch *bl = (ma->fwd2) ? ma : mb;  //  the left match on B, relative to forward orientation
+  atacMatch *br = (ma->fwd2) ? mb : ma;  //  the right
+
+  //  Not shiftable if the B matches are out of order
+  //
+  if (bl->pos2 > br->pos2) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE misordered on B\n");
+#endif
+    numOutOfOrder++;
+    return(false);
+  }
+
+  //  Not shiftable if they overlap.  Tested before the gap sizes are computed:
+  //  those are unsigned, so an overlap would otherwise wrap into a huge "gap".
+  if (ma->pos1 + ma->len1 > mb->pos1) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE overlap on sequence A\n");
+#endif
+    numOverlapping++;
+    return(false);
+  }
+  if (bl->pos2 + bl->len2 > br->pos2) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE overlap on sequence B\n");
+#endif
+    numOverlapping++;
+    return(false);
+  }
+
+  uint32 magap = mb->pos1 - (ma->pos1 + ma->len1);
+  uint32 mbgap = br->pos2 - (bl->pos2 + bl->len2);
+
+  //  Not shiftable if there is no zero size gap
+  //
+  if ((magap > 0) && (mbgap > 0)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE no zero size gap ("uint32FMT", "uint32FMT")\n", magap, mbgap);
+#endif
+    numNotAdjacent++;
+    return(false);
+  }
+
+  //  Not shiftable if there is no gap
+  //
+  if ((magap == 0) && (mbgap == 0)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE no gap on both sequences ("uint32FMT", "uint32FMT")\n", magap, mbgap);
+#endif
+    numNoGap++;
+    return(false);
+  }
+
+  //  Not shiftable if the gap is big
+  //
+  if ((magap > gapLimit) || (mbgap > gapLimit)) {
+#ifdef REPORT_UNSHIFTABLE
+    fprintf(stderr, "UNSHIFTABLE gap too big ("uint32FMT", "uint32FMT")\n", magap, mbgap);
+#endif
+    numGapTooBig++;
+    return(false);
+  }
+
+  uint32 iid1 = ma->matchiid;
+  uint32 iid2 = mb->matchiid;
+
+  //  Check that there isn't another match stuck in the middle on
+  //  the B axis.
+  //
+  if (ma->fwd2 == true) {
+    if ((MOB.index(iid1)+1) != MOB.index(iid2)) {
+      fprintf(stderr, "WARNING: Match inbetween! (forward)\n");
+      reportMatchInBetween(ma, mb, MOB);
+      return(false);
+    }
+  } else {
+    if ((MOB.index(iid1)-1) != MOB.index(iid2)) {
+      fprintf(stderr, "WARNING: Match inbetween! (reverse-complement)\n");
+      reportMatchInBetween(ma, mb, MOB);
+      return(false);
+    }
+  }
+
+  return(true);
+}
+
+
+
+void
+dumpAgap(atacMatch *ma, atacMatch *mb,
+ atacMatchOrder &MOB,
+ seqCache *C1, seqCache *C2,
+ uint32 gapLimit,
+ bool shiftRight) { // empty stub: the body has no statements
+}
+
+
+void
+dumpBgap(atacMatch *ma, atacMatch *mb,
+ atacMatchOrder &MOB,
+ seqCache *C1, seqCache *C2,
+ uint32 gapLimit,
+ bool shiftRight) { // empty stub: the body has no statements
+}
+
+
+// Returns the beginning of the sequence from pos to pos+len
+char *
+getSequenceBeg(char *str, uint32 pos, uint32 len, FastAAccessor &it) {
+  uint32 copied = 0;
+  uint32 want   = (len > MAXPRINT) ? MAXPRINT : len;  // copy at most MAXPRINT characters
+
+  it.setRange(pos, len);  // the accessor range is the full request, not the capped length
+
+  if (want > 0) {
+    it.setPosition(pos);
+
+    while (copied < want) {
+      str[copied++] = *it;
+      ++it;
+    }
+  }
+  str[copied] = 0;  // terminate; str must have room for want+1 characters
+
+  return(str);
+}
+
+
+// Returns all sequence from pos to pos+len
+char *
+getSequenceAll(char *str, uint32 pos, uint32 len, FastAAccessor &it) {
+  char *out = str;  // write cursor; str itself is what gets returned
+
+  it.setRange(pos, len);
+
+  if (len > 0) {
+    it.setPosition(pos);
+
+    for (uint32 left=len; left>0; left--, ++it)
+      *out++ = *it;
+  }
+  *out = 0;  // terminate; str must have room for len+1 characters
+
+  return(str);
+}
+
+
+// Returns the end of the sequence from pos to pos+len
+char *
+getSequenceEnd(char *str, uint32 pos, uint32 len, FastAAccessor &it) {
+  uint32 stop = pos + len;                             // one past the last position requested
+  uint32 want = (len > MAXPRINT) ? MAXPRINT : len;     // copy at most MAXPRINT characters
+  uint32 n    = 0;
+
+  it.setRange(pos, len);  // the accessor range is the full request, not the capped length
+
+  if (want > 0) {
+    it.setPosition(stop - want);  // start 'want' characters before the end of the range
+
+    while (n < want) {
+      str[n++] = *it;
+      ++it;
+    }
+  }
+  str[n] = 0;
+
+  return(str);
+}
+
+
+
+uint32
+shiftGap(atacFile &AF,
+ atacMatchList &ML,
+ atacMatch *ma, atacMatch *mb,
+ atacMatchOrder &MOB,
+ seqCache *C1, seqCache *C2,
+ uint32 gapLimit,
+ bool shiftRight) {
+
+#ifdef REPORT_RESULTS
+ fprintf(stderr, "----------------------------------------\n");
+#endif
+
+ if (isPotentiallyShiftable(ma, mb, MOB, gapLimit) == false)
+ return(0);
+
+ // Save a copy of the original matches
+ atacMatch macopy = *ma;
+ atacMatch mbcopy = *mb;
+
+ // Grab the sequences we use, and make the accesors
+ //
+ seqInCore *s1 = C1->getSequenceInCore(ma->iid1);
+ seqInCore *s2 = C2->getSequenceInCore(ma->iid2);
+
+ FastAAccessor mas1(s1, ma->fwd1 == false);
+ FastAAccessor mas2(s2, ma->fwd2 == false);
+ FastAAccessor mbs1(s1, mb->fwd1 == false);
+ FastAAccessor mbs2(s2, mb->fwd2 == false);
+
+ mas1.setRange(ma->pos1, ma->len1);
+ mas2.setRange(ma->pos2, ma->len2);
+ mbs1.setRange(mb->pos1, mb->len1);
+ mbs2.setRange(mb->pos2, mb->len2);
+
+ uint32 shifted = 0;
+
+ // We want to extend ma to the right, this will shift the gap to
+ // the right-most position (relative to the forward genomic).
+ //
+ // While there is a match after ma, extend ma to the right, and
+ // decrease mb from the left.
+ //
+ if (shiftRight == false) {
+
+ // Similar to above. The accessor hides most of the pain caused
+ // by reverse complement.
+
+ mas1.setPosition(ma->pos1 + ma->len1 - 1);
+ mas2.setPosition(ma->pos2 + ma->len2 - 1);
+
+ mbs1.setPosition(mb->pos1); --mbs1;
+ mbs2.setPosition(mb->pos2); --mbs2;
+
+
+#ifdef REPORT_DEBUG
+ // Dump out some sequence to see where we really are
+ //
+ fprintf(stderr, "A: ");
+ for (uint32 i=0; i<50; i++) {
+ fprintf(stderr, "%c", *mas1);
+ --mas1;
+ }
+ fprintf(stderr, "\n");
+
+ fprintf(stderr, "B: ");
+ for (uint32 i=0; i<50; i++) {
+ fprintf(stderr, "%c", *mbs1);
+ --mbs1;
+ }
+ fprintf(stderr, "\n");
+
+ // Reset the iterators
+ //
+ mas1.setPosition(ma->pos1 + ma->len1 - 1);
+ mas2.setPosition(ma->pos2 + ma->len2 - 1);
+
+ mbs1.setPosition(mb->pos1); --mbs1;
+ mbs2.setPosition(mb->pos2); --mbs2;
+#endif
+
+
+
+ while (mas1.isValid() &&
+ mas2.isValid() &&
+ mbs1.isValid() &&
+ mbs2.isValid() &&
+ (ma->len1 > 0) &&
+ (ma->len2 > 0) &&
+ (letterToBits[(int)*mbs1] != 0xff) &&
+ (letterToBits[(int)*mbs2] != 0xff) &&
+ IUPACidentity[(int)*mbs1][(int)*mbs2]) {
+
+#ifdef REPORT_SHIFTING
+ fprintf(stderr, "EXTENDrev: MA %c/%c ----- %c/%c MB\n",
+ *mas1, *mas2, *mbs1, *mbs2);
+#endif
+
+ mas1.extendRight(-1); ma->len1--; --mas1;
+ mas2.extendRight(-1); ma->len2--; --mas2;
+
+ mbs1.extendLeft(1); mb->len1++; --mbs1;
+ mbs2.extendLeft(1); mb->len2++; --mbs2;
+
+ shifted++;
+ }
+ } else {
+
+ // A wants to be the first thing after ma -- the first base in
+ // the gap. Set the position to the last thing in the range,
+ // then use the increment operator to extend past that. The spec
+ // on FastAAccessor says we can't directly go somewhere outside
+ // the range.
+ //
+ mas1.setPosition(ma->pos1 + ma->len1 - 1); ++mas1;
+ mas2.setPosition(ma->pos2 + ma->len2 - 1); ++mas2;
+
+ // B can be set to the first thing in the match with no problem.
+ //
+ mbs1.setPosition(mb->pos1);
+ mbs2.setPosition(mb->pos2);
+
+ // While we're still in sequence (isValid()) and we haven't
+ // obliterated the match we're shifting the gap into, and we can
+ // extend the other match (being both validSymbols and identity),
+ // shift the gap to the right.
+ //
+ while (mas1.isValid() &&
+ mas2.isValid() &&
+ mbs1.isValid() &&
+ mbs2.isValid() &&
+ (mb->len1 > 0) &&
+ (mb->len2 > 0) &&
+ (letterToBits[(int)*mas1]) &&
+ (letterToBits[(int)*mas2]) &&
+ IUPACidentity[(int)*mas1][(int)*mas2]) {
+
+#ifdef REPORT_SHIFTING
+ fprintf(stderr, "EXTENDfwd: MA %c/%c ----- %c/%c MB\n",
+ *mas1, *mas2, *mbs1, *mbs2);
+#endif
+
+ mas1.extendRight(1); ma->len1++; ++mas1;
+ mas2.extendRight(1); ma->len2++; ++mas2;
+
+ mbs1.extendLeft(-1); mb->len1--; ++mbs1;
+ mbs2.extendLeft(-1); mb->len2--; ++mbs2;
+
+ shifted++;
+ }
+ }
+
+ // Finally, update the two matches with the shifted results.
+
+ ma->pos1 = mas1.getRangeBegin();
+ ma->len1 = mas1.getRangeLength();
+ ma->pos2 = mas2.getRangeBegin();
+ ma->len2 = mas2.getRangeLength();
+ mb->pos1 = mbs1.getRangeBegin();
+ mb->len1 = mbs1.getRangeLength();
+ mb->pos2 = mbs2.getRangeBegin();
+ mb->len2 = mbs2.getRangeLength();
+
+
+ //
+ // The rest is just error checking.
+ //
+
+ if (shifted)
+ numShifted++;
+ else
+ numNotShifted++;
+
+ if (shifted < 1024)
+ amountShifted[shifted]++;
+
+ // leftmatch origend newend rightmatch origbegin newbegin
+ if (shifted && logFile) {
+ if (ma->fwd2) {
+ // Forward matches are easy.
+ //
+ fprintf(logFile, "%s\t%s\t%s:"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t->\t"uint32FMT"\t"uint32FMT"\t",
+ ma->matchuid, mb->matchuid,
+ AF.labelA(), ma->iid1,
+ macopy.pos1 + macopy.len1, ma->pos1 + ma->len1,
+ mbcopy.pos1, mb->pos1);
+ fprintf(logFile, "%s:"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t->\t"uint32FMT"\t"uint32FMT"\n",
+ AF.labelB(), ma->iid2,
+ macopy.pos2 + macopy.len2, ma->pos2 + ma->len2,
+ mbcopy.pos2, mb->pos2);
+ } else {
+ // Reverse matches are painful. The gap on B is between the
+ // right edge of mb and the left edge of ma.
+ //
+ fprintf(logFile, "%s\t%s\t%s:"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t->\t"uint32FMT"\t"uint32FMT"\t",
+ ma->matchuid, mb->matchuid,
+ AF.labelA(), ma->iid1,
+ macopy.pos1 + macopy.len1, ma->pos1 + ma->len1,
+ mbcopy.pos1, mb->pos1);
+ fprintf(logFile, "%s:"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t->\t"uint32FMT"\t"uint32FMT"\n",
+ AF.labelB(), ma->iid2,
+ mbcopy.pos2 + mbcopy.len2, mb->pos2 + mb->len2,
+ macopy.pos2, ma->pos2);
+ }
+ }
+
+
+#ifdef REPORT_RESULTS
+ if (shifted)
+ fprintf(stderr, "SHIFTED "uint32FMT" bases.\n", shifted);
+ else
+ fprintf(stderr, "NOT SHIFTED.\n");
+
+ fprintf(stderr, uint32FMTW(9)"-"uint32FMTW(9)" -- "uint32FMTW(9)"-"uint32FMTW(9)"\n",
+ macopy.pos1, macopy.pos1 + macopy.len1,
+ mbcopy.pos1, mbcopy.pos1 + mbcopy.len1);
+ fprintf(stderr, uint32FMTW(9)"-"uint32FMTW(9)" -- "uint32FMTW(9)"-"uint32FMTW(9)"\n",
+ macopy.pos2, macopy.pos2 + macopy.len2,
+ mbcopy.pos2, mbcopy.pos2 + mbcopy.len2);
+ fprintf(stderr, "shifted "uint32FMT" bases (fwd1=%d fwd2=%d fwd1=%d fwd2=%d)\n", shifted, ma->fwd1, ma->fwd2, mb->fwd1, mb->fwd2);
+ fprintf(stderr, uint32FMTW(9)"-"uint32FMTW(9)" -- "uint32FMTW(9)"-"uint32FMTW(9)"\n",
+ ma->pos1, ma->pos1 + ma->len1,
+ mb->pos1, mb->pos1 + mb->len1);
+ fprintf(stderr, uint32FMTW(9)"-"uint32FMTW(9)" -- "uint32FMTW(9)"-"uint32FMTW(9)"\n",
+ ma->pos2, ma->pos2 + ma->len2,
+ mb->pos2, mb->pos2 + mb->len2);
+#endif
+
+ uint32 errors = 0;
+
+ if (macopy.pos1 != ma->pos1)
+ fprintf(stderr, "WARNING: begin of assembly 1 moved!\n"), errors++;
+ if (mbcopy.pos1 + mbcopy.len1 != mb->pos1 + mb->len1)
+ fprintf(stderr, "WARNING: end of assembly 1 moved!\n"), errors++;
+
+ if ((ma->fwd2 == true) && (macopy.pos2 != ma->pos2))
+ fprintf(stderr, "WARNING: begin of assembly 2 moved!\n"), errors++;
+ if ((ma->fwd2 == false) && (mbcopy.pos2 != mb->pos2))
+ fprintf(stderr, "WARNING: begin of assembly 2 moved (rc)!\n"), errors++;
+
+ if ((ma->fwd2 == true) && (mbcopy.pos2 + mbcopy.len2 != mb->pos2 + mb->len2))
+ fprintf(stderr, "WARNING: end of assembly 1 moved!\n"), errors++;
+ if ((ma->fwd2 == false) && (macopy.pos2 + macopy.len2 != ma->pos2 + ma->len2))
+ fprintf(stderr, "WARNING: end of assembly 2 moved (rc)!\n"), errors++;
+
+ // For debugging, claim there were errors if we shifted something.
+#ifdef REPORT_RESULTS
+ errors++;
+#endif
+
+ if (errors > 0) {
+ atacMatch *l = 0L;
+ atacMatch *r = 0L;
+ char str1[1024];
+ char str2[1024];
+ char str3[1024];
+
+ // Print the sequence. We could print each piece separately
+ // (and, indeed, we tried that initially) but that's difficult
+ // because we need to remember which match is first on B,
+
+ macopy.print(stderr, "A", "B");
+ mbcopy.print(stderr, "A", "B");
+ l = &macopy;
+ r = &mbcopy;
+
+ getSequenceEnd(str1, l->pos1, l->len1, mas1);
+ getSequenceAll(str2, l->pos1 + l->len1, r->pos1 - l->pos1 - l->len1, mas1);
+ getSequenceBeg(str3, r->pos1, r->len1, mas1);
+ fprintf(stderr, "SEQA: %s -- %s -- %s\n", str1, str2, str3);
+
+ if (macopy.fwd2) {
+ // We're forward, so l is really first on B.
+ getSequenceEnd(str1, l->pos2, l->len2, mas2);
+ getSequenceAll(str2, l->pos2 + l->len2, r->pos2 - l->pos2 - l->len2, mas2);
+ getSequenceBeg(str3, r->pos2, r->len2, mas2);
+ } else {
+ // Nope, reverse complement, so r is really first on B. This
+ // only changes how we get the gap.
+ //
+ getSequenceEnd(str1, l->pos2, l->len2, mas2);
+ getSequenceAll(str2, r->pos2 + r->len2, l->pos2 - r->pos2 - r->len2, mas2);
+ getSequenceBeg(str3, r->pos2, r->len2, mas2);
+ }
+ fprintf(stderr, "SEQB: %s -- %s -- %s\n", str1, str2, str3);
+
+
+ // Do the same thing (same getSequence calls) for the after picture.
+
+ ma->print(stderr, "A", "B");
+ mb->print(stderr, "A", "B");
+ l = ma;
+ r = mb;
+
+ getSequenceEnd(str1, l->pos1, l->len1, mas1);
+ getSequenceAll(str2, l->pos1 + l->len1, r->pos1 - l->pos1 - l->len1, mas1);
+ getSequenceBeg(str3, r->pos1, r->len1, mas1);
+ fprintf(stderr, "SEQA: %s -- %s -- %s\n", str1, str2, str3);
+
+ if (ma->fwd2) {
+ getSequenceEnd(str1, l->pos2, l->len2, mas2);
+ getSequenceAll(str2, l->pos2 + l->len2, r->pos2 - l->pos2 - l->len2, mas2);
+ getSequenceBeg(str3, r->pos2, r->len2, mas2);
+ } else {
+ getSequenceEnd(str1, l->pos2, l->len2, mas2);
+ getSequenceAll(str2, r->pos2 + r->len2, l->pos2 - r->pos2 - r->len2, mas2);
+ getSequenceBeg(str3, r->pos2, r->len2, mas2);
+ }
+ fprintf(stderr, "SEQB: %s -- %s -- %s\n", str1, str2, str3);
+ }
+
+ //if (errors)
+ // exit(1);
+
+ return(shifted);
+}
+
+
+
+
+
+// gapShifter driver.
+//
+// Reads atac matches from stdin ("-"), then interprets the command
+// line words in order as a small program:
+//   log x -- open 'x' as the log file for the next shift pass
+//   l / r -- run one shift pass (left / right) over every pair of
+//            matches adjacent in A order
+//   other -- parsed as a number, the maximum gap size to shift
+// After all words are processed, every match that still has non-zero
+// length on both axes is written to stdout.
+//
+// Uses the file-scope counters (numShifted, amountShifted, ...) and
+// the file-scope logFile, which shiftGap() also reads.
+//
+int
+main(int argc, char *argv[]) {
+
+  if (argc == 1) {
+    fprintf(stderr, "usage: %s [options] < matches > matches\n", argv[0]);
+    fprintf(stderr, " Instead of the usual switch based options to enable behavior\n");
+    fprintf(stderr, " gapShifter iterates over a list of shift directions and sizes.\n");
+    fprintf(stderr, " l -- shift gaps to the left\n");
+    fprintf(stderr, " r -- shift gaps to the right\n");
+    fprintf(stderr, " # -- set the maximum size of a gap to shift\n");
+    fprintf(stderr, " log x -- open a logfile 'x' for results of the next shift\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " for example\n");
+    fprintf(stderr, " gapShifter 1 l r l r 10 l r l log X r < some.atac > shifted.atac\n");
+    fprintf(stderr, " would shift 1bp gaps to the left, then to the right, then left,\n");
+    fprintf(stderr, " then set the gap size to 10bp and repeat. The last shift is logged\n");
+    fprintf(stderr, " into file 'X'.\n");
+    fprintf(stderr, " \n");
+    fprintf(stderr, " This is useful since shifting gaps can obliterate matches, but possibly.\n");
+    fprintf(stderr, " when both left and right shifts are used.\n");
+    fprintf(stderr, " GCTAATTAGACG\n");
+    fprintf(stderr, " GCT-AT-AGACG\n");
+    fprintf(stderr, " The second gap can be shifted to the left, and the first gap can be\n");
+    fprintf(stderr, " shifted right, resulting in\n");
+    fprintf(stderr, " GCTAATTAGACG\n");
+    fprintf(stderr, " GCTA--TAGACG\n");
+    fprintf(stderr, " Thus, two one base gaps were merged into a two base gap, which might\n");
+    fprintf(stderr, " then be able to be shifted. e.g.:\n");
+    fprintf(stderr, " atgatcatcttatc\n");
+    fprintf(stderr, " at---c-t--tatc\n");
+    exit(1);
+  }
+
+  atacFile        AF("-");
+  atacMatchList  &ML = *AF.matches();
+  atacMatchOrder  MOA(ML);
+  atacMatchOrder  MOB(ML);
+
+  MOA.sortA();
+  MOB.sortB();
+
+  // second to last == loadAll
+  // last == report loading
+  //
+  seqCache *C1 = new seqCache(AF.assemblyFileA(), 2, false);
+  seqCache *C2 = new seqCache(AF.assemblyFileB(), 1024, false);
+
+  bool    shiftRight  = true;
+  uint32  gapLimit    = 5;
+
+  char   *logFileName = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    bool doShift = false;
+
+    if (strcmp(argv[arg], "log") == 0) {
+      // 'log' must be followed by a file name; argv[argc] is a null
+      // pointer and must not reach fopen().
+      if (arg + 1 >= argc)
+        fprintf(stderr, "gapShifter: 'log' needs a file name.\n"), exit(1);
+
+      // A log opened by an earlier 'log' word that no shift has used
+      // yet is still open; close it before replacing it.
+      if (logFile)
+        fclose(logFile);
+
+      // Test the returned stream, not errno: errno is only meaningful
+      // after fopen() has reported failure.
+      logFileName = argv[++arg];
+      logFile     = fopen(logFileName, "w");
+      if (logFile == 0L)
+        fprintf(stderr, "gapShifter: can't open log file '%s': %s\n", logFileName, strerror(errno)), exit(1);
+    } else if (strcmp(argv[arg], "l") == 0) {
+      shiftRight = false;
+      doShift    = true;
+    } else if (strcmp(argv[arg], "r") == 0) {
+      shiftRight = true;
+      doShift    = true;
+    } else {
+      gapLimit = strtouint32(argv[arg], 0L);
+    }
+
+    if (doShift) {
+
+      // Statistics are per pass; reset them all.
+      for (uint32 x=0; x<1024; x++)
+        amountShifted[x] = 0;
+
+      numShifted     = 0;
+      numNotShifted  = 0;
+      numDiffSeq     = 0;
+      numDiffOri     = 0;
+      numZeroLen     = 0;
+      numOutOfOrder  = 0;
+      numNotAdjacent = 0;
+      numNoGap       = 0;
+      numGapTooBig   = 0;
+      numOverlapping = 0;
+
+      fprintf(stderr, "Shifting gaps of length at most " uint32FMT " bases, to the %s.\n", gapLimit, (shiftRight) ? "right" : "left");
+
+      uint32 gapsShifted = 0;
+      for (uint32 i=1; i<ML.numMatches(); i++) {
+
+        if (shiftGap(AF, ML, MOA[i-1], MOA[i], MOB, C1, C2, gapLimit, shiftRight)) {
+          gapsShifted++;
+          //fprintf(stderr, "shifted "uint32FMT" out of "uint32FMT" (%6.2f%%)\r", gapsShifted, i, (double)gapsShifted / (double)i * 100.0);
+          //fflush(stderr);
+        }
+      }
+
+      fprintf(stderr, "numShifted = " uint32FMT "\n", numShifted);
+      fprintf(stderr, "numNotShifted = " uint32FMT "\n", numNotShifted);
+      fprintf(stderr, "numDiffSeq = " uint32FMT "\n", numDiffSeq);
+      fprintf(stderr, "numDiffOri = " uint32FMT "\n", numDiffOri);
+      fprintf(stderr, "numZeroLen = " uint32FMT "\n", numZeroLen);
+      fprintf(stderr, "numOutOfOrder = " uint32FMT "\n", numOutOfOrder);
+      fprintf(stderr, "numNotAdjacent = " uint32FMT "\n", numNotAdjacent);
+      fprintf(stderr, "numNoGap = " uint32FMT "\n", numNoGap);
+      fprintf(stderr, "numGapTooBig = " uint32FMT "\n", numGapTooBig);
+      fprintf(stderr, "numOverlapping = " uint32FMT "\n", numOverlapping);
+
+      for (uint32 x=0; x<50; x++)
+        fprintf(stderr, "amountShifted[" uint32FMT "] = " uint32FMT " (number of gaps shifted by [number of bases])\n", x, amountShifted[x]);
+
+      // An empty match list would otherwise print 0/0.
+      double shiftedPercent = 0.0;
+      if (ML.numMatches() > 0)
+        shiftedPercent = (double)gapsShifted / (double)ML.numMatches() * 100.0;
+
+      fprintf(stderr, "shifted " uint32FMT " out of " uint32FMT " (%6.2f%%)\n", gapsShifted, ML.numMatches(), shiftedPercent);
+
+      // The log applies to one pass only.
+      if (logFile) {
+        fclose(logFile);
+        logFileName = 0L;
+        logFile     = 0L;
+      }
+    }
+
+    arg++;
+  }
+
+  // A trailing 'log x' with no shift after it leaves the file open.
+  if (logFile) {
+    fclose(logFile);
+    logFile = 0L;
+  }
+
+  // Shifting can shrink a match to nothing; those are dropped here.
+  for (uint32 i=0; i<ML.numMatches(); i++) {
+    atacMatch *ma = ML[i];
+    if ((ma->len1 > 0) && (ma->len2 > 0))
+      fprintf(stdout, "M u %s %s %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %d\n",
+              ma->matchuid, ma->parentuid,
+              AF.labelA(), ma->iid1, ma->pos1, ma->len1,
+              AF.labelB(), ma->iid2, ma->pos2, ma->len2, ma->fwd2 ? 1 : -1);
+  }
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/projectFeatures-test-cases/test-rev.atac b/atac-driver/gapShifter/projectFeatures-test-cases/test-rev.atac
new file mode 100644
index 0000000..51a622d
--- /dev/null
+++ b/atac-driver/gapShifter/projectFeatures-test-cases/test-rev.atac
@@ -0,0 +1,2 @@
+!format atac 1.0
+M u m2 r2 A:0 100 200 1 B:0 500 200 -1
diff --git a/atac-driver/gapShifter/projectFeatures-test-cases/test.atac b/atac-driver/gapShifter/projectFeatures-test-cases/test.atac
new file mode 100644
index 0000000..07f9ed2
--- /dev/null
+++ b/atac-driver/gapShifter/projectFeatures-test-cases/test.atac
@@ -0,0 +1,2 @@
+!format atac 1.0
+M u m1 r1 A:0 100 200 1 B:0 500 200 1
diff --git a/atac-driver/gapShifter/projectFeatures-test-cases/test.atac.log b/atac-driver/gapShifter/projectFeatures-test-cases/test.atac.log
new file mode 100644
index 0000000..f6a1113
--- /dev/null
+++ b/atac-driver/gapShifter/projectFeatures-test-cases/test.atac.log
@@ -0,0 +1,10 @@
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
+At gapSize=1000 closed 0 f-gaps and 0 r-gaps.
diff --git a/atac-driver/gapShifter/projectFeatures-test-cases/test.ataf b/atac-driver/gapShifter/projectFeatures-test-cases/test.ataf
new file mode 100644
index 0000000..4930b0d
--- /dev/null
+++ b/atac-driver/gapShifter/projectFeatures-test-cases/test.ataf
@@ -0,0 +1,23 @@
+! format atac 1.0
+F tr test01 . A:0 0 50
+
+# Test containment
+F tr test02 . A:0 100 200
+F tr test03 . A:0 100 150
+F tr test04 . A:0 150 100
+F tr test05 . A:0 250 150
+
+# Test before
+F tr test06 . A:0 50 40
+F tr test07 . A:0 50 50
+F tr test08 . A:0 50 150
+F tr test09 . A:0 50 250
+
+# Test after
+F tr test10 . A:0 100 250
+F tr test11 . A:0 290 50
+F tr test12 . A:0 300 50
+F tr test13 . A:0 310 50
+
+# Match contained in feature
+F tr test14 . A:0 50 400
diff --git a/atac-driver/gapShifter/projectFeatures-test-cases/test.log b/atac-driver/gapShifter/projectFeatures-test-cases/test.log
new file mode 100644
index 0000000..e69de29
diff --git a/atac-driver/gapShifter/projectFeatures.C b/atac-driver/gapShifter/projectFeatures.C
new file mode 100644
index 0000000..e3728fe
--- /dev/null
+++ b/atac-driver/gapShifter/projectFeatures.C
@@ -0,0 +1,208 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+// Prints the command line synopsis to stderr.  'name' is the program
+// name to show (callers pass argv[0]).  main() requires all three of
+// -m, -f and -l, so all three are listed.
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s -m matches -f features -l log\n", name);
+  fprintf(stderr, " When it works, fill this in...\n");
+}
+
+// Reads an atac mapping, and a list of features. Features on one
+// axis are projected to the other axis using the atac map.
+
+// Projects features from the A axis onto the B axis using the atac
+// matches in the -m file, writing one "M u ..." line per projected
+// feature to stdout.
+//
+// Options (all required, each takes a value):
+//   -m file   atac matches
+//   -f file   features (stored but not read yet, see the BROKEN
+//             warning below: the feature list is left empty)
+//   -l file   log file (opened for writing, nothing is logged yet)
+//
+// Matches (sorted on A) and features (sorted) are walked in step.
+// Each feature is consumed by the first match it reaches the case
+// analysis with, so a feature spanning several matches is projected
+// through only one of them.
+//
+int
+main(int argc, char **argv) {
+  char  *matchesFile = 0L;
+  char  *featureFile = 0L;
+  FILE  *logFile     = 0L;
+
+  // Every option needs a value.  An option given as the very last
+  // word has none, so it falls through to the usage message instead
+  // of reading argv[argc].
+  int arg=1;
+  while (arg < argc) {
+    bool hasValue = (arg + 1 < argc);
+
+    if (hasValue && (strcmp(argv[arg], "-m") == 0)) {
+      matchesFile = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-f") == 0)) {
+      featureFile = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-l") == 0)) {
+      // Test the returned stream; errno is only meaningful once
+      // fopen() has reported failure.  The file name goes with the
+      // first %s, the reason with the second.
+      logFile = fopen(argv[++arg], "w");
+      if (logFile == 0L)
+        fprintf(stderr, "Failed to open logfile '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else {
+      usage(argv[0]);
+      exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+  if (featureFile == 0L)
+    usage(argv[0]), exit(1);
+  if (logFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+  atacMatchOrder  MO(ML);
+
+
+  // XXXX extrabroken!
+#warning BROKEN
+  atacFeatureList FL;
+
+  // Project features from A to B.
+  MO.sortA();
+  FL.sort();
+
+  uint32 mid = 0;   // index of the current match in MO
+  uint32 fid = 0;   // index of the current feature in FL
+  uint32 pid = 0;   // serial number used in the projected names
+
+  while ((mid < MO.numberOfMatches()) &&
+         (fid < FL.numberOfFeatures())) {
+    atacMatch   *m = MO[mid];
+    atacFeature *f = FL[fid];
+
+    // Advance whichever list is behind until both are on the same
+    // A sequence.
+    if (m->iid1 < f->iid) {
+      mid++;
+      continue;
+    }
+    if (f->iid < m->iid1) {
+      fid++;
+      continue;
+    }
+
+    // Same sequences now!
+
+    // NOTE(review): both tests below are strict.  A match that ends
+    // exactly where the feature begins passes them and lands in the
+    // last case with len == 0, consuming the feature without output.
+    // Confirm whether <= was intended.
+
+    if (m->pos1 + m->len1 < f->pos) {
+      // match ends before the feature
+      mid++;
+      continue;
+    }
+
+    if (f->pos + f->len < m->pos1) {
+      // Feature ends before the match begins
+      fid++;
+      continue;
+    }
+
+    // Feature and match now overlap!
+
+
+    //
+    // This does A -> B -- ONLY.
+    //
+
+
+
+    // If feature is completely in match, this is easy.
+    //
+    if ((m->pos1 <= f->pos) && ((f->pos + f->len) <= (m->pos1 + m->len1))) {
+      uint32 beg;
+
+      // On a reversed match the offset is measured from the end of
+      // the B interval.
+      if (m->fwd2 == true) {
+        beg = m->pos2 + f->pos - m->pos1;
+      } else {
+        beg = m->pos2 + m->len2 - (f->pos - m->pos1) - f->len;
+      }
+
+      if (f->len > 0)
+        fprintf(stdout, "M u Aprojected" uint32FMT " %s.%s %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %d\n",
+                pid,
+                f->featureuid, m->matchuid,
+                AF.labelA(), f->iid, f->pos, f->len,
+                AF.labelB(), m->iid2, beg, f->len, (m->fwd2) ? 1 : -1);
+      pid++;
+      fid++;
+      continue;
+    }
+
+    // If match is completely within feature, super easy!
+    //
+    if ((f->pos < m->pos1) && (m->pos1 + m->len1) < (f->pos + f->len)) {
+      if (m->len1 > 0)
+        fprintf(stdout, "M u Bprojected" uint32FMT " %s.%s %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %d\n",
+                pid,
+                f->featureuid, m->matchuid,
+                AF.labelA(), m->iid1, m->pos1, m->len1,
+                AF.labelB(), m->iid2, m->pos2, m->len2, (m->fwd2) ? 1 : -1);
+      pid++;
+      fid++;
+      continue;
+    }
+
+
+    // Dang, feature isn't completely in match. Guess where feature
+    // could be ending? Or just project as much as possible?
+
+    // Feature starts before the match: project the part from the
+    // start of the match onward.
+    if (f->pos < m->pos1) {
+      uint32 len = f->len - (m->pos1 - f->pos);
+      uint32 beg;
+
+      if (m->fwd2 == true) {
+        beg = m->pos2;
+      } else {
+        beg = m->pos2 + m->len2 - len;
+      }
+
+      if (len > 0)
+        fprintf(stdout, "M u Cprojected" uint32FMT " %s.%s %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %d\n",
+                pid,
+                f->featureuid, m->matchuid,
+                AF.labelA(), f->iid, m->pos1, len,
+                AF.labelB(), m->iid2, beg, len, (m->fwd2) ? 1 : -1);
+      pid++;
+      fid++;
+      continue;
+    }
+
+    // Feature runs past the end of the match: project the part up
+    // to the end of the match.
+    if (m->pos1 + m->len1 < f->pos + f->len) {
+      uint32 len = m->pos1 + m->len1 - f->pos;
+      uint32 beg;
+
+      if (m->fwd2 == true) {
+        beg = m->pos2 + m->len2 - len;
+      } else {
+        beg = m->pos2;
+      }
+
+      if (len > 0)
+        fprintf(stdout, "M u Dprojected" uint32FMT " %s.%s %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %d\n",
+                pid,
+                f->featureuid, m->matchuid,
+                AF.labelA(), f->iid, f->pos, len,
+                AF.labelB(), m->iid2, beg, len, (m->fwd2) ? 1 : -1);
+      pid++;
+      fid++;
+      continue;
+    }
+
+    fprintf(stderr, "projectFeatures: Unhandled case?\n");
+    m->print(stdout, "A", "B");
+    f->print(stdout, "A");
+
+    assert(0);
+  }
+
+  fclose(logFile);
+
+  return(0);
+}
diff --git a/atac-driver/gapShifter/testAtac.C b/atac-driver/gapShifter/testAtac.C
new file mode 100644
index 0000000..97afcdd
--- /dev/null
+++ b/atac-driver/gapShifter/testAtac.C
@@ -0,0 +1,107 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+// Reads a set of atac matches, computes the percent identity of the
+// regions, and warns if any identities are low.
+
+// Prints the command line synopsis to stderr.  'name' is the program
+// name to show (callers pass argv[0]).  Only the options main()
+// actually accepts (-i and -m) are listed.
+void
+usage(char *name) {
+  fprintf(stderr, "usage: %s [-i identity] -m matches\n", name);
+  fprintf(stderr, " -i print a warning if a match is below this percent identity\n");
+}
+
+// Checks the identity of every match in an atac file.
+//
+// Options:
+//   -m file   atac matches (required)
+//   -i x      identity threshold; values above 1.0 are divided by
+//             100.  Defaults to 0.9.
+//
+// Both assemblies named by the atac file are loaded completely.  For
+// each match the bases on A are compared, case-insensitively, against
+// the bases on B (complemented and read backwards for reversed
+// matches).  Matches whose identity is below the threshold are
+// reported on stderr, with the raw sequence if shorter than 200 bases.
+//
+int
+main(int argc, char *argv[]) {
+  char   *matchesFile   = 0L;
+  double  identityLimit = 0.9;
+
+  // Both options take a value; an option given as the last word has
+  // none and falls through to the usage message.
+  int arg=1;
+  while (arg < argc) {
+    bool hasValue = (arg + 1 < argc);
+
+    if (hasValue && (strcmp(argv[arg], "-m") == 0)) {
+      matchesFile = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-i") == 0)) {
+      identityLimit = atof(argv[++arg]);
+      if (identityLimit > 1.0)
+        identityLimit /= 100;
+    } else {
+      usage(argv[0]);
+      exit(1);
+    }
+    arg++;
+  }
+
+  if (matchesFile == 0L)
+    usage(argv[0]), exit(1);
+
+  atacFile        AF(matchesFile);
+  atacMatchList  &ML = *AF.matches();
+  seqCache        Acache(AF.assemblyFileA(), 0, false);
+  seqCache        Bcache(AF.assemblyFileB(), 0, false);
+
+  Acache.loadAllSequences();
+  Bcache.loadAllSequences();
+
+  for (uint32 i=0; i<ML.numMatches(); i++) {
+    atacMatch *m = ML.getMatch(i);
+
+    // Nothing to compare in an empty match; skipping also avoids a
+    // 0/0 identity and the wrap of len2-1 in the reverse loop below.
+    if ((m->len1 == 0) || (m->len2 == 0))
+      continue;
+
+    uint32 identities = 0;
+
+    char *a = Acache.getSequenceInCore(m->iid1)->sequence() + m->pos1;
+    char *b = Bcache.getSequenceInCore(m->iid2)->sequence() + m->pos2;
+
+    if (m->fwd2) {
+      for (uint32 p=0; p<m->len1; p++) {
+        if (toUpper[(int)a[p]] == toUpper[(int)b[p]])
+          identities++;
+      }
+    } else {
+      // q walks B backwards from its last base.  Stop after len2
+      // steps as well, so q never wraps below zero when the B side
+      // is shorter than the A side.
+      for (uint32 p=0, q=m->len2-1; (p<m->len1) && (p<m->len2); p++, q--) {
+        if (toUpper[(int)a[p]] == toUpper[complementSymbol[(int)b[q]]])
+          identities++;
+      }
+    }
+
+    double myIdentity = (double)identities / m->len1;
+
+    if (myIdentity < identityLimit) {
+      fprintf(stderr, "match " uint32FMT " is only %6.2f%% identity: ",
+              i, 100.0 * identities / m->len1);
+      m->print(stderr, AF.labelA(), AF.labelB());
+      if (m->len1 < 200) {
+        char tmp[1000];
+
+        // len1 < 200, so the copy and the terminator fit in tmp.
+        strncpy(tmp, a, m->len1);
+        tmp[m->len1] = 0;
+        fprintf(stderr, " %s\n", tmp);
+
+        strncpy(tmp, b, m->len1);
+        tmp[m->len1] = 0;
+        fprintf(stderr, " %s\n", tmp);
+      }
+    }
+  }
+
+  return(0);
+}
diff --git a/atac-driver/interscaffold-gaps.pl b/atac-driver/interscaffold-gaps.pl
new file mode 100644
index 0000000..f02b88b
--- /dev/null
+++ b/atac-driver/interscaffold-gaps.pl
@@ -0,0 +1,134 @@
+#!/usr/bin/perl
+
+#  Reports gaps between consecutive scaffolds placed on the same
+#  reference chromosome.
+#
+#  Pass 1 extracts the run matches ("M r" lines) of the atac file
+#  that are on a reference chromosome below -c and longer than -m,
+#  and writes them, sorted by reference chromosome then position, to
+#  "<atacFile>.gaps.sorted".
+#
+#  Pass 2 reads that file back and prints a "GAP:" line for each
+#  positive distance between the end of one scaffold's run and the
+#  start of the next scaffold's run on the same chromosome.
+
+use strict;
+
+my $atacFile  = undef;
+my $minLen    = 10000;
+my $maxChr    = 24;
+my $reference = 0;
+
+while (scalar(@ARGV)) {
+    my $arg = shift @ARGV;
+
+    if ($arg eq "-m") {
+        $minLen = shift @ARGV;
+    } elsif ($arg eq "-c") {
+        $maxChr = shift @ARGV;
+    } elsif ($arg eq "-a") {
+        $atacFile = shift @ARGV;
+    } elsif ($arg eq "-A") {
+        $reference = 0;
+    } elsif ($arg eq "-B") {
+        $reference = 1;
+    } else {
+        die "Invalid option '$arg'\n";
+    }
+}
+
+if (!defined($atacFile)) {
+    print STDERR "usage: $0 [-m minlen] [-c maxchr] [-A | -B] -a <some.atac.ckpLast>\n";
+    print STDERR " -m m Include matches larger than 'm'. Default: 10000\n";
+    print STDERR " -c c Include chromosomes in the reference below 'c'. Default: 24 (1-22+X+Y)\n";
+    print STDERR " -A | -B The reference genome is sequence A (B).\n";
+    print STDERR " -a x Process matches from atac file 'x'\n";
+    exit(1);
+}
+
+#  Pass 1.  The sorted file is always rebuilt.  Failing to open the
+#  input or to start/finish 'sort' is fatal; otherwise pass 2 would
+#  silently work on a missing or stale file.
+#
+#if (! -e "$atacFile.gaps.sorted") {
+    open(F, "<", $atacFile)                               or die "Failed to open '$atacFile' for reading: $!\n";
+    open(G, "| sort -k1n -k2n > $atacFile.gaps.sorted")   or die "Failed to start sort into '$atacFile.gaps.sorted': $!\n";
+    while (<F>) {
+        if (m/^M\sr\s/) {
+            my @vals = split '\s+', $_;
+
+            #  Fields 4 and 8 are "label:index"; keep only the index.
+            (undef, $vals[4]) = split ':', $vals[4];
+            (undef, $vals[8]) = split ':', $vals[8];
+
+            #  The reference side is always written first.
+            if (($reference == 0) && ($vals[4] < $maxChr) && ($vals[6] > $minLen)) {
+                print G "$vals[4] $vals[5] $vals[6] - $vals[8] $vals[9] $vals[10] - $vals[11]\n";
+            }
+            if (($reference == 1) && ($vals[8] < $maxChr) && ($vals[10] > $minLen)) {
+                print G "$vals[8] $vals[9] $vals[10] - $vals[4] $vals[5] $vals[6] - $vals[11]\n";
+            }
+        }
+    }
+    close(F);
+    close(G)                                              or die "sort into '$atacFile.gaps.sorted' failed: $! (status $?)\n";
+#}
+
+
+#  Pass 2.  Parallel arrays, one entry per sorted line.
+my @chr;
+my @pos;
+my @len;
+my @scf;
+
+open(F, "<", "$atacFile.gaps.sorted")                     or die "Failed to open '$atacFile.gaps.sorted' for reading: $!\n";
+while (<F>) {
+    my @vals = split '\s+', $_;
+    push @chr, $vals[0];
+    push @pos, $vals[1];
+    push @len, $vals[2];
+    push @scf, $vals[4];
+}
+close(F);
+
+#  Two sentinels, so that $scf[$j] and $scf[$j+1] below can be read
+#  one and two past the last real entry.
+push @scf, -1;
+push @scf, -1;
+
+my $num = scalar(@chr) - 1;
+
+# We compute stats on the distance between elements i and j
+#
+my $i = 0;
+my $j = 1;
+
+print "GAPS\n";
+
+while ($i < $num) {
+
+    $j = $i+1;
+
+    if ($chr[$i] != $chr[$j]) {
+        $i++;
+        next;
+    }
+
+  again:
+
+    # Move j ahead if it's an interleaved scaffold
+    #
+    # If our current scaffold is interleaved by someone else,
+    # skip that someone else. Yes, the end result of this
+    # is to have i and j point to the same scaffold.
+    #
+    if (($chr[$i] == $chr[$j]) &&
+        ($scf[$i] == $scf[$j+1]) &&
+        ($len[$j] < 5000)) {
+        $j++;
+    }
+
+    # Move j ahead if it's the same scaffold
+    #
+    if (($chr[$i] == $chr[$j]) &&
+        ($scf[$i] == $scf[$j])) {
+        $i = $j;
+        $j++;
+        goto again;
+    }
+
+
+
+    # Report, if begin and end are on the same chromosome.
+    if ($chr[$i] == $chr[$j]) {
+        my $aend = $pos[$i] + $len[$i];
+        my $bsta = $pos[$j];
+        my $gapl = $bsta - $aend;
+
+        if ($gapl > 0) {
+            print "GAP: $gapl -- $i ($chr[$i] $pos[$i] $len[$i] $scf[$i]) -- $j ($chr[$j] $pos[$j] $len[$j] $scf[$j])\n";
+        }
+    }
+
+    $i = $j;
+}
+
+
diff --git a/atac-driver/lengthFilter/Make.include b/atac-driver/lengthFilter/Make.include
new file mode 100644
index 0000000..825b275
--- /dev/null
+++ b/atac-driver/lengthFilter/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+
+# Build description for the lengthFilter program.
+# NOTE(review): '$/' is not defined in this fragment; presumably the
+# including Makefile sets it to this directory's path -- confirm
+# against Make.rules.
+
+# Absolute paths of the libraries this program uses, resolved
+# relative to '$/'.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# One source file, one executable.
+$/.CXX_SRCS := $/lengthFilter.C
+$/.CXX_EXES := $/lengthFilter
+
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# The executable depends on its object file and the four static
+# libraries.
+$/lengthFilter: $/lengthFilter.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for objects and
+# dependency files built in this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/lengthFilter/lengthFilter.C b/atac-driver/lengthFilter/lengthFilter.C
new file mode 100644
index 0000000..a5de896
--- /dev/null
+++ b/atac-driver/lengthFilter/lengthFilter.C
@@ -0,0 +1,118 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "util++.H"
+
+// Filters out matches that are too short.
+//
+// Original implementation in Python by Clark Mobarry.
+
+// Copies the atac header -- every line before the first line that
+// starts with 'M' -- from 'in' to 'out', and settles the minimum
+// match length.
+//
+//   inLine     caller's line buffer; lines are read 1024 bytes at a
+//              time.  On return it holds the first 'M' line, unless
+//              the input ended before one was seen.
+//   in         stream to read.
+//   minLength  in: the command line value, or 0 if none was given.
+//              out: the value to filter with.  A non-zero value on
+//              entry wins; otherwise the value on the header's
+//              /globalMatchMinSize line is used.
+//   out        stream the header is copied to; may be null to
+//              suppress the copy.
+//
+// The /globalMatchMinSize line written to 'out' always carries the
+// final value, and one is appended if the header had none.  Exits
+// with an error if neither source supplied a length.
+//
+void
+readHeader(char *inLine, FILE *in, uint32 &minLength, FILE *out) {
+  static const char   key[]  = "/globalMatchMinSize";
+  const size_t        keyLen = sizeof(key) - 1;
+
+  bool  printedLength = false;
+
+  // Loop on the result of fgets(), not feof(): a read error never
+  // sets the end-of-file flag and would spin forever.
+  bool  haveLine = (fgets(inLine, 1024, in) != NULL);
+
+  while (haveLine && (inLine[0] != 'M')) {
+
+    if (strncmp(inLine, key, keyLen) == 0) {
+      if (minLength == 0) {
+        // No length on the command line; take it from the header.
+        // Skip any whitespace, the =, and more whitespace, starting
+        // just past the key.
+        char *tmp = inLine + keyLen;
+        while (isspace((unsigned char)*tmp)) tmp++;
+        while (*tmp == '=') tmp++;
+        while (isspace((unsigned char)*tmp)) tmp++;
+        minLength = strtouint32(tmp, 0L);
+      }
+
+      // Rewrite the line so the output records the value in effect.
+      snprintf(inLine, 1024, "/globalMatchMinSize=" uint32FMT "\n", minLength);
+      printedLength = true;
+    }
+
+    if (out)
+      fputs(inLine, out);
+
+    haveLine = (fgets(inLine, 1024, in) != NULL);
+  }
+
+  if (minLength == 0) {
+    fprintf(stderr, "I didn't find /globalMatchMinSize, please set it with -l\n");
+    exit(1);
+  }
+
+  // The header had no such line; add one to the copy.
+  if ((printedLength == false) && (out))
+    fprintf(out, "/globalMatchMinSize=" uint32FMT "\n", minLength);
+}
+
+
+// lengthFilter: copies an atac file from stdin to stdout, dropping
+// every match line ('M' lines) whose length on either axis (words 6
+// and 10 of the line) is below the minimum.  All other lines are
+// copied unchanged.  The minimum comes from -l, or failing that from
+// the header (see readHeader()).  A summary is written to stderr.
+//
+int
+main(int argc, char **argv) {
+  char    inLine[1024]      = {0};
+  uint32  minLength         = 0;
+  uint32  totalDumped       = 0;
+  uint32  totalDumpedLength = 0;
+  uint32  totalSaved        = 0;
+  uint32  totalSavedLength  = 0;
+
+  // -l needs a value; given as the last word it falls through to
+  // the usage message instead of reading argv[argc].
+  int arg = 1;
+  while (arg < argc) {
+    if ((strcmp(argv[arg], "-l") == 0) && (arg + 1 < argc)) {
+      minLength = strtouint32(argv[++arg], 0L);
+    } else {
+      fprintf(stderr, "usage: %s [-h] [-l length] < matches.atac > matches.atac\n", argv[0]);
+      fprintf(stderr, " filters out all matches less than 'length' long.\n");
+      exit(1);
+    }
+    arg++;
+  }
+
+  // Copies the header and settles minLength.
+  readHeader(inLine, stdin, minLength, stdout);
+
+  // readHeader() stops either on the first 'M' line, which is then
+  // waiting in inLine, or because the input ended.  After that, loop
+  // on the result of fgets() so a read error cannot spin forever and
+  // a final line without a newline is still processed.
+  bool haveLine = (inLine[0] == 'M');
+
+  while (haveLine) {
+    if (inLine[0] == 'M') {
+      splitToWords S(inLine);
+
+      if ((strtouint32(S[ 6], 0L) >= minLength) &&
+          (strtouint32(S[10], 0L) >= minLength)) {
+        totalSaved++;
+        totalSavedLength += strtouint32(S[ 6], 0L);
+        fputs(inLine, stdout);
+      } else {
+        totalDumped++;
+        totalDumpedLength += strtouint32(S[ 6], 0L);
+      }
+    } else {
+      fputs(inLine, stdout);
+    }
+
+    haveLine = (fgets(inLine, 1024, stdin) != NULL);
+  }
+
+  // With no matches at all the percentages would be 0/0.
+  uint32  totalLength = totalDumpedLength + totalSavedLength;
+  double  dumpedPct   = (totalLength > 0) ? ((double)totalDumpedLength / totalLength * 100.0) : 0.0;
+  double  savedPct    = (totalLength > 0) ? ((double)totalSavedLength  / totalLength * 100.0) : 0.0;
+
+  fprintf(stderr, "lengthFilter: Discarded " uint32FMTW(8) " matches with total length " uint32FMTW(10) ", %7.3f%% of the sequence in matches.\n",
+          totalDumped, totalDumpedLength, dumpedPct);
+  fprintf(stderr, "lengthFilter: Saved " uint32FMTW(8) " matches with total length " uint32FMTW(10) ", %7.3f%% of the sequence in matches.\n",
+          totalSaved, totalSavedLength, savedPct);
+
+  return(0);
+}
diff --git a/atac-driver/libatac/Make.include b/atac-driver/libatac/Make.include
new file mode 100644
index 0000000..2416f0e
--- /dev/null
+++ b/atac-driver/libatac/Make.include
@@ -0,0 +1,30 @@
+# -*- makefile -*-
+
+# Build description for the libatac static library.
+# NOTE(review): '$/' is not defined in this fragment; presumably the
+# including Makefile sets it to this directory's path -- confirm
+# against Make.rules.
+
+# Absolute paths of the libraries whose headers libatac includes,
+# resolved relative to '$/'.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+
+$/.CXX_INCS := $/atac.H \
+ $/atacFeature.H \
+ $/atacFeatureList.H \
+ $/atacMatch.H \
+ $/atacMatchList.H \
+ $/atacMatchOrder.H
+
+$/.CXX_SRCS := $/atacFeature.C \
+ $/atacFeatureList.C \
+ $/atacFile.C \
+ $/atacFileStreamMerge.C \
+ $/atacMatch.C \
+ $/atacMatchList.C \
+ $/atacMatchOrder.C
+
+$/.CXX_LIBS := $/libatac.a
+
+$/.CLEAN :=$/*.o $/*~
+
+# The library is built from one object per source file.
+$/libatac.a: ${$/.CXX_SRCS:.C=.o}
+
+# Include path for objects and dependency files built in this
+# directory; each library directory is listed once.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+
+
diff --git a/atac-driver/libatac/atac.H b/atac-driver/libatac/atac.H
new file mode 100644
index 0000000..f682e7f
--- /dev/null
+++ b/atac-driver/libatac/atac.H
@@ -0,0 +1,130 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_COMMON_H
+#define ATAC_COMMON_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+#include "seqCache.H"
+
+#include "atacMatch.H"
+#include "atacMatchList.H"
+#include "atacMatchOrder.H"
+
+#include "atacFeature.H"
+#include "atacFeatureList.H"
+
+#include "fasta-accessor.H"
+
+// Base class of the atac file classes declared in this header
+// (atacFileStream, atacFileStreamMerge and atacFile).  It holds the
+// two assembly file names, the two sequence labels, a string to
+// string parameter map and one seqCache pointer per assembly, and
+// declares the header reader and writer.
+//
+class atacFileBase {
+public:
+ atacFileBase();
+ virtual ~atacFileBase();
+
+ // Each accessor returns a pointer to storage owned by this object
+ // (the character arrays below), not a copy.
+ char *labelA(void) { return(_labelA); };
+ char *labelB(void) { return(_labelB); };
+
+ char *assemblyFileA(void) { return(_fileA); };
+ char *assemblyFileB(void) { return(_fileB); };
+
+ // Return the stored seqCache pointers as they are.
+ // NOTE(review): where these are set, and whether they may be
+ // null, is not visible in this header -- check the .C files.
+ seqCache *fastaA(void) { return(_seqA); };
+ seqCache *fastaB(void) { return(_seqB); };
+
+protected:
+ // Available to derived classes only.  'inLine' is a caller
+ // supplied line buffer and 'in' the stream to read.
+ void readHeader(char *inLine, FILE *in);
+public:
+ // Not virtual: a derived class that declares its own writeHeader()
+ // hides this one rather than overriding it.
+ void writeHeader(FILE *out);
+
+protected:
+ char _fileA[1024]; // The name of our genome files
+ char _fileB[1024];
+
+ char _labelA[256]; // The label of each of the sequences
+ char _labelB[256];
+
+ map<string,string> _params;
+
+ seqCache *_seqA;
+ seqCache *_seqB;
+};
+
+
+// An atac file read one record at a time from a single named file.
+//
+// nextMatch() and nextFeature() each take the wanted record type and
+// return a pointer.
+// NOTE(review): the class holds exactly one atacMatch and one
+// atacFeature; presumably the returned pointers refer to those
+// members and are overwritten by the next call -- confirm in the
+// implementation before keeping a pointer across calls.
+//
+class atacFileStream : public atacFileBase {
+public:
+ atacFileStream(char const *filename);
+ ~atacFileStream();
+
+ atacMatch *nextMatch(char type);
+ atacFeature *nextFeature(char type[4]);
+
+private:
+ FILE *_inFile; // the file being read
+ char _inLine[1024]; // line buffer
+
+ uint32 _theMatchIID;
+ uint32 _theFeatureIID;
+
+ atacMatch _theMatch;
+ atacFeature _theFeature;
+};
+
+
+class afsm;
+// Presents several atac files, added one at a time with addFile(),
+// through the same nextMatch() / nextFeature() interface as
+// atacFileStream.
+// NOTE(review): the order in which records from different files are
+// returned is not visible in this header -- see
+// atacFileStreamMerge.C.
+//
+// This class declares its own writeHeader().  The base class version
+// is not virtual, so which one runs depends on the static type used
+// for the call.
+//
+class atacFileStreamMerge : public atacFileBase {
+public:
+ atacFileStreamMerge(void);
+ ~atacFileStreamMerge();
+
+ void addFile(char const *filename);
+
+ atacMatch *nextMatch(char type);
+ atacFeature *nextFeature(char type[4]);
+
+ void writeHeader(FILE *out);
+
+private:
+ // An array of 'afsm' (only forward declared in this header) with
+ // its current and allocated sizes.
+ uint32 _filesLen;
+ uint32 _filesMax;
+ afsm *_files;
+
+ uint32 _theMatchIID;
+ uint32 _theFeatureIID;
+};
+
+
+// An atac file held in memory as three match lists: matches, runs
+// and clumps.
+// NOTE(review): presumably the constructor reads 'filename' and
+// fills the lists (callers in this package pass "-" to read stdin)
+// -- confirm in atacFile.C.
+//
+class atacFile : public atacFileBase {
+public:
+ atacFile(char const *filename);
+ ~atacFile();
+
+ // Each accessor returns the address of the corresponding member
+ // list; the lists stay owned by this object.
+ atacMatchList *matches(void) { return(&_matches); };
+ atacMatchList *runs(void) { return(&_runs); };
+ atacMatchList *clumps(void) { return(&_clumps); };
+
+private:
+ atacMatchList _matches;
+ atacMatchList _runs;
+ atacMatchList _clumps;
+};
+
+#endif // ATAC_COMMON_H
diff --git a/atac-driver/libatac/atacFeature.C b/atac-driver/libatac/atacFeature.C
new file mode 100644
index 0000000..837b3e0
--- /dev/null
+++ b/atac-driver/libatac/atacFeature.C
@@ -0,0 +1,123 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005, 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Splits an ATAC sequence name of the form "LABEL:IID".
+//
+//  atac  -- the name to decode.
+//  label -- if not null, receives a copy of the text before the ':'.
+//
+// Returns the number after the ':', or all bits set if the name has
+// no ':'.
+//
+static
+uint32
+decodeAtacName(char *atac,
+               char *label) {
+
+  //  Advance to the separator (or the end of the string), saving the
+  //  label along the way when the caller asked for it.
+  for (; (*atac != 0) && (*atac != ':'); atac++)
+    if (label != 0L)
+      *label++ = *atac;
+
+  if (label != 0L)
+    *label = 0;
+
+  //  No separator, so there is no IID to decode.
+  if (*atac == 0)
+    return(~uint32ZERO);
+
+  return(strtouint32(atac + 1, 0L));
+}
+
+
+// Builds a feature from one 'F' line of an ATAC file; see decode().
+//
+atacFeature::atacFeature(char *line) {
+  this->decode(line);
+}
+
+
+// Builds a feature from its individual fields.
+//
+//  fuid, puid -- external ids of the feature and of its parent; only
+//                the first 15 characters of each are kept.
+//  fiid       -- internal id of the feature.
+//  t          -- feature type; only the first 4 characters are kept.
+//  i, p, l    -- sequence iid, position and length.
+//
+atacFeature::atacFeature(char *fuid,
+                         char *puid,
+                         uint32 fiid,
+                         char *t,
+                         uint32 i, uint32 p, uint32 l) {
+
+  //  Bounded copies: an over-long id is truncated instead of being
+  //  written past the end of the 16 byte arrays.
+  strncpy(featureuid, fuid, sizeof(featureuid));
+  strncpy(parentuid,  puid, sizeof(parentuid));
+
+  featureuid[sizeof(featureuid) - 1] = 0;
+  parentuid[sizeof(parentuid) - 1]   = 0;
+
+  featureiid = fiid;
+
+  //  strncpy() zero-fills what a short type leaves unused and never
+  //  writes more than the four bytes of 'type'.  A four character type
+  //  therefore fills the array and is NOT zero terminated.
+  strncpy(type, t, sizeof(type));
+
+  iid = i;
+  pos = p;
+  len = l;
+}
+
+
+// Fills this feature from one 'F' line of an ATAC file:
+//
+//   F type featureuid parentuid LABEL:IID POS LEN
+//
+// The feature and parent ids are truncated to 15 characters and the
+// type to 4 characters.  featureiid is reset to zero.
+//
+void
+atacFeature::decode(char *line) {
+
+  splitToWords  W(line);
+
+  //  Bounded copies: an over-long word is truncated instead of being
+  //  written past the end of the 16 byte arrays.
+  strncpy(featureuid, W[2], sizeof(featureuid));
+  strncpy(parentuid,  W[3], sizeof(parentuid));
+
+  featureuid[sizeof(featureuid) - 1] = 0;
+  parentuid[sizeof(parentuid) - 1]   = 0;
+
+  featureiid = 0;
+
+  //  strncpy() zero-fills what a short type leaves unused and never
+  //  writes more than the four bytes of 'type'.  A four character type
+  //  (e.g., "exon") fills the array and is NOT zero terminated.
+  strncpy(type, W[1], sizeof(type));
+
+  iid = decodeAtacName(W[4], 0L);
+  pos = strtouint32(W[5], 0L);
+  len = strtouint32(W[6], 0L);
+}
+
+
+// Checks that this feature lies within the sequence it refers to.
+//
+//  A      -- the sequences the feature refers to; if null nothing is
+//            checked and the feature is accepted.
+//  inLine -- the line the feature was decoded from, used only for the
+//            error message (it is chomp()ed when a message is printed).
+//
+// Returns true if the feature is acceptable.  Problems are reported on
+// stderr.
+//
+bool
+atacFeature::sanity(seqCache *A, char *inLine) {
+
+  if (A == 0L)
+    return(true);
+
+  //  Validate the iid before using it to look up a sequence length.
+  if (iid >= A->getNumberOfSequences()) {
+    chomp(inLine);
+    fprintf(stderr, "Feature references invalid sequence iid: %s\n", inLine);
+    return(false);
+  }
+
+  uint32  seqLen = A->getSequenceLength(iid);
+
+  //  'len > seqLen - pos' is 'pos + len > seqLen' written so that the
+  //  sum cannot wrap around; pos <= seqLen is established first.
+  if ((pos > seqLen) || (len > seqLen - pos)) {
+    chomp(inLine);
+    fprintf(stderr, "Feature longer than sequence (by "uint32FMT"bp): seqLen="uint32FMTW(8)" %s\n",
+            pos + len - seqLen,
+            seqLen, inLine);
+    return(false);
+  }
+
+  return(true);
+}
diff --git a/atac-driver/libatac/atacFeature.H b/atac-driver/libatac/atacFeature.H
new file mode 100644
index 0000000..10a7899
--- /dev/null
+++ b/atac-driver/libatac/atacFeature.H
@@ -0,0 +1,78 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_FEATURE_H
+#define ATAC_FEATURE_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+#include "seqCache.H"
+
+// A barebones feature.
+//
+// F type featureuid parentuid LABEL:IID POS LEN
+//
+// The idea is to mark some region as containing some feature. You
+// can use the featureiid to map to a chunk of non-atac data, e.g., a
+// structure containing information about the feature.
+//
+// An EST feature could be represented as:
+// F est est4 . B35:3 423551 10421
+// F exon exon7 est4 B35:3 423551 346
+// F exon exon8 est4 B35:3 425931 146
+// F exon exon9 est4 B35:3 433426 546
+//
+// There is nothing there that immediately links these atac features
+// to their respective est/exon data structures. This is a Good
+// Thing (tm) because usually we don't have any form of ID with those
+// data structures, so we'd be using an offset or something, which
+// would change if the features are filtered.
+//
+// One 'F' record of an ATAC file: a typed, named region of a sequence.
+//
+class atacFeature {
+public:
+  //  An empty feature: empty ids and type, everything else zero.
+  atacFeature() {
+    featureuid[0] = 0;
+    parentuid[0]  = 0;
+    featureiid    = 0;
+    type[0] = type[1] = type[2] = type[3] = 0;
+    iid = pos = len = 0;
+  };
+  atacFeature(char *line);
+  atacFeature(char *fuid,
+              char *puid,
+              uint32 fiid,
+              char *t,
+              uint32 i, uint32 p, uint32 l);
+
+  //  Fills this feature from one 'F' line.
+  void decode(char *line);
+
+  //  Checks the feature against the sequences in A; true if acceptable.
+  bool sanity(seqCache *A, char *inLine);
+
+  char   featureuid[16];   //  external id
+  char   parentuid[16];    //  external parent id
+  uint32 featureiid;       //  internal id
+  char   type[4];          //  up to four characters, zero padded
+  uint32 iid, pos, len;
+
+  //  Writes the feature as an 'F' line.  The type is printed with an
+  //  explicit four character limit because a four character type fills
+  //  the array and has no terminating zero.
+  void print(FILE *f, char const *label) const {
+    fprintf(f, "F %.4s %s %s %s:"uint32FMT" "uint32FMT" "uint32FMT"\n",
+            type,
+            featureuid, parentuid,
+            label, iid, pos, len);
+  };
+};
+
+#endif // ATAC_FEATURE_H
diff --git a/atac-driver/libatac/atacFeatureList.C b/atac-driver/libatac/atacFeatureList.C
new file mode 100644
index 0000000..afa925f
--- /dev/null
+++ b/atac-driver/libatac/atacFeatureList.C
@@ -0,0 +1,119 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Starts out empty, with room for 256 features.
+//
+atacFeatureList::atacFeatureList() {
+  _featuresMax = 256;
+  _features    = new atacFeature [_featuresMax];
+  _featuresLen = 0;
+}
+
+// Releases the feature array.
+//
+atacFeatureList::~atacFeatureList() {
+  delete [] _features;
+}
+
+// Appends a copy of 'm' to the list.  The featureiid of the stored copy
+// is set to its index in the list; 'm' itself is not modified.
+//
+void
+atacFeatureList::add(atacFeature &m) {
+
+  //  Out of room: quadruple the capacity and move the features over.
+  if (_featuresLen >= _featuresMax) {
+    _featuresMax <<= 2;
+    atacFeature *A = new atacFeature [_featuresMax];
+    memcpy(A, _features, sizeof(atacFeature) * _featuresLen);
+    delete [] _features;
+    _features = A;
+  }
+
+  memcpy(&_features[_featuresLen], &m, sizeof(atacFeature));
+
+  //  Assign the id, then advance the count, as two statements.  Doing
+  //  both in one expression leaves the order of the index read and the
+  //  increment to the compiler, which can store the id in the wrong
+  //  (not yet used) slot.
+  _features[_featuresLen].featureiid = _featuresLen;
+  _featuresLen++;
+}
+
+
+
+// qsort() comparison: ascending sequence iid, then ascending position,
+// then descending length.
+//
+static
+int
+sort_(const void *a, const void *b) {
+  const atacFeature *lhs = (const atacFeature *)a;
+  const atacFeature *rhs = (const atacFeature *)b;
+
+  if (lhs->iid != rhs->iid)
+    return((lhs->iid < rhs->iid) ? -1 : 1);
+
+  if (lhs->pos != rhs->pos)
+    return((lhs->pos < rhs->pos) ? -1 : 1);
+
+  //  Same start: the longer feature goes first.
+  if (lhs->len != rhs->len)
+    return((lhs->len > rhs->len) ? -1 : 1);
+
+  return(0);
+}
+
+// qsort() comparison: by feature uid, ties broken by parent uid.
+//
+static
+int
+sortfeatureuid_(const void *a, const void *b) {
+  const atacFeature *lhs = (const atacFeature *)a;
+  const atacFeature *rhs = (const atacFeature *)b;
+
+  int order = strcmp(lhs->featureuid, rhs->featureuid);
+
+  if (order == 0)
+    order = strcmp(lhs->parentuid, rhs->parentuid);
+
+  //  Normalize whatever strcmp() returned to -1, 0 or 1.
+  return((order < 0) ? -1 : ((order > 0) ? 1 : 0));
+}
+
+// qsort() comparison: by parent uid, ties broken by feature uid.
+//
+static
+int
+sortparentuid_(const void *a, const void *b) {
+  const atacFeature *lhs = (const atacFeature *)a;
+  const atacFeature *rhs = (const atacFeature *)b;
+
+  int order = strcmp(lhs->parentuid, rhs->parentuid);
+
+  if (order == 0)
+    order = strcmp(lhs->featureuid, rhs->featureuid);
+
+  //  Normalize whatever strcmp() returned to -1, 0 or 1.
+  return((order < 0) ? -1 : ((order > 0) ? 1 : 0));
+}
+
+
+// Sorts 'len' features starting at index 'first' by sequence iid, then
+// position, then longest first.  A 'len' of zero, or one reaching past
+// the last feature, means "through the end of the list".
+//
+void
+atacFeatureList::sort(uint32 first, uint32 len) {
+
+  //  Nothing to sort at or beyond the end of the list.
+  if (first >= _featuresLen)
+    return;
+
+  //  Count from 'first', not from zero, so that a non-zero 'first'
+  //  never makes qsort() touch entries past the last feature.
+  if ((len == 0) || (len > _featuresLen - first))
+    len = _featuresLen - first;
+
+  qsort(_features + first, len, sizeof(atacFeature), sort_);
+}
+
+// Sorts 'len' features starting at index 'first' by feature uid, then
+// parent uid.  A 'len' of zero, or one reaching past the last feature,
+// means "through the end of the list".
+//
+void
+atacFeatureList::sortFeatureUID(uint32 first, uint32 len) {
+
+  //  Nothing to sort at or beyond the end of the list.
+  if (first >= _featuresLen)
+    return;
+
+  //  Count from 'first', not from zero, so that a non-zero 'first'
+  //  never makes qsort() touch entries past the last feature.
+  if ((len == 0) || (len > _featuresLen - first))
+    len = _featuresLen - first;
+
+  qsort(_features + first, len, sizeof(atacFeature), sortfeatureuid_);
+}
+
+// Sorts 'len' features starting at index 'first' by parent uid, then
+// feature uid.  A 'len' of zero, or one reaching past the last feature,
+// means "through the end of the list".
+//
+void
+atacFeatureList::sortParentUID(uint32 first, uint32 len) {
+
+  //  Nothing to sort at or beyond the end of the list.
+  if (first >= _featuresLen)
+    return;
+
+  //  Count from 'first', not from zero, so that a non-zero 'first'
+  //  never makes qsort() touch entries past the last feature.
+  if ((len == 0) || (len > _featuresLen - first))
+    len = _featuresLen - first;
+
+  qsort(_features + first, len, sizeof(atacFeature), sortparentuid_);
+}
diff --git a/atac-driver/libatac/atacFeatureList.H b/atac-driver/libatac/atacFeatureList.H
new file mode 100644
index 0000000..e2e8372
--- /dev/null
+++ b/atac-driver/libatac/atacFeatureList.H
@@ -0,0 +1,52 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_FEATURELIST_H
+#define ATAC_FEATURELIST_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+
+// A growable array of atacFeature.
+//
+class atacFeatureList {
+public:
+ atacFeatureList();
+ ~atacFeatureList();
+
+ uint32 numberOfFeatures(void) { return(_featuresLen); };
+
+ // Pointer to the i'th feature; the index is not range checked.
+ atacFeature *getFeature(uint32 i) { return(_features + i); };
+ atacFeature *operator[](uint32 i) { return(_features + i); };
+
+ // Appends a copy of m.
+ void add(atacFeature &m);
+ //void delete();
+
+ // Sort 'len' features starting at 'first'. sort() orders by sequence
+ // iid, position, then longest first; the other two order by the
+ // named uid, with the remaining uid as the tie breaker. A len of
+ // zero is replaced by the total number of features.
+ void sort(uint32 first=0, uint32 len=0);
+ void sortFeatureUID(uint32 first=0, uint32 len=0);
+ void sortParentUID(uint32 first=0, uint32 len=0);
+
+private:
+ uint32 _featuresLen; // Number of features stored
+ uint32 _featuresMax; // Allocated size of _features
+ atacFeature *_features;
+};
+
+#endif // ATAC_FEATURELIST_H
diff --git a/atac-driver/libatac/atacFile.C b/atac-driver/libatac/atacFile.C
new file mode 100644
index 0000000..5aeecbd
--- /dev/null
+++ b/atac-driver/libatac/atacFile.C
@@ -0,0 +1,300 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005, 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// True if the line belongs to the header of an ATAC file, that is, if
+// its first character is '!', '#' or '/'.
+//
+static
+bool
+isHeader(char *inLine) {
+  switch (inLine[0]) {
+    case '!':
+    case '#':
+    case '/':
+      return(true);
+    default:
+      return(false);
+  }
+}
+
+
+// Opens an ATAC file for record-at-a-time reading and consumes its
+// header, leaving the first record in _inLine.
+//
+//  filename -- the file to read; "-" reads stdin.  If null, nothing is
+//              opened and _inFile is left null.
+//
+// Exits the program if the file cannot be opened.
+//
+atacFileStream::atacFileStream(char const *filename) {
+
+  //  Give every member a defined value before the early return below,
+  //  so that even an object built without a file can be destroyed.
+  _inFile        = 0L;
+  _inLine[0]     = 0;
+  _theMatchIID   = 0;
+  _theFeatureIID = 0;
+
+  if (filename == 0L)
+    return;
+
+  if (strcmp(filename, "-") == 0) {
+    _inFile = stdin;
+  } else {
+    errno   = 0;
+    _inFile = fopen(filename, "r");
+
+    //  Failure is signalled by the return value; errno only says why.
+    if (_inFile == 0L)
+      fprintf(stderr, "atacFileStream::atacFileStream()-- failed to open %s: %s\n", filename, strerror(errno)), exit(1);
+  }
+
+  readHeader(_inLine, _inFile);
+}
+
+// Closes the input, unless it is stdin or was never opened.
+//
+atacFileStream::~atacFileStream() {
+  if ((_inFile != 0L) && (_inFile != stdin))
+    fclose(_inFile);
+};
+
+
+// Returns the next match whose type starts with 'type', or null when
+// the input is exhausted.  Records of other kinds are skipped.
+//
+// The returned match is stored inside this object and is overwritten
+// by the next call.  Its matchiid counts the matches returned so far.
+//
+atacMatch*
+atacFileStream::nextMatch(char type) {
+  atacMatch *ret = 0L;
+
+  while ((ret == 0L) && !feof(_inFile)) {
+
+    if (_inLine[0] == 'M') {
+      _theMatch.decode(_inLine);
+
+      if (_theMatch.type[0] == type) {
+        _theMatch.matchiid = _theMatchIID++;
+        ret = &_theMatch;
+      }
+    }
+
+    //  Read ahead.  If that fails, whether at end of file or on a read
+    //  error, discard the line just processed so that it can never be
+    //  seen a second time, and stop.
+    if (fgets(_inLine, 1024, _inFile) == 0L) {
+      _inLine[0] = 0;
+      break;
+    }
+  }
+
+  return(ret);
+}
+
+
+// Returns the next feature whose type matches 'type', or null when the
+// input is exhausted.  Records of other kinds are skipped.
+//
+// Types are compared character by character; a zero on either side
+// matches anything at that position.
+//
+// The returned feature is stored inside this object and is overwritten
+// by the next call.  Its featureiid counts the features returned so far.
+//
+atacFeature*
+atacFileStream::nextFeature(char type[4]) {
+  atacFeature *ret = 0L;
+
+  while ((ret == 0L) && !feof(_inFile)) {
+
+    if (_inLine[0] == 'F') {
+      _theFeature.decode(_inLine);
+
+      //  Return the feature if it is the correct type.  0 is the
+      //  wildcard.
+      //
+      bool  wanted = true;
+
+      for (uint32 k=0; k<4; k++)
+        if ((_theFeature.type[k] != 0) &&
+            (type[k] != 0) &&
+            (_theFeature.type[k] != type[k]))
+          wanted = false;
+
+      if (wanted) {
+        _theFeature.featureiid = _theFeatureIID++;
+        ret = &_theFeature;
+      }
+    }
+
+    //  Read ahead.  If that fails, whether at end of file or on a read
+    //  error, discard the line just processed so that it can never be
+    //  seen a second time, and stop.
+    if (fgets(_inLine, 1024, _inFile) == 0L) {
+      _inLine[0] = 0;
+      break;
+    }
+  }
+
+  return(ret);
+}
+
+
+
+
+// Loads a whole ATAC file.
+//
+//  filename -- the file to read; "-" reads stdin.  If null, nothing is
+//              loaded.
+//
+// Match records that pass their sanity check are stored by type: 'u'
+// and 'x' in matches(), 'r' in runs(), 'c' in clumps().  Anything else
+// is reported on stderr and skipped.  Exits the program if the file
+// cannot be opened.
+//
+atacFile::atacFile(char const *filename) {
+
+  if (filename == 0L)
+    return;
+
+  FILE *inFile = stdin;
+  char  inLine[1024];
+
+  inLine[0] = 0;
+
+  if (strcmp(filename, "-") != 0) {
+    errno  = 0;
+    inFile = fopen(filename, "r");
+
+    //  Failure is signalled by the return value; errno only says why.
+    if (inFile == 0L)
+      fprintf(stderr, "atacFile::atacFile()-- failed to load %s: %s\n", filename, strerror(errno)), exit(1);
+  }
+
+  //  Read the preamble, look for our data sources.  This leaves us with
+  //  the first match in the inLine, and fills in fileA and fileB.
+  //
+  readHeader(inLine, inFile);
+
+  while (!feof(inFile)) {
+    switch(inLine[0]) {
+      case 'M':
+        {
+          atacMatch m(inLine);
+
+          if (m.sanity(fastaA(), fastaB(), inLine)) {
+            if ((m.type[0] == 'u') ||
+                (m.type[0] == 'x')) {
+              _matches.add(m);
+            } else if (m.type[0] == 'r') {
+              _runs.add(m);
+            } else if (m.type[0] == 'c') {
+              _clumps.add(m);
+            } else {
+              chomp(inLine);
+              fprintf(stderr, "atacFile::atacFile()-- Unknown match record type '%c' -- '%s'.\n", m.type[0], inLine);
+            }
+          }
+        }
+        break;
+
+      case 'F':
+        {
+          chomp(inLine);
+          fprintf(stderr, "atacFile::atacFile()-- Unknown feature record -- '%s'.\n", inLine);
+        }
+        break;
+
+      default:
+        {
+          chomp(inLine);
+          fprintf(stderr, "atacFile::atacFile()-- Unknown record -- '%s'.\n", inLine);
+        }
+        break;
+    }
+
+    //  Stop on any failed read, so that a read error cannot make us
+    //  process the same line forever.
+    if (fgets(inLine, 1024, inFile) == 0L)
+      break;
+  }
+
+  //  Everything is in memory now; release the file.
+  if (inFile != stdin)
+    fclose(inFile);
+}
+
+atacFile::~atacFile() {
+}
+
+
+
+
+
+
+
+
+// Starts with empty file names and labels, and no sequences.
+//
+atacFileBase::atacFileBase() {
+  _fileA[0]  = _fileB[0]  = 0;
+  _labelA[0] = _labelB[0] = 0;
+
+  _seqA = _seqB = 0L;
+}
+
+
+// Releases both sequence caches.
+//
+atacFileBase::~atacFileBase() {
+  delete _seqA;
+  delete _seqB;
+}
+
+
+// Reads the header of an ATAC file.
+//
+//  inLine -- the caller's line buffer, at least 1024 bytes.  On return
+//            it holds the first line that is not part of the header,
+//            if there is one.
+//  in     -- the stream to read.
+//
+// Header lines start with '!', '#' or '/'.  Lines of the form
+// "/key=value" are saved in _params; the assembly file names and
+// labels are also copied to their own members.  Afterwards a seqCache
+// is opened for each assembly file that exists.
+//
+void
+atacFileBase::readHeader(char *inLine, FILE *in) {
+
+  //  Remember whether the read worked, so that a failed read never
+  //  leads to parsing a stale or uninitialized buffer.
+  bool  lineRead = (fgets(inLine, 1024, in) != 0L);
+
+  while (lineRead && !feof(in) && isHeader(inLine)) {
+    chomp(inLine);
+
+    if (inLine[0] == '/') {
+      char *key = inLine + 1;
+
+      while (isspace((unsigned char)*key))  key++;  //  Skip whitespace between "/" and the key
+
+      char *val = key;
+
+      while (*val && (*val != '='))  val++;         //  Move to the "=", but never past the end of the line
+
+      //  A "/" line with no "=" carries no parameter; it is skipped
+      //  like a comment.
+      if (*val == '=') {
+        *val++ = 0;                                   //  Terminate the key
+        while (isspace((unsigned char)*val))  val++;  //  Skip whitespace between "=" and the val
+
+        chomp(key);
+        chomp(val);
+
+        //fprintf(stderr, "key='%s' val='%s'\n", key, val);
+
+        string K = key;
+        string V = val;
+        _params[K] = V;
+
+        //  Save ones we use.  The copies are bounded: a line can be
+        //  far longer than the 256 byte label buffers.
+
+        if (strncmp(key, "assemblyFile1", 14) == 0) {
+          strncpy(_fileA, val, sizeof(_fileA) - 1);
+          _fileA[sizeof(_fileA) - 1] = 0;
+        }
+
+        if (strncmp(key, "assemblyFile2", 14) == 0) {
+          strncpy(_fileB, val, sizeof(_fileB) - 1);
+          _fileB[sizeof(_fileB) - 1] = 0;
+        }
+
+        if (strncmp(key, "assemblyId1", 12) == 0) {
+          strncpy(_labelA, val, sizeof(_labelA) - 1);
+          _labelA[sizeof(_labelA) - 1] = 0;
+        }
+
+        if (strncmp(key, "assemblyId2", 12) == 0) {
+          strncpy(_labelB, val, sizeof(_labelB) - 1);
+          _labelB[sizeof(_labelB) - 1] = 0;
+        }
+      }
+    }
+
+    //  Otherwise, it's a comment or the header
+
+    lineRead = (fgets(inLine, 1024, in) != 0L);
+  }
+
+  //fprintf(stderr, "assemblyFile1 = '%s'\n", _fileA);
+  //fprintf(stderr, "assemblyFile2 = '%s'\n", _fileB);
+  //fprintf(stderr, "assemblyId1   = '%s'\n", _labelA);
+  //fprintf(stderr, "assemblyId2   = '%s'\n", _labelB);
+
+  //  Open some seqCache for each of the files
+  //
+  if (_fileA[0]) {
+    if (fileExists(_fileA)) {
+      _seqA = new seqCache(_fileA);
+    } else {
+      fprintf(stderr, "atacFile::readHeader()-- can't find '%s', no sequence read.\n", _fileA);
+    }
+  }
+  if (_fileB[0]) {
+    //  Test the B file itself, not the A file.
+    if (fileExists(_fileB)) {
+      _seqB = new seqCache(_fileB);
+    } else {
+      fprintf(stderr, "atacFile::readHeader()-- can't find '%s', no sequence read.\n", _fileB);
+    }
+  }
+}
+
+
+
+// Writes an ATAC header: the format line, the legend describing the
+// fields of a match record, and one "/key=value" line for every
+// parameter saved from the input header.
+//
+//  out -- where to write; stdout if null.
+//
+void
+atacFileBase::writeHeader(FILE *out) {
+
+  static const char *preamble[] = {
+    "!format atac 1.0\n",
+    "#\n",
+    "# Legend:\n",
+    "#\n",
+    "# Field 0: the row class\n",
+    "# Field 1: the match type u=ungapped, x=exact, ....\n",
+    "# Field 2: the match instance index\n",
+    "# Field 3: the parent index\n",
+    "# Field 4: the FASTA sequence id in the first assembly\n",
+    "# Field 5: the offset from the start of the sequence for the match\n",
+    "# Field 6: the length of the match in the first assembly\n",
+    "# Field 7: the orientation of the match sequence in the first assembly.\n",
+    "# Field 8: the FASTA sequence id for the second assembly\n",
+    "# Field 9: the offset from the start of the sequence for the match\n",
+    "# Field 10: the length of the match in the second assembly\n",
+    "# Field 11: the orientation of the match sequence in the second assembly.\n",
+    "#\n"
+  };
+
+  FILE *dst = (out == 0L) ? stdout : out;
+
+  for (size_t i=0; i < sizeof(preamble) / sizeof(preamble[0]); i++)
+    fputs(preamble[i], dst);
+
+  for (map<string,string>::iterator p=_params.begin(); p != _params.end(); ++p)
+    fprintf(dst, "/%s=%s\n", p->first.c_str(), p->second.c_str());
+}
diff --git a/atac-driver/libatac/atacFileStreamMerge.C b/atac-driver/libatac/atacFileStreamMerge.C
new file mode 100644
index 0000000..663f3f4
--- /dev/null
+++ b/atac-driver/libatac/atacFileStreamMerge.C
@@ -0,0 +1,172 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2007 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// One input of an atacFileStreamMerge: the stream, the record read from
+// it that has not been returned yet, and whether it has run dry.
+//
+class afsm {
+public:
+  atacFileStream *_theFile;
+  atacMatch      *_theMatch;
+  atacFeature    *_theFeature;
+
+  bool            _endOfFile;
+
+  afsm() : _theFile(0L),
+           _theMatch(0L),
+           _theFeature(0L),
+           _endOfFile(false) {
+  };
+
+  ~afsm() {
+  };
+};
+
+
+
+// Starts with no inputs, and room for four.
+//
+atacFileStreamMerge::atacFileStreamMerge(void) {
+  _filesMax      = 4;
+  _files         = new afsm [_filesMax];
+  _filesLen      = 0;
+
+  _theMatchIID   = 0;
+  _theFeatureIID = 0;
+}
+
+// Closes every input that was added.
+//
+atacFileStreamMerge::~atacFileStreamMerge(void) {
+  uint32 f = 0;
+
+  while (f < _filesLen) {
+    delete _files[f]._theFile;
+    f++;
+  }
+
+  delete [] _files;
+
+  //  Our sequence pointers were borrowed from the first input, which
+  //  has just been deleted.  Forget them, or atacFileBase would delete
+  //  them a second time when it destructs.
+  //
+  _seqA = 0L;
+  _seqB = 0L;
+}
+
+
+
+// Writes the header of the first input to 'out'.  Writes nothing if no
+// input has been added yet.
+//
+void
+atacFileStreamMerge::writeHeader(FILE *out) {
+  atacFileStream *first = _files[0]._theFile;
+
+  if (first == 0L)
+    return;
+
+  first->writeHeader(out);
+}
+
+
+// Adds another input to the merge.
+//
+//  filename -- the ATAC file to open; ignored if null.
+//
+// The first input added also supplies this object's assembly file
+// names, labels and sequence caches.  The sequence caches stay owned by
+// that input.
+//
+void
+atacFileStreamMerge::addFile(char const *filename) {
+
+  if (filename == 0L)
+    return;
+
+  //  Out of room: double the capacity.  The entries are copied one at
+  //  a time by assignment; afsm has a user-written destructor, so it is
+  //  not a type that may be copied with memcpy().
+  if (_filesLen >= _filesMax) {
+    _filesMax *= 2;
+    afsm *F = new afsm [_filesMax];
+    for (uint32 i=0; i<_filesLen; i++)
+      F[i] = _files[i];
+    delete [] _files;
+    _files = F;
+  }
+
+  _files[_filesLen]._theFile = new atacFileStream(filename);
+
+  //  Duplicate a bunch of stuff to our file.
+  //
+  if (_filesLen == 0) {
+    strcpy(_fileA, _files[_filesLen]._theFile->assemblyFileA());
+    strcpy(_fileB, _files[_filesLen]._theFile->assemblyFileB());
+
+    strcpy(_labelA, _files[_filesLen]._theFile->labelA());
+    strcpy(_labelB, _files[_filesLen]._theFile->labelB());
+
+    //_params = _files[_filesLen]._theFile->_params;
+
+    _seqA = _files[_filesLen]._theFile->fastaA();
+    _seqB = _files[_filesLen]._theFile->fastaB();
+  }
+
+  _filesLen++;
+}
+
+
+
+// Returns the smallest pending match over all inputs, ordered by iid1
+// and then iid2, or null when every input is exhausted.  When two
+// inputs tie, the one added later wins.
+//
+// The returned match lives inside the input it came from and is
+// overwritten the next time that input is read.
+//
+atacMatch*
+atacFileStreamMerge::nextMatch(char type) {
+
+  //  Make sure every input that has not run dry holds a pending match.
+  //
+  for (uint32 f=0; f<_filesLen; f++) {
+    afsm *src = _files + f;
+
+    if (src->_endOfFile)
+      continue;
+
+    if (src->_theMatch == 0L)
+      src->_theMatch = src->_theFile->nextMatch(type);
+
+    if (src->_theMatch == 0L)
+      src->_endOfFile = true;
+  }
+
+  //  Pick the smallest, starting from whatever the first input holds.
+  //
+  //  need to set matchIID
+  //  should probably also make a new match UID, or better, fix seatac to make UIDs
+  //
+  atacMatch *best    = _files[0]._theMatch;
+  uint32     bestIdx = 0;
+
+  for (uint32 f=1; f<_filesLen; f++) {
+    atacMatch *cand = _files[f]._theMatch;
+
+    if (cand == 0L)
+      continue;
+
+    bool  better = ((best == 0L) ||
+                    (cand->iid1 < best->iid1) ||
+                    ((cand->iid1 == best->iid1) && (cand->iid2 <= best->iid2)));
+
+    if (better) {
+      best    = cand;
+      bestIdx = f;
+    }
+  }
+
+  //  Mark it as used, so that input is read again on the next call.
+  //
+  _files[bestIdx]._theMatch = 0L;
+
+  return(best);
+}
+
+
+// Merging of features is not implemented: this always returns null
+// and does not look at 'type' or at any input.
+//
+atacFeature*
+atacFileStreamMerge::nextFeature(char type[4]) {
+ return(0L);
+}
diff --git a/atac-driver/libatac/atacMatch.C b/atac-driver/libatac/atacMatch.C
new file mode 100644
index 0000000..83677c1
--- /dev/null
+++ b/atac-driver/libatac/atacMatch.C
@@ -0,0 +1,154 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "atac.H"
+
+// Splits an ATAC sequence name of the form "LABEL:IID".
+//
+//  atac  -- the name to decode.
+//  label -- if not null, receives a copy of the text before the ':'.
+//
+// Returns the number after the ':', or all bits set if the name has
+// no ':'.
+//
+static
+uint32
+decodeAtacName(char *atac,
+               char *label) {
+
+  //  Advance to the separator (or the end of the string), saving the
+  //  label along the way when the caller asked for it.
+  for (; (*atac != 0) && (*atac != ':'); atac++)
+    if (label != 0L)
+      *label++ = *atac;
+
+  if (label != 0L)
+    *label = 0;
+
+  //  No separator, so there is no IID to decode.
+  if (*atac == 0)
+    return(~uint32ZERO);
+
+  return(strtouint32(atac + 1, 0L));
+}
+
+
+// Builds a match from one 'M' line of an ATAC file; see decode().
+//
+atacMatch::atacMatch(char *line) {
+  this->decode(line);
+}
+
+// Builds a match from its individual fields.
+//
+//  muid, puid     -- external ids of the match and of its parent; only
+//                    the first 15 characters of each are kept.
+//  miid           -- internal id of the match.
+//  t              -- match type; only the first 3 characters are kept.
+//  i1, p1, l1, f1 -- sequence iid, position, length and orientation in
+//                    the first assembly.
+//  i2, p2, l2, f2 -- the same for the second assembly.
+//
+atacMatch::atacMatch(char *muid,
+                     char *puid,
+                     uint32 miid,
+                     char *t,
+                     uint32 i1, uint32 p1, uint32 l1, uint32 f1,
+                     uint32 i2, uint32 p2, uint32 l2, uint32 f2) {
+
+  strncpy(matchuid,  muid, 16);
+  strncpy(parentuid, puid, 16);
+
+  matchuid[15]  = 0;
+  parentuid[15] = 0;
+
+  matchiid = miid;
+
+  type[0] = 0;
+  type[1] = 0;
+  type[2] = 0;
+  type[3] = 0;
+
+  //  Copy up to three characters, stopping at the end of 't' so that an
+  //  empty or one character type is never read past its terminator.
+  for (uint32 k=0; (k < 3) && (t[k] != 0); k++)
+    type[k] = t[k];
+
+  iid1 = i1;
+  pos1 = p1;
+  len1 = l1;
+  fwd1 = f1;
+  iid2 = i2;
+  pos2 = p2;
+  len2 = l2;
+  fwd2 = f2;
+}
+
+// Fills this match from one 'M' line of an ATAC file:
+//
+//   M type matchuid parentuid L1:IID1 POS1 LEN1 ORI1 L2:IID2 POS2 LEN2 ORI2
+//
+// An orientation starting with '-' is reverse; anything else is
+// forward.  The ids are truncated to 15 characters and matchiid is
+// reset to zero.
+//
+void
+atacMatch::decode(char *line) {
+
+  splitToWords S(line);
+
+  //  Location in the first assembly.
+  iid1 = decodeAtacName(S[4], 0L);
+  pos1 = strtouint32(S[5], 0L);
+  len1 = strtouint32(S[6], 0L);
+  fwd1 = (S[7][0] != '-');
+
+  //  Location in the second assembly.
+  iid2 = decodeAtacName(S[8], 0L);
+  pos2 = strtouint32(S[9], 0L);
+  len2 = strtouint32(S[10], 0L);
+  fwd2 = (S[11][0] != '-');
+
+  strncpy(matchuid,  S[2], 16);
+  strncpy(parentuid, S[3], 16);
+
+  matchuid[15]  = 0;
+  parentuid[15] = 0;
+
+  matchiid = 0;
+
+  //  The first two characters of the type are always copied, the third
+  //  only when the second is not the terminator.
+  char *t = S[1];
+
+  type[0] = t[0];
+  type[1] = t[1];
+  type[2] = (t[1] != 0) ? t[2] : 0;
+  type[3] = 0;
+}
+
+
+// Sanity check the match record -- make sure it's within the
+// sequence itself.
+//
+//  A, B   -- the sequences of the first and second assembly; if either
+//            is null nothing is checked and the match is accepted.
+//  inLine -- the line the match was decoded from, used only for the
+//            error message (it is chomp()ed when a message is printed).
+//
+// Returns true if the match is acceptable.  Problems are reported on
+// stderr.
+//
+bool
+atacMatch::sanity(seqCache *A, seqCache *B, char *inLine) {
+
+  if ((A == 0L) || (B == 0L))
+    return(true);
+
+  //  Validate the iids before using them to look up sequence lengths.
+  if ((iid1 >= A->getNumberOfSequences()) || (iid2 >= B->getNumberOfSequences())) {
+    chomp(inLine);
+    fprintf(stderr, "Match references invalid sequence iid: %s\n", inLine);
+    return(false);
+  }
+
+  bool    matchOK = true;
+  uint32  seqLen1 = A->getSequenceLength(iid1);
+  uint32  seqLen2 = B->getSequenceLength(iid2);
+
+  //  'len > seqLen - pos' is 'pos + len > seqLen' written so that the
+  //  sum cannot wrap around; pos <= seqLen is established first.
+
+  if ((pos1 > seqLen1) || (len1 > seqLen1 - pos1)) {
+    chomp(inLine);
+    fprintf(stderr, "Match longer than sequence (by "uint32FMT"bp) in 1: seqLen="uint32FMTW(8)" %s\n",
+            pos1 + len1 - seqLen1,
+            seqLen1, inLine);
+    matchOK = false;
+  }
+
+  if ((pos2 > seqLen2) || (len2 > seqLen2 - pos2)) {
+    chomp(inLine);
+    fprintf(stderr, "Match longer than sequence (by "uint32FMT"bp) in 2: seqLen="uint32FMTW(8)" %s\n",
+            pos2 + len2 - seqLen2,
+            seqLen2, inLine);
+    matchOK = false;
+  }
+
+  return(matchOK);
+}
+
diff --git a/atac-driver/libatac/atacMatch.H b/atac-driver/libatac/atacMatch.H
new file mode 100644
index 0000000..6969ed2
--- /dev/null
+++ b/atac-driver/libatac/atacMatch.H
@@ -0,0 +1,72 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_MATCH_H
+#define ATAC_MATCH_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+#include "seqCache.H"
+
+// One 'M' record of an ATAC file: a region of a sequence in the first
+// assembly paired with a region of a sequence in the second.
+//
+class atacMatch {
+public:
+ // An empty match: empty ids and type, everything else zero.
+ atacMatch() {
+ matchuid[0] = 0;
+ parentuid[0] = 0;
+ matchiid = 0;
+ type[0] = 0;
+ iid1 = pos1 = len1 = fwd1 = 0;
+ iid2 = pos2 = len2 = fwd2 = 0;
+ };
+ atacMatch(char *line);
+ atacMatch(char *muid,
+ char *puid,
+ uint32 miid,
+ char *t,
+ uint32 i1, uint32 p1, uint32 l1, uint32 f1,
+ uint32 i2, uint32 p2, uint32 l2, uint32 f2);
+
+ // Fills this match from one 'M' line.
+ void decode(char *line);
+
+ // Sanity check the match record -- make sure it's within the
+ // sequence itself.
+ //
+ bool sanity(seqCache *A, seqCache *B, char *inLine);
+
+
+ char matchuid[16]; // external id
+ char parentuid[16]; // external parent id
+ uint32 matchiid; // internal id, usually pointing to an entry in atacMatchList
+ char type[4]; // right now, only need one byte, but we keep things aligned
+ uint32 iid1, pos1, len1, fwd1; // sequence, position, length, orientation (non-zero is forward)
+ uint32 iid2, pos2, len2, fwd2;
+
+ // Writes the match as an 'M' line; orientations are printed as 1
+ // (forward) or -1 (reverse).
+ void print(FILE *f, char const *label1, char const *label2) const {
+ fprintf(f, "M %s %s %s %s:"uint32FMT" "uint32FMT" "uint32FMT" %d %s:"uint32FMT" "uint32FMT" "uint32FMT" %d\n",
+ type,
+ matchuid, parentuid,
+ label1, iid1, pos1, len1, fwd1 ? 1 : -1,
+ label2, iid2, pos2, len2, fwd2 ? 1 : -1);
+ };
+};
+
+#endif // ATAC_MATCH_H
diff --git a/atac-driver/libatac/atacMatchList.C b/atac-driver/libatac/atacMatchList.C
new file mode 100644
index 0000000..146d2d0
--- /dev/null
+++ b/atac-driver/libatac/atacMatchList.C
@@ -0,0 +1,51 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005, 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Starts out empty, with room for 256 matches.
+//
+atacMatchList::atacMatchList() {
+  _matchesMax = 256;
+  _matches    = new atacMatch [_matchesMax];
+  _matchesLen = 0;
+}
+
+// Releases the match array.
+//
+atacMatchList::~atacMatchList() {
+  delete [] _matches;
+}
+
+// Appends a copy of 'm' to the list.  The matchiid of the stored copy
+// is set to its index in the list; 'm' itself is not modified.
+//
+void
+atacMatchList::add(atacMatch &m) {
+
+  //  Out of room: quadruple the capacity and move the matches over.
+  if (_matchesLen >= _matchesMax) {
+    _matchesMax <<= 2;
+    atacMatch *A = new atacMatch [_matchesMax];
+    memcpy(A, _matches, sizeof(atacMatch) * _matchesLen);
+    delete [] _matches;
+    _matches = A;
+  }
+
+  memcpy(&_matches[_matchesLen], &m, sizeof(atacMatch));
+
+  //  Assign the id, then advance the count, as two statements.  Doing
+  //  both in one expression leaves the order of the index read and the
+  //  increment to the compiler, which can store the id in the wrong
+  //  (not yet used) slot.
+  _matches[_matchesLen].matchiid = _matchesLen;
+  _matchesLen++;
+}
diff --git a/atac-driver/libatac/atacMatchList.H b/atac-driver/libatac/atacMatchList.H
new file mode 100644
index 0000000..a9a7d0a
--- /dev/null
+++ b/atac-driver/libatac/atacMatchList.H
@@ -0,0 +1,49 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_MATCHLIST_H
+#define ATAC_MATCHLIST_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+
+// A growable array of atacMatch.
+//
+class atacMatchList {
+public:
+ atacMatchList();
+ ~atacMatchList();
+
+ // Appends a copy of m.
+ void add(atacMatch &m);
+ //void delete();
+
+ // Two names for the same thing: the number of matches stored.
+ uint32 numMatches(void) { return(_matchesLen); };
+ uint32 numberOfMatches(void) { return(_matchesLen); };
+
+ // Pointer to the i'th match; the index is not range checked.
+ atacMatch *getMatch(uint32 i) { return(_matches + i); };
+ atacMatch *operator[](uint32 i) { return(_matches + i); };
+
+private:
+ uint32 _matchesLen; // Number of matches stored
+ uint32 _matchesMax; // Allocated size of _matches
+ atacMatch *_matches;
+};
+
+#endif // ATAC_MATCHLIST_H
diff --git a/atac-driver/libatac/atacMatchOrder.C b/atac-driver/libatac/atacMatchOrder.C
new file mode 100644
index 0000000..e39a6e3
--- /dev/null
+++ b/atac-driver/libatac/atacMatchOrder.C
@@ -0,0 +1,218 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005, 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "atac.H"
+
+
+// Merge the r match into the l match.  l receives the merged result and r
+// is removed from this ordering.
+//
+//   l, r     -- matches currently in this order.  As coded, the merged
+//               length runs from l->pos1 to the end of r in sequence 1.
+//   mergeuid -- number used to build the new match UID, "merge<mergeuid>".
+//
+// The matchiid-to-index map must be current (the sort methods rebuild it),
+// because it is used to find where r sits in the array.
+//
+void
+atacMatchOrder::mergeMatches(atacMatch *l, atacMatch *r, uint32 mergeuid) {
+  atacMatch   n;
+
+  //  Build the merged record in a scratch copy of l; this keeps most of
+  //  l's fields.  We could probably do this in place in l.
+  //
+  memcpy(&n, l, sizeof(atacMatch));
+
+  //  The space before uint32FMT is required: C++11 parses an identifier
+  //  glued onto a string literal as a user-defined-literal suffix.
+  //
+  sprintf(n.matchuid, "merge" uint32FMT, mergeuid);
+
+  n.len1 = (r->pos1 + r->len1) - (l->pos1);
+  n.len2 = n.len1;
+
+  //  For a reverse match, the merged region starts where r starts in
+  //  sequence 2.
+  //
+  if (r->fwd2 == false)
+    n.pos2 = r->pos2;
+
+  n.fwd2 = r->fwd2;
+
+  //  Update l with the new contents.
+
+  memcpy(l, &n, sizeof(atacMatch));
+
+  //  Remove the r match from our set.  The hardest part is figuring
+  //  out what index the r match is at.  The easiest way to do that is
+  //  the most inefficient (start at zero, when we find the r match,
+  //  start updating).  The quickest way (given we want an array)
+  //  makes us trust our index.
+  //
+  uint32  removedIID = r->matchiid;
+
+  _matchesLen--;
+  for (uint32 idx = index(removedIID); idx < _matchesLen; idx++) {
+    _matches[idx] = _matches[idx+1];
+    _matchIIDtoIdx[_matches[idx]->matchiid] = idx;
+  }
+
+  //  r is no longer in the order.  Without this, index(r->matchiid) would
+  //  keep returning r's old slot, which now holds a different match.
+  //
+  _matchIIDtoIdx[removedIID] = ~uint32ZERO;
+}
+
+
+
+//  qsort() comparator over an array of atacMatch pointers.  Orders by
+//  sequence 1 first (iid1 ascending, pos1 ascending, len1 descending),
+//  then breaks ties the same way on sequence 2.
+//
+static
+int
+sortA_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  if (lhs->iid1 != rhs->iid1)  return((lhs->iid1 < rhs->iid1) ? -1 : 1);
+  if (lhs->pos1 != rhs->pos1)  return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+  if (lhs->len1 != rhs->len1)  return((lhs->len1 > rhs->len1) ? -1 : 1);  //  longer first
+
+  if (lhs->iid2 != rhs->iid2)  return((lhs->iid2 < rhs->iid2) ? -1 : 1);
+  if (lhs->pos2 != rhs->pos2)  return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+  if (lhs->len2 != rhs->len2)  return((lhs->len2 > rhs->len2) ? -1 : 1);  //  longer first
+
+  return(0);
+}
+
+//  qsort() comparator over an array of atacMatch pointers.  Orders by
+//  sequence 2 first (iid2 ascending, pos2 ascending, len2 descending),
+//  then breaks ties the same way on sequence 1.
+//
+static
+int
+sortB_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  if (lhs->iid2 != rhs->iid2)  return((lhs->iid2 < rhs->iid2) ? -1 : 1);
+  if (lhs->pos2 != rhs->pos2)  return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+  if (lhs->len2 != rhs->len2)  return((lhs->len2 > rhs->len2) ? -1 : 1);  //  longer first
+
+  if (lhs->iid1 != rhs->iid1)  return((lhs->iid1 < rhs->iid1) ? -1 : 1);
+  if (lhs->pos1 != rhs->pos1)  return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+  if (lhs->len1 != rhs->len1)  return((lhs->len1 > rhs->len1) ? -1 : 1);  //  longer first
+
+  return(0);
+}
+
+//  qsort() comparator over an array of atacMatch pointers.  Groups matches
+//  by sequence pair (iid2, then iid1) and orientation, then orders them by
+//  diagonal, and finally by pos1 ascending / len1 descending.
+//
+static
+int
+sortdiagonal_(const void *a, const void *b) {
+  const atacMatch *A = *(const atacMatch * const *)a;
+  const atacMatch *B = *(const atacMatch * const *)b;
+
+  if (A->iid2 < B->iid2)  return(-1);
+  if (A->iid2 > B->iid2)  return(1);
+  if (A->iid1 < B->iid1)  return(-1);
+  if (A->iid1 > B->iid1)  return(1);
+  if (A->fwd2 < B->fwd2)  return(-1);
+  if (A->fwd2 > B->fwd2)  return(1);
+
+  //  We're now in the same sequence pair with the same orientation.
+
+  //  So much easier if we use signed math.
+
+  //  This works for forward matches
+  int32 dA = (int32)A->pos2 - (int32)A->pos1;
+  int32 dB = (int32)B->pos2 - (int32)B->pos1;
+
+  if (A->fwd2 == 0) {
+    //  OK, so not the greatest diagonal computation ever.  We end up
+    //  with a gigantic discontinuity at the origin, but we don't
+    //  care, just as long as the diagonals are distinct.
+    //
+    //  For a reverse match, pos1 + (pos2 + len2) is what stays constant
+    //  along a diagonal.  The previous code used pos2 in place of pos1,
+    //  so the value did not depend on pos1 at all and adjacent pieces of
+    //  one reverse diagonal were given different values.
+    //
+    dA = (int32)A->pos1 - (1000000000 - (int32)(A->pos2 + A->len2));
+    dB = (int32)B->pos1 - (1000000000 - (int32)(B->pos2 + B->len2));
+  }
+
+  if (dA < dB)  return(-1);
+  if (dA > dB)  return(1);
+
+  //  This is just candy; might make things easier later
+  if (A->pos1 < B->pos1)  return(-1);
+  if (A->pos1 > B->pos1)  return(1);
+  if (A->len1 > B->len1)  return(-1);
+  if (A->len1 < B->len1)  return(1);
+
+  return(0);
+}
+
+//  qsort() comparator over an array of atacMatch pointers.  Orders by
+//  matchuid, then by parentuid, both compared with strcmp().  The result
+//  is always exactly -1, 0 or 1.
+//
+static
+int
+sortmatchuid_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  int cmp = strcmp(lhs->matchuid, rhs->matchuid);
+
+  if (cmp == 0)
+    cmp = strcmp(lhs->parentuid, rhs->parentuid);
+
+  if (cmp == 0)
+    return(0);
+
+  return((cmp < 0) ? -1 : 1);
+}
+
+//  qsort() comparator over an array of atacMatch pointers.  Orders by
+//  parentuid, then by matchuid, both compared with strcmp().  The result
+//  is always exactly -1, 0 or 1.
+//
+static
+int
+sortparentuid_(const void *a, const void *b) {
+  const atacMatch *lhs = *(const atacMatch * const *)a;
+  const atacMatch *rhs = *(const atacMatch * const *)b;
+
+  int cmp = strcmp(lhs->parentuid, rhs->parentuid);
+
+  if (cmp == 0)
+    cmp = strcmp(lhs->matchuid, rhs->matchuid);
+
+  if (cmp == 0)
+    return(0);
+
+  return((cmp < 0) ? -1 : 1);
+}
+
+
+//  Sort len matches, starting at position first, with the sortA_ ordering,
+//  then rebuild the matchiid-to-index map.  len == 0 means "everything
+//  from first to the end of the list".
+//
+void
+atacMatchOrder::sortA(uint32 first, uint32 len) {
+
+  //  The default length must not count the entries skipped by 'first',
+  //  otherwise qsort() runs past the end of _matches.
+  //
+  if (len == 0)
+    len = (first < _matchesLen) ? (_matchesLen - first) : 0;
+
+  if (len > 0)
+    qsort(_matches + first, len, sizeof(atacMatch*), sortA_);
+
+  updateIndex();
+}
+
+//  Sort len matches, starting at position first, with the sortB_ ordering,
+//  then rebuild the matchiid-to-index map.  len == 0 means "everything
+//  from first to the end of the list".
+//
+void
+atacMatchOrder::sortB(uint32 first, uint32 len) {
+
+  //  The default length must not count the entries skipped by 'first',
+  //  otherwise qsort() runs past the end of _matches.
+  //
+  if (len == 0)
+    len = (first < _matchesLen) ? (_matchesLen - first) : 0;
+
+  if (len > 0)
+    qsort(_matches + first, len, sizeof(atacMatch*), sortB_);
+
+  updateIndex();
+}
+
+//  Sort len matches, starting at position first, with the sortdiagonal_
+//  ordering, then rebuild the matchiid-to-index map.  len == 0 means
+//  "everything from first to the end of the list".
+//
+void
+atacMatchOrder::sortDiagonal(uint32 first, uint32 len) {
+
+  //  The default length must not count the entries skipped by 'first',
+  //  otherwise qsort() runs past the end of _matches.
+  //
+  if (len == 0)
+    len = (first < _matchesLen) ? (_matchesLen - first) : 0;
+
+  if (len > 0)
+    qsort(_matches + first, len, sizeof(atacMatch*), sortdiagonal_);
+
+  updateIndex();
+}
+
+//  Sort len matches, starting at position first, with the sortmatchuid_
+//  ordering, then rebuild the matchiid-to-index map.  len == 0 means
+//  "everything from first to the end of the list".
+//
+void
+atacMatchOrder::sortMatchUID(uint32 first, uint32 len) {
+
+  //  The default length must not count the entries skipped by 'first',
+  //  otherwise qsort() runs past the end of _matches.
+  //
+  if (len == 0)
+    len = (first < _matchesLen) ? (_matchesLen - first) : 0;
+
+  if (len > 0)
+    qsort(_matches + first, len, sizeof(atacMatch*), sortmatchuid_);
+
+  updateIndex();
+}
+
+//  Sort len matches, starting at position first, with the sortparentuid_
+//  ordering, then rebuild the matchiid-to-index map.  len == 0 means
+//  "everything from first to the end of the list".
+//
+void
+atacMatchOrder::sortParentUID(uint32 first, uint32 len) {
+
+  //  The default length must not count the entries skipped by 'first',
+  //  otherwise qsort() runs past the end of _matches.
+  //
+  if (len == 0)
+    len = (first < _matchesLen) ? (_matchesLen - first) : 0;
+
+  if (len > 0)
+    qsort(_matches + first, len, sizeof(atacMatch*), sortparentuid_);
+
+  updateIndex();
+}
diff --git a/atac-driver/libatac/atacMatchOrder.H b/atac-driver/libatac/atacMatchOrder.H
new file mode 100644
index 0000000..2647699
--- /dev/null
+++ b/atac-driver/libatac/atacMatchOrder.H
@@ -0,0 +1,99 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#ifndef ATAC_MATCHORDER_H
+#define ATAC_MATCHORDER_H
+
+#include <string>
+#include <map>
+using namespace std;
+
+#include "bio++.H"
+
+
+// An ordering over the matches held by an atacMatchList.  We keep an array
+// of pointers into the list's storage (so the list must stay alive, and
+// must not be added to, while this object is in use) together with a map
+// from matchiid to the position of that match in the current order.
+//
+class atacMatchOrder {
+private:
+  void initialize(atacMatchList *ML) {
+    _ML            = ML;    //  previously left uninitialized
+    _matchesLen    = ML->numberOfMatches();
+    _matchesMax    = ML->numberOfMatches();
+    _matches       = new atacMatch * [_matchesLen];
+    _matchIIDtoIdx = new uint32 [_matchesLen];
+
+    //  The index map starts out all-invalid; it is filled in by
+    //  updateIndex(), which every sort method calls.
+    //
+    for (uint32 i=0; i<_matchesLen; i++) {
+      _matches[i]       = ML->getMatch(i);
+      _matchIIDtoIdx[i] = ~uint32ZERO;
+    }
+  }
+
+public:
+  atacMatchOrder(atacMatchList *ML) {
+    initialize(ML);
+  }
+  atacMatchOrder(atacMatchList &ML) {
+    initialize(&ML);
+  }
+  ~atacMatchOrder() {
+    delete [] _matches;
+    delete [] _matchIIDtoIdx;   //  previously leaked
+  }
+
+  //  Number of matches in the order.  Both names return the same value.
+  uint32      numMatches(void)       { return(_matchesLen); }
+  uint32      numberOfMatches(void)  { return(_matchesLen); }
+
+  //  Return match i in our list.  Common usage would be
+  //    getMatch(index(iid) + 1)  // return the match after the one we have
+  //    getMatch(index(iid) - 1)
+  //
+  //  No bounds checking is done.
+  //
+  atacMatch *getMatch(uint32 i) {
+    return(_matches[i]);
+  }
+  atacMatch *operator[](uint32 i) {
+    return(_matches[i]);
+  }
+
+  //  Return the index, in our sorted list, of the matchiid supplied.
+  //  The value is ~uint32ZERO if the map has no entry for that match.
+  //
+  uint32 index(uint32 matchiid) {
+    return(_matchIIDtoIdx[matchiid]);
+  }
+
+  //  Merge the r match into the l match.  The l match gets the result.
+  void mergeMatches(atacMatch *l, atacMatch *r, uint32 mergeuid);
+
+  //  Sort (part of) the order and rebuild the index map.
+  void sortA(uint32 first=0, uint32 len=0);
+  void sortB(uint32 first=0, uint32 len=0);
+  void sortDiagonal(uint32 first=0, uint32 len=0);
+  void sortMatchUID(uint32 first=0, uint32 len=0);
+  void sortParentUID(uint32 first=0, uint32 len=0);
+
+private:
+  uint32         _matchesLen;   //  matches currently in the order
+  uint32         _matchesMax;   //  matches at construction; size of both arrays
+  atacMatch    **_matches;
+
+  atacMatchList *_ML;           //  the list we were built from
+
+  //  Rebuild the matchiid-to-index map from the current order.  Every
+  //  allocated slot is cleared, not just the first _matchesLen: after
+  //  merges shrink _matchesLen, slots at or above it would otherwise keep
+  //  stale positions.
+  //
+  void updateIndex(void) {
+    for (uint32 i=0; i<_matchesMax; i++)
+      _matchIIDtoIdx[i] = ~uint32ZERO;
+    for (uint32 i=0; i<_matchesLen; i++)
+      _matchIIDtoIdx[_matches[i]->matchiid] = i;
+  }
+
+  uint32        *_matchIIDtoIdx;
+};
+
+
+#endif // ATAC_MATCHORDER_H
diff --git a/atac-driver/libatac/fasta-accessor-test.C b/atac-driver/libatac/fasta-accessor-test.C
new file mode 100644
index 0000000..ebf06b3
--- /dev/null
+++ b/atac-driver/libatac/fasta-accessor-test.C
@@ -0,0 +1,259 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "fasta-accessor.H"
+
+
+
+// rev is the reverse complement of fwd, so if we make an accessor
+// from it, and set the reverse-complement flag we should get back
+// exactly fwd.
+//
+void
+simpleTest(void) {
+
+ // Yup, real sequence. I don't remember what it belongs to.
+
+ char *fwd =
+ "CGCTTTAATGGGCGAACAGCCCAACCCTTGGGAGAGACTCCACCCCCAGGATGCGACGAGCCGACATCGAGGTGCCAAAC"
+ "CATGCCGTCGATATGGACTCTGAGCGACGCCGCTTCCACAAGCCAGCGCCGGGTCACTAGTTCCGACTTTCGTCCCTGCT"
+ "CGACCTGCCGGTCTCGCAGTCAAGCTCCCTTGTGCACTTAGCCTCCGTTACCTTTTAGGAGGCAACCGCCCCAGTTAAAC"
+ "TACCCACCAGGCAATGTCCCTGATCCGGATCACGGACCTAGGTTAGATATCCAGAACGACGCTTCACAGTCTCCCACCTA"
+ "TCCTACACAAGCCGTACCGAACACCAATACCAAGCTATAGTAAAGGTCCCGGGGTCTTTCCGTCCTGCTGCGCGTAACGA"
+ "CAGTGGAGAAGTCGTTACGCCATTCGTGCAGGTCGGAACTTACCCGACAAGGAATTTCGCTACCTTAGGATGGTTATAGT"
+ "TACCACCGCCGTTTACTGGGTTAACCTTCCAGCACCGGGCAGGCGTCAGTCCGTATACATCGTCTTGCGACTTAGCACGG"
+ "ACCTGTGTTTTTAGTAAACAGTCGCTTCTCCCTGGTCTCTCCCCTTCTCCCGAAGTTACGGGGGTATTTTGCCGAGTTCC"
+ "TTAACCATGATTCACTCGATCGCCTTGGTATTCTCTACCTAACCACCTGAGTCGGTTTGGTAGGATCACCCTGCTTCCCG"
+ "CATTCGCGGTCACTATCAGGTCTCAGGATATGTGTGAGACGGATTTGCCTATCTCACTCCCTACACCCTTGGACGTGGAC"
+ "TTGACTACTACCAAATCGGGTCACGCGCTCCGCTCAACATTCCATCACCCGAAGGTGACAGAAAAAAGAGTTTTAGGCGT"
+ "TTAGCATCAAAAGGTTCATCTCGACTACGCCTGTCGGCCTCGCCTTAGGTCCCGACTTACCCAGGGCAGATTAGCTTGAC"
+ "CCTGGAACCCTTGGTTATTCGGCGGACGGGTTTCTCGCCC";
+
+ char *rev =
+ "GGGCGAGAAACCCGTCCGCCGAATAACCAAGGGTTCCAGGGTCAAGCTAATCTGCCCTGGGTAAGTCGGGACCTAAGGCG"
+ "AGGCCGACAGGCGTAGTCGAGATGAACCTTTTGATGCTAAACGCCTAAAACTCTTTTTTCTGTCACCTTCGGGTGATGGA"
+ "ATGTTGAGCGGAGCGCGTGACCCGATTTGGTAGTAGTCAAGTCCACGTCCAAGGGTGTAGGGAGTGAGATAGGCAAATCC"
+ "GTCTCACACATATCCTGAGACCTGATAGTGACCGCGAATGCGGGAAGCAGGGTGATCCTACCAAACCGACTCAGGTGGTT"
+ "AGGTAGAGAATACCAAGGCGATCGAGTGAATCATGGTTAAGGAACTCGGCAAAATACCCCCGTAACTTCGGGAGAAGGGG"
+ "AGAGACCAGGGAGAAGCGACTGTTTACTAAAAACACAGGTCCGTGCTAAGTCGCAAGACGATGTATACGGACTGACGCCT"
+ "GCCCGGTGCTGGAAGGTTAACCCAGTAAACGGCGGTGGTAACTATAACCATCCTAAGGTAGCGAAATTCCTTGTCGGGTA"
+ "AGTTCCGACCTGCACGAATGGCGTAACGACTTCTCCACTGTCGTTACGCGCAGCAGGACGGAAAGACCCCGGGACCTTTA"
+ "CTATAGCTTGGTATTGGTGTTCGGTACGGCTTGTGTAGGATAGGTGGGAGACTGTGAAGCGTCGTTCTGGATATCTAACC"
+ "TAGGTCCGTGATCCGGATCAGGGACATTGCCTGGTGGGTAGTTTAACTGGGGCGGTTGCCTCCTAAAAGGTAACGGAGGC"
+ "TAAGTGCACAAGGGAGCTTGACTGCGAGACCGGCAGGTCGAGCAGGGACGAAAGTCGGAACTAGTGACCCGGCGCTGGCT"
+ "TGTGGAAGCGGCGTCGCTCAGAGTCCATATCGACGGCATGGTTTGGCACCTCGATGTCGGCTCGTCGCATCCTGGGGGTG"
+ "GAGTCTCTCCCAAGGGTTGGGCTGTTCGCCCATTAAAGCG";
+
+ // F reads fwd as-is; R reads rev through the reverse-complement view.
+ // Both cover 1000 bases and should show the same sequence.
+ FastAAccessor F(fwd, 1000, false);
+ FastAAccessor R(rev, 1000, true);
+ uint32 C;
+
+ // Random access: every position must agree (exit 1 on mismatch).
+ for (uint32 i=0; i<1000; i++)
+ if (F[i] != R[i])
+ exit(1);
+
+ // Forward iteration from position 0: symbols must agree (exit 2) and
+ // exactly 1000 of them must be seen before either accessor goes
+ // invalid (exit 3).
+ F.setPosition(0);
+ R.setPosition(0);
+ for (C=0; F.isValid() && R.isValid(); ++C, ++F, ++R)
+ if (*F != *R)
+ exit(2);
+ if (C != 1000)
+ exit(3);
+
+ // Backward iteration from position 999: same checks, exit 4 on a symbol
+ // mismatch and exit 5 on a wrong count.
+ F.setPosition(999);
+ R.setPosition(999);
+ for (C=0; F.isValid() && R.isValid(); ++C, --F, --R)
+ if (*F != *R)
+ exit(4);
+ if (C != 1000)
+ exit(5);
+}
+
+
+
+// Test pulling out a subsequence. We're given coordinates in the
+// forward direction, but want to pull out the reverse complement
+// sequence.
+//
+void
+easierTest(void) {
+ char sub[1000];
+ int i;
+
+ // 100A 200N 100T 200N 100R 100N 200G
+ //
+ // (The R block at 600-700 is what the unranged reverse-complement
+ // lookup below lands on.)
+ //
+ for (i=0; i<1000; i++)
+ sub[i] = 'N';
+ for (i=0; i<100; i++)
+ sub[i] = 'A';
+ for (i=300; i<400; i++)
+ sub[i] = 'T';
+ for (i=600; i<700; i++)
+ sub[i] = 'R';
+ for (i=800; i<1000; i++)
+ sub[i] = 'G';
+
+
+ // Pull out the reverse-complement sequence from 300-400
+ //
+ // Asking for sequence from 300 to 400 should give us exactly the
+ // 'A' (reverse-complement of T) block.
+ //
+ // Without setting the range, we'd get back the sequence at
+ // 700-600, the location when globally reverse-complemented.
+ //
+ FastAAccessor S(sub, 1000, true);
+
+ // Range restricted to the T block: every position must read 'A'.
+ S.setRange(300, 100);
+ for (i=300; i<400; i++)
+ if (S[i] != 'A')
+ fprintf(stderr, "FAILED: got %c at pos %d\n", S[i], i), exit(5);
+
+ // Range reset to the whole sequence: the same coordinates now map onto
+ // the R block, which must read back as 'Y'.
+ S.setRange(0, 0);
+ for (i=300; i<400; i++)
+ if (S[i] != 'Y')
+ fprintf(stderr, "FAILED: got %c at pos %d\n", S[i], i), exit(6);
+}
+
+
+
+// A harder test: build an accessor to access sequence from 100 to
+// 300, and grow/shrink the region.
+//
+//  Print the accessor's current range and exit(1) unless it is exactly
+//  [bgn, end) with the given length.
+//
+static
+void
+harderTestCheckRange_(FastAAccessor &A, uint32 bgn, uint32 end, uint32 len) {
+
+  //  The spaces around uint32FMT are required by C++11, which otherwise
+  //  parses it as a user-defined-literal suffix.
+  //
+  fprintf(stderr, "Range: " uint32FMT "-" uint32FMT " len=" uint32FMT "\n",
+          A.getRangeBegin(), A.getRangeEnd(), A.getRangeLength());
+
+  if ((A.getRangeBegin() != bgn) || (A.getRangeEnd() != end) || (A.getRangeLength() != len))
+    fprintf(stderr, "FAILED.\n"), exit(1);
+}
+
+//  Read n symbols through the accessor, echoing each to stderr, and return
+//  how many differed from 'expected'.
+//
+static
+int
+harderTestCountErrors_(FastAAccessor &A, int n, char expected) {
+  int errors = 0;
+
+  for (int j=0; j<n; j++) {
+    fprintf(stderr, "%c", *A);
+    if (*A != expected)
+      errors++;
+    ++A;
+  }
+  fprintf(stderr, "\n");
+
+  return(errors);
+}
+
+//  Decrease the size of the region, using the extend operators, then shift
+//  it to the right/left: shrink by 190 on the left, grow by 10 on the
+//  right, shrink by 10 on the left.
+//
+static
+void
+harderTestShrinkAndShift_(FastAAccessor &A) {
+  for (int i=0; i<190; i++)
+    A.extendLeft(-1);
+  for (int i=0; i<10; i++)
+    A.extendRight(1);
+  for (int i=0; i<10; i++)
+    A.extendLeft(-1);
+}
+
+//  Build an accessor over the range 100-300, iterate over it, then
+//  shrink/shift the range with the extend operators and iterate again.
+//  Done once on the forward strand and once reverse-complemented.  Any
+//  failure prints a message and exits with status 1.
+//
+void
+harderTest(void) {
+  char sub[1000];
+  int  e;
+
+  //  100A 200C 300G 400N
+  //
+  for (int i=0; i<1000; i++)
+    sub[i] = 'N';
+  for (int i=0; i<100; i++)
+    sub[i] = 'A';
+  for (int i=100; i<300; i++)
+    sub[i] = 'C';
+  for (int i=300; i<600; i++)
+    sub[i] = 'G';
+
+  //  Try forward.
+
+  {
+    fprintf(stderr, "Forward setRange/setPosition\n");
+    FastAAccessor A(sub, 1000, false);
+
+    A.setRange(100, 200);
+    A.setPosition(100);
+
+    harderTestCheckRange_(A, 100, 300, 200);
+
+    e = harderTestCountErrors_(A, 200, 'C');
+    if (e)
+      fprintf(stderr, "FAILED forward setRange/setPosition test: %d errors\n", e), exit(1);
+
+    harderTestShrinkAndShift_(A);
+
+    harderTestCheckRange_(A, 300, 310, 10);
+
+    //  This message used to say "reverse", copied from the block below.
+    e = harderTestCountErrors_(A, 20, 'G');
+    if (e)
+      fprintf(stderr, "FAILED forward extendRange test: %d errors\n", e), exit(1);
+  }
+
+  //  Try reverse.
+
+  {
+    fprintf(stderr, "Reverse setRange/setPosition\n");
+    FastAAccessor A(sub, 1000, true);
+
+    A.setRange(100, 200);
+    A.setPosition(100);
+
+    harderTestCheckRange_(A, 100, 300, 200);
+
+    //  The C block reads back complemented.
+    e = harderTestCountErrors_(A, 200, 'G');
+    if (e)
+      fprintf(stderr, "FAILED reverse setRange/setPosition test: %d errors\n", e), exit(1);
+
+    harderTestShrinkAndShift_(A);
+
+    harderTestCheckRange_(A, 90, 100, 10);
+
+    //  We have walked off the C block into the A block, complemented.
+    e = harderTestCountErrors_(A, 20, 'T');
+    if (e)
+      fprintf(stderr, "FAILED reverse extendRange test: %d errors\n", e), exit(1);
+  }
+}
+
+
+
+
+
+
+
+//  Run the three accessor tests in order.  Each test exits with a non-zero
+//  status on failure, so reaching the end means everything passed.
+//
+int
+main(int argc, char **argv) {
+
+  simpleTest();
+  easierTest();
+  harderTest();
+
+  fputs("All tests OK!\n", stderr);
+
+  return(0);
+}
diff --git a/atac-driver/libatac/fasta-accessor.H b/atac-driver/libatac/fasta-accessor.H
new file mode 100644
index 0000000..353a912
--- /dev/null
+++ b/atac-driver/libatac/fasta-accessor.H
@@ -0,0 +1,213 @@
+#ifndef FASTA_ACCESSOR_H
+#define FASTA_ACCESSOR_H
+
+#include "bio++.H"
+
+
+// Define this to do bounds checking
+//
+#if 1
+//  SANITY(NAM, POS) -- complain (naming the caller, NAM) and assert if POS
+//  is larger than _len.  It must be used inside a FastAAccessor member,
+//  where _len is in scope.
+//
+//  The spaces around uint32FMT are required: C++11 parses an identifier
+//  glued onto a string literal as a user-defined-literal suffix.
+//
+#define SANITY(NAM, POS) \
+  if ((POS) > _len) { \
+    fprintf(stderr, "%s-- position " uint32FMT " larger than length " uint32FMT "\n", \
+            NAM, (POS), _len); \
+    assert((POS) <= _len); \
+  }
+#else
+#define SANITY(NAM, POS)
+#endif
+
+
+// Externally, we show the coordinate in the forward strand.
+// Internally, we represent the coordinate on the physical string.
+// The user is required to maintain the range that the
+// reverse-complement string is valid over. It is not possible to
+// randomly access sequence outside the range, but it is possible to
+// iterate over it (but then you cannot get the coordinate of where
+// you are at!)
+
+
+//  Accessor over a sequence held in memory, optionally presenting it (or a
+//  sub-range of it) reverse-complemented.  Supports random access by
+//  forward-strand coordinate and iteration with ++ and --.
+//
+class FastAAccessor {
+public:
+  //  All state is public, exactly as in the original declaration.
+
+  char   *_seq;         //  the physical sequence; never freed by this class
+  uint32  _pos;         //  current physical position in _seq
+  uint32  _len;         //  length of _seq
+
+  uint32  _rcBase;      //  start of the active range
+  uint32  _rcLen;       //  length of the active range
+
+  bool    _doRevComp;   //  true: present the reverse complement
+
+private:
+  //  Shared constructor body.  A length of zero means "use strlen(S)".
+  //  The range starts out as the whole sequence, and the position at the
+  //  first base as seen by the user (the physical last base if
+  //  reverse-complementing).
+  //
+  void FastAAccessorInit(char *S, uint32 length, bool revcomp) {
+    _seq = S;
+    _pos = 0;
+    _len = length;
+    if (length == 0)
+      _len = (uint32)strlen(S);
+
+    _rcBase = 0;
+    _rcLen  = _len;
+
+    _doRevComp = revcomp;
+
+    if (_doRevComp)
+      _pos = _len-1;
+  }
+
+public:
+  FastAAccessor(seqInCore *S, bool revcomp=false) {
+    FastAAccessorInit(S->sequence(), S->sequenceLength(), revcomp);
+  }
+
+  FastAAccessor(char *S, uint32 length=0, bool revcomp=false) {
+    FastAAccessorInit(S, length, revcomp);
+  }
+
+private:
+
+  //  Given a range in the forward string, we can reverse-complement
+  //  just that range.  This amounts to translating the forward string
+  //  to make the beginning of the range be the origin, then
+  //  reversing the range, then translating the sequence back to the
+  //  original origin.
+  //
+  uint32 rc(uint32 p) const {
+    return(_rcBase + _rcLen - (p - _rcBase) - 1);
+  }
+
+public:
+
+  //  For iterating over reverse complement regions of a forward
+  //  sequence.
+  //
+  //  e.g., (500, 250) would be:
+  //
+  //  |-----|-----|XXXXX|-----|
+  //  0    250   500   750  1000
+  //
+  //  Set both to zero (also the default) to unset the range
+  //
+  //  The physical location (_pos) doesn't change, but this will
+  //  change the value of the corresponding forward coordinate, but
+  //  not the meaning.
+  //
+  //  Returns true on success.  An invalid range prints a message and
+  //  asserts; if asserts are compiled out it returns false.
+  //
+  bool setRange(uint32 base=0, uint32 length=0) {
+
+    //  Compare 'length' against what is left after 'base' instead of
+    //  testing base + length, which can wrap around in uint32 and let an
+    //  absurd length through.
+    //
+    if ((base < _len) && (length <= _len - base)) {
+      if ((base == 0) && (length == 0)) {
+        _rcBase = 0;
+        _rcLen  = _len;
+      } else {
+        _rcBase = base;
+        _rcLen  = length;
+      }
+      return(true);
+    } else {
+      fprintf(stderr, "FastAAccessor::setRange()-- base=" uint32FMT " and length=" uint32FMT " exceed sequence length of " uint32FMT "\n",
+              base, length, _len);
+      assert(0);
+      return(false);
+    }
+  }
+
+
+  //  True if this physical location is valid.
+  //
+  bool isValid(void) {
+    return(_pos < _len);
+  }
+
+
+  //  Random access to position p, which must lie inside the active range.
+  //  Out-of-range access prints a message and asserts; if asserts are
+  //  compiled out it returns 0.
+  //
+  char operator[](uint32 p) const {
+    if ((_rcBase <= p) && (p < _rcBase + _rcLen)) {
+      if (_doRevComp)  return(complementSymbol[_seq[rc(p)]]);
+      else             return(_seq[p]);
+    } else {
+      fprintf(stderr, "operator[]-- Tried to access to " uint32FMT ", but range is " uint32FMT "-" uint32FMT "\n",
+              p, _rcBase, _rcBase + _rcLen);
+      assert(0);
+      return(0);
+    }
+  }
+
+
+
+  //  Set the accessor to some position, which must lie inside the active
+  //  range.  Returns true on success.
+  //
+  bool setPosition(uint32 p) {
+    if ((_rcBase <= p) && (p < _rcBase + _rcLen)) {
+      if (_doRevComp)  _pos = rc(p);
+      else             _pos = p;
+      return(true);
+    } else {
+      fprintf(stderr, "setPosition()-- Tried to set to " uint32FMT ", but range is " uint32FMT "-" uint32FMT ".\n",
+              p, _rcBase, _rcBase + _rcLen);
+      assert(0);
+      return(false);
+    }
+  }
+
+  //  The current position, translated through the range if we are
+  //  reverse-complementing.
+  //
+  uint32 getPosition(void) {
+    if (_doRevComp)  return(rc(_pos));
+    else             return(_pos);
+  }
+
+  uint32 getRangeBegin(void)  { return(_rcBase); }
+  uint32 getRangeEnd(void)    { return(_rcBase + _rcLen); }
+  uint32 getRangeLength(void) { return(_rcLen); }
+
+  //  Grow (x > 0) or shrink (x < 0) the range by x.  Forward, the start
+  //  of the range moves; reverse-complemented, only the length changes.
+  //  Returns true if the new range still fits in the sequence.
+  //
+  bool extendLeft(int32 x) {
+    _rcLen += x;
+    if (_doRevComp == false)
+      _rcBase -= x;
+
+    if ((_rcBase > _len) || (_rcBase + _rcLen > _len)) {
+      fprintf(stderr, "FastAAccessor::extendLeft()-- extend by " int32FMT " makes invalid: length is " uint32FMT ", new range is " uint32FMT "-" uint32FMT "\n",
+              x, _len, _rcBase, _rcBase + _rcLen);
+      assert(0);
+      return(false);
+    }
+    return(true);
+  }
+
+  //  Grow (x > 0) or shrink (x < 0) the range by x.  Reverse-complemented,
+  //  the start of the range moves; forward, only the length changes.
+  //  Returns true if the new range still fits in the sequence.
+  //
+  bool extendRight(int32 x) {
+    _rcLen += x;
+    if (_doRevComp == true)
+      _rcBase -= x;
+
+    if ((_rcBase > _len) || (_rcBase + _rcLen > _len)) {
+      fprintf(stderr, "FastAAccessor::extendRight()-- extend by " int32FMT " makes invalid: length is " uint32FMT ", new range is " uint32FMT "-" uint32FMT "\n",
+              x, _len, _rcBase, _rcBase + _rcLen);
+      assert(0);
+      return(false);
+    }
+    return(true);
+  }
+
+  //  The symbol at the current position, complemented if we are
+  //  reverse-complementing.
+  //
+  char operator*(void) const {
+    SANITY("FastAAccessor::operator*()", _pos);
+    if (_doRevComp)  return(complementSymbol[_seq[_pos]]);
+    else             return(_seq[_pos]);
+  }
+
+  //  Same as operator*.
+  //
+  char get(void) const {
+    SANITY("FastAAccessor::get()", _pos);
+    if (_doRevComp)  return(complementSymbol[_seq[_pos]]);
+    else             return(_seq[_pos]);
+  }
+
+  //  Step one base backward as seen by the user; physically that is
+  //  forward when reverse-complementing.  No range check is made; use
+  //  isValid() to detect running off the sequence.
+  //
+  FastAAccessor &operator--(void) {
+    if (_doRevComp)  _pos++;
+    else             _pos--;
+    return(*this);
+  }
+
+  //  Step one base forward as seen by the user; physically that is
+  //  backward when reverse-complementing.
+  //
+  FastAAccessor &operator++(void) {
+    if (_doRevComp)  _pos--;
+    else             _pos++;
+    return(*this);
+  }
+};
+
+#endif // FASTA_ACCESSOR_H
diff --git a/atac-driver/makeplot.pl b/atac-driver/makeplot.pl
new file mode 100644
index 0000000..574a54b
--- /dev/null
+++ b/atac-driver/makeplot.pl
@@ -0,0 +1,430 @@
+#!/usr/bin/env perl
+#
+# This file is part of A2Amapper.
+# Copyright (c) 2008-2009 J. Craig Venter Institute
+# Author: Brian Walenz
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received (LICENSE.txt) a copy of the GNU General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#
+# The reference MUST be the first sequence, and it MUST be a single sequence.
+#
+
+use strict;
+use FindBin;
+
+#  Arguments: match type ('u' or 'r'), the input atac file, and an optional
+#  output prefix.
+#
+my $mm = shift @ARGV;
+my $in = shift @ARGV;
+my $ot = shift @ARGV;
+
+#  Check that the arguments exist before using them; they used to be
+#  compared and pattern-matched first and only checked afterwards.
+#
+if (!defined($mm) || (($mm ne "u") && ($mm ne "r"))) {
+    die "First arg must be 'u' (ungapped matches) or 'r' (runs).\n";
+}
+
+die "usage: $0 <u|r> <input.atac> [output-prefix]\n"  if (!defined($in));
+
+#  With no output prefix given, use whatever precedes ".atac" in the input
+#  name.
+#
+if (!defined($ot) && ($in =~ m/(.*)\.atac/)) {
+    $ot = $1;
+}
+
+die "No output prefix supplied, and none can be derived from '$in'.\n"  if (!defined($ot));
+
+#  Strip a trailing .png or .ps from the prefix.  The dot is escaped and
+#  the match anchored at the end; the old patterns (any character followed
+#  by "png" or "ps", anywhere in the name) would cut a prefix such as
+#  "caps_plot" down to "c".
+#
+$ot =~ s/\.png$//;
+$ot =~ s/\.ps$//;
+
+# Ask gnuplot for its version and pull out the leading "major.minor"
+# number.  If the output cannot be parsed (or gnuplot did not run) the
+# version becomes 0.
+my $version = `gnuplot -V`;
+
+if ($version =~ m/gnuplot\s+(\d+\.\d+)\s+/) {
+ $version = $1;
+} else {
+ chomp $version;
+ print STDERR "WARNING: Unknown gnuplot version '$version'\n";
+ $version = 0;
+}
+
+# Too old (or unknown): report it and stop.  Note the exit status is 0,
+# so callers cannot tell from the status that no plot was made.
+if ($version < 4.2) {
+ print STDERR "gnuplot version 4.2 is needed for plots.\n";
+ exit(0);
+}
+
+#  Output files: data for forward matches (FD), data for reverse matches
+#  (RD) and the gnuplot script (GP).  Fail right away if any of them cannot
+#  be created; previously a failure here went unnoticed and every later
+#  print was silently lost.
+#
+open(FD, "> $ot.fdat")  or die "Failed to open '$ot.fdat' for writing: $!\n";
+open(RD, "> $ot.rdat")  or die "Failed to open '$ot.rdat' for writing: $!\n";
+open(GP, "> $ot.gp")    or die "Failed to open '$ot.gp' for writing: $!\n";
+
+#  Fixed part of the gnuplot script: layout, labels and line styles.
+#
+print GP "set size 1,1\n";
+print GP "set grid\n";
+print GP "unset key\n";
+print GP "set border 10\n";
+print GP "set tics scale 0\n";
+print GP "set xlabel \"REF\"\n";
+print GP "set ylabel \"ASM\"\n";
+print GP "set title \"$ot\"\n";
+print GP "set format \"%.0f\"\n";
+#print GP "set mouse format \"%.0f\"\n";
+#print GP "set mouse mouseformat \"[%.0f, %.0f]\"\n";
+#print GP "set mouse clipboardformat \"[%.0f, %.0f]\"\n";
+print GP "set style line 1 lt 1 lw 1 pt 6 ps 1\n";
+print GP "set style line 2 lt 3 lw 1 pt 6 ps 1\n";
+print GP "set style line 3 lt 2 lw 1 pt 6 ps 1\n";
+
+# We need to know the length of the reference so we can cycle
+# the coordinates
+#
+my $refLength = 0;
+
+# And we need to know the lengths of the scaffolds in the assembly.
+#
+my $asmFile1;
+my $asmId1;
+my $asmFile2;
+my $asmId2;
+
+# Scan the whole input for the assemblyId1/2 and assemblyFile1/2 settings.
+# Each value is everything after the '=' to the end of the line; if a key
+# appears more than once, the last one wins.
+open(IN, "< $in") or die;
+while (<IN>) {
+ if (m/assemblyId1=(.*)/) {
+ $asmId1 = $1;
+ }
+ if (m/assemblyFile1=(.*)/) {
+ $asmFile1 = $1;
+ }
+ if (m/assemblyFile2=(.*)/) {
+ $asmFile2 = $1;
+ }
+ if (m/assemblyId2=(.*)/) {
+ $asmId2 = $1;
+ }
+}
+close(IN);
+
+
+# Figure out which scaffolds are reversed.
+#
+# After this block, $reversed{id} is 1 if the length-weighted orientation
+# of the scaffold's matches is negative (0 otherwise), and $goofed{id} is
+# 1 if its matches do not all share one orientation (0 otherwise).
+
+my %reversed;
+my %goofed;
+
+{
+ open(IN, "< $in") or die;
+ while (<IN>) {
+ my @v = split '\s+', $_;
+ # Only match records of the requested type are used. Field 8 is the
+ # scaffold id used as hash key here.
+ if (($v[0] eq "M") && ($v[1] eq $mm)) {
+ $reversed{$v[8]} += $v[10] * $v[11]; # length * orientation
+
+ # Remember if there was a goofy inversion in this
+ # scaffold. First time through we remember the
+ # orientation of the first match, then later matches
+ # check it is the same, resetting to a token value (2) if
+ # they differ.
+
+ if (!defined($goofed{$v[8]})) {
+ $goofed{$v[8]} = $v[11];
+ } else {
+ if ($goofed{$v[8]} != $v[11]) {
+ $goofed{$v[8]} = 2;
+ }
+ }
+ }
+ }
+ close(IN);
+ # Collapse the accumulated values into 0/1 flags.
+ foreach my $k (keys %reversed) {
+ if ($reversed{$k} < 0) {
+ $reversed{$k} = 1;
+ } else {
+ $reversed{$k} = 0;
+ }
+ if ($goofed{$k} == 2) {
+ $goofed{$k} = 1;
+ } else {
+ $goofed{$k} = 0;
+ }
+ }
+}
+
+
+
+# Find the reference length
+#
+# $refLength is already declared (and zeroed) near the top of the script;
+# the second "my $refLength" that used to be here only shadowed it.
+
+$refLength = 0;
+
+open(F, "< $asmFile1") or die "Failed to open genome reference '$asmFile1'\n";
+$_ = <F>;  # defline
+while (<F>) {
+    #  The reference is supposed to be a single sequence; if another
+    #  defline shows up anyway, do not count its text as bases.
+    next  if (m/^>/);
+
+    s/^\s+//;
+    s/\s+$//;
+    $refLength += length ($_);
+}
+close(F);
+
+
+# Find the assembly lengths -- ignore anything without a match.
+#
+# %asmLength holds the length of each sequence that has a match (i.e. has
+# an entry in %reversed); $asmLength is the sum over ALL sequences
+# reported, matched or not.
+
+my %asmLength;
+my $asmLength;
+
+# Run the leaff binary that sits next to this script.
+# NOTE(review): neither the open nor the close is checked, so a missing or
+# failing leaff just leaves the lengths empty -- confirm that is intended.
+# The file name is also interpolated into a shell command unquoted.
+open(F, "$FindBin::Bin/leaff -F $asmFile2 -i $asmId2 |");
+while (<F>) {
+ my @v = split '\s+', $_;
+ # Only lines whose first field is "G" are used; field 2 is taken as the
+ # sequence name and field 3 as its length.
+ if ($v[0] eq "G") {
+ $asmLength{$v[2]} = $v[3] if (exists($reversed{$v[2]}));
+ $asmLength += $v[3];
+ }
+}
+close(F);
+
+# Axis ranges: one unit beyond the reference and total assembly lengths.
+print GP "set xrange [0:", $refLength+1, "]\n";
+print GP "set yrange [0:", $asmLength+1, "]\n";
+
+
+# Figure out where to split the reference - we shift both X and Y, so
+# we arbitrarily pick any assembly sequence and anchor it at the
+# origin. Well, not arbitrary. We pick the longest only so that
+# we're pretty sure we didn't pick some crappy tiny contig.
+#
+# The result, $refSplit, is the reference position of the match that lies
+# lowest in the anchor sequence.
+
+my $refSplit = 0;
+
+{
+ my $asmAnchorSequence;
+ my $minAsm = 999999999;
+
+ # Pick the longest sequence as the anchor -- skipping this block
+ # will instead pick the first thing in the file.
+ #
+ # Don't pick anything with reversed crap on the ends; this
+ # greatly screws up.
+ #
+ foreach my $k (keys %asmLength) {
+ if ($goofed{$k}) {
+ next;
+ }
+ if (!defined($asmAnchorSequence) || ($asmLength{$asmAnchorSequence} < $asmLength{$k})) {
+ $asmAnchorSequence = $k;
+ }
+ }
+
+ # Except when all scaffolds are goofed. *sigh*
+ # Then take the longest sequence regardless.
+
+ if (!defined($asmAnchorSequence)) {
+ foreach my $k (keys %asmLength) {
+ if (!defined($asmAnchorSequence) || ($asmLength{$asmAnchorSequence} < $asmLength{$k})) {
+ $asmAnchorSequence = $k;
+ }
+ }
+ }
+
+ open(IN, "< $in") or die;
+ while (<IN>) {
+ my @v = split '\s+', $_;
+
+ if (($v[0] eq "M") && ($v[1] eq $mm)) {
+ # For a reversed scaffold, flip the match position (field 9) within
+ # the scaffold and negate its orientation (field 11).
+ if ($reversed{$v[8]}) {
+ $v[9] = $asmLength{$v[8]} - ($v[9] + $v[10]);
+ $v[11] *= -1;
+ }
+
+ # Last resort: no lengths were found at all, so anchor on the first
+ # matching scaffold in the file.
+ if (!defined($asmAnchorSequence)) {
+ $asmAnchorSequence = $v[8];
+ }
+
+ # Track the lowest match position in the anchor and remember the
+ # reference position (field 5) that goes with it.
+ if ($v[8] eq $asmAnchorSequence) {
+ if ($v[9] < $minAsm) {
+ $minAsm = $v[9];
+ $refSplit = $v[5];
+ }
+ }
+ }
+ }
+ close(IN);
+}
+
+# Figure out how to place the assembly sequences
+#
+# $refSplit controls where we shift the reference origin.
+# %offsetRef controls where we place the assembly in the Y axis.
+#
+# We want to rotate the reference so the largest scaffold is placed
+# at the origin.
+
+my %offsetRef;
+
+{
+    my %minY;         #  smallest rotated reference position, matches >= 2000 only
+    my %minYbackup;   #  the same, over the shorter matches
+
+    open(IN, "< $in") or die;
+    while (<IN>) {
+        my @v = split '\s+', $_;
+        if (($v[0] eq "M") && ($v[1] eq $mm)) {
+            if ($reversed{$v[8]}) {
+                $v[9] = $asmLength{$v[8]} - ($v[9] + $v[10]);
+                $v[11] *= -1;
+            }
+
+            #  Confusing.  Rotate the reference coordinate, then
+            #  remember the smallest for each scaffold.
+
+            $v[5] -= $refSplit;
+            $v[5] += $refLength if ($v[5] < 0);
+
+            my $d = $v[5];
+
+            #  Ignore if this is a tiny crappy little thing.  This
+            #  allows us to place most of the real matches on the
+            #  diagonal, showing obvious small chimers.
+
+            if ($v[10] >= 2000) {
+                if (!exists($minY{$v[8]}) || ($d < $minY{$v[8]})) {
+                    $minY{$v[8]} = $d;
+                }
+            } else {
+                #  This used to test exists($minY{...}), which made the
+                #  backup hold the LAST small match seen rather than the
+                #  smallest position.
+                if (!exists($minYbackup{$v[8]}) || ($d < $minYbackup{$v[8]})) {
+                    $minYbackup{$v[8]} = $d;
+                }
+            }
+        }
+    }
+    close(IN);
+
+    #  If we never found a large block, use the smallest position seen
+    #  among the small ones.
+
+    foreach my $k (keys %minYbackup) {
+        if (!exists($minY{$k})) {
+            $minY{$k} = $minYbackup{$k};
+        }
+    }
+
+    #  Stack the scaffolds along Y in order of where they first hit the
+    #  rotated reference.
+
+    my $lengthsum = 0;
+    foreach my $k (sort { $minY{$a} <=> $minY{$b} } keys %minY) {
+        $offsetRef{$k} = $lengthsum;
+        $lengthsum += $asmLength{$k};
+    }
+}
+
+#  Y tics: one label per placed scaffold, at the scaffold's Y offset.
+#
+print GP "set ytics ( \\\n";
+
+{
+    #  Build all the entries first, then join them; entries are separated
+    #  by a comma and every line ends with a continuation backslash.
+    my @ytics = map(" \"$_\" $offsetRef{$_}", keys %offsetRef);
+
+    if (scalar(@ytics) > 0) {
+        print GP join(",\\\n", @ytics), "\\\n";
+    }
+}
+print GP ")\n";
+
+#  X tics: a single label marking where the reference origin ended up
+#  after rotation.
+#
+print GP "set xtics ( \\\n";
+#for (my $p=500000; $p<$refLength; $p += 500000) {
+#    my $i = $p - $refSplit;
+#    if ($i < 0) {
+#        $i += $refLength;
+#    }
+#    print GP "  \"$p\" $i,\\\n";
+#}
+print GP " \"origin\" " . ($refLength - $refSplit) . "\\\n";
+print GP ")\n";
+
+
+
+
+# Count of segments written to the forward and reverse data files; used
+# later to decide which files to plot.
+my $hasFdat = 0;
+my $hasRdat = 0;
+
+# Final pass over the matches: write each one as a two-point line segment.
+open(IN, "< $in") or die;
+while (<IN>) {
+ my @v = split '\s+', $_;
+
+ if (($v[0] eq "M") && ($v[1] eq $mm)) {
+ # Flip matches that lie in reversed scaffolds.
+ if ($reversed{$v[8]}) {
+ $v[9] = $asmLength{$v[8]} - ($v[9] + $v[10]);
+ $v[11] *= -1;
+ }
+
+ # a = reference interval (fields 5, 6), b = assembly interval
+ # (fields 9, 10).
+ my $abeg = $v[5];
+ my $aend = $v[5] + $v[6];
+ my $bbeg = $v[9];
+ my $bend = $v[9] + $v[10];
+
+ # Rotate the reference so $refSplit becomes the origin, wrapping
+ # around the end of the reference when that goes negative.
+ $abeg -= $refSplit;
+ $aend -= $refSplit;
+
+ if (($abeg < 0) || ($aend < 0)) {
+ $abeg += $refLength;
+ $aend += $refLength;
+ }
+
+ # Shift the scaffold to its slot on the Y axis.
+ $bbeg += $offsetRef{$v[8]};
+ $bend += $offsetRef{$v[8]};
+
+ # Orientation 1 goes to the forward file; anything else goes to the
+ # reverse file with the Y endpoints swapped. Segments are separated
+ # by two blank lines.
+ if ($v[11] == 1) {
+ $hasFdat++;
+ print FD "$abeg $bbeg\n";
+ print FD "$aend $bend\n";
+ print FD "\n\n";
+ } else {
+ $hasRdat++;
+ print RD "$abeg $bend\n";
+ print RD "$aend $bbeg\n";
+ print RD "\n\n";
+ }
+ }
+}
+close(IN);
+
+close(FD);
+close(RD);
+
+#  First plot: PNG.
+#
+print GP "set terminal png tiny size 800,800\n";
+print GP "set output \"$ot.png\"\n";
+
+#  Plot only the data files that actually received segments.
+#
+if ($hasFdat && $hasRdat) {
+    print GP "plot \\\n";
+    print GP " \"$ot.fdat\" w lp ls 1, \\\n";
+    print GP " \"$ot.rdat\" w lp ls 2\n";
+    #print GP "pause -1\n";
+} elsif ($hasFdat) {
+    print GP "plot \\\n";
+    print GP " \"$ot.fdat\" w lp ls 1\n";
+    #print GP "pause -1\n";
+} elsif ($hasRdat) {
+    print GP "plot \\\n";
+    print GP " \"$ot.rdat\" w lp ls 2\n";
+    #print GP "pause -1\n";
+} else {
+    #  No matches??
+    #die;
+}
+
+#  Second plot: the same thing again as PostScript.
+#
+print GP "set terminal postscript color\n";
+print GP "set output \"$ot.ps\"\n";
+print GP "replot\n";
+
+close(GP);
+
+#  Run gnuplot on the script.  The list form of system() hands the file
+#  name to gnuplot directly instead of through the shell, so spaces or
+#  shell metacharacters in the output prefix cannot break (or alter) the
+#  command.  A failure is reported but is not fatal.
+#
+if (system("gnuplot", "$ot.gp") != 0) {
+    print STDERR "WARNING: gnuplot failed on '$ot.gp' (status $?).\n";
+}
+
+#unlink "$ot.fdat";
+#unlink "$ot.rdat";
+#unlink "$ot.gp";
diff --git a/atac-driver/matchExtender/Make.include b/atac-driver/matchExtender/Make.include
new file mode 100644
index 0000000..0a60e5c
--- /dev/null
+++ b/atac-driver/matchExtender/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+#
+# Build description for the matchExtender executable.
+# NOTE(review): $/ is presumably set by the including makefile to this
+# fragment's directory -- confirm against the top-level Make.rules.
+
+# Resolved paths of the library directories this tool depends on.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# C++ sources in this directory and the executable built from them.
+$/.CXX_SRCS := $/matchExtender.C $/matchExtender-dump.C $/matchExtender-func.C
+$/.CXX_EXES := $/matchExtender
+
+# File patterns listed for cleanup in this directory.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# The executable depends on its three objects and the four static libraries.
+$/matchExtender: $/matchExtender.o $/matchExtender-dump.o $/matchExtender-func.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path used for this directory's
+# dependency (.d) and object (.o) files.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/matchExtender/match.H b/atac-driver/matchExtender/match.H
new file mode 100644
index 0000000..552ef1f
--- /dev/null
+++ b/atac-driver/matchExtender/match.H
@@ -0,0 +1,127 @@
+#ifndef MATCH_H
+#define MATCH_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atac.H"
+#include "bio++.H"
+#include "seqCache.H"
+
+// One ungapped match between a region of sequence 1 and a region of
+// sequence 2, plus the two accessors used to walk and resize it.
+//
+// Ownership: _acc1 and _acc2 are allocated by the constructor and released by
+// the destructor; _seq1 and _seq2 are only borrowed.  No copy constructor or
+// assignment operator is defined, so copying an instance would delete the
+// accessors twice -- handle matches through pointers only.
+//
+// The current extent of the match is the accessor range (see len(), pos1(),
+// pos2()); _pos1/_len1/_pos2/_len2 keep the values given at construction.
+//
+class match_s {
+public:
+  char            _matchId[32];
+
+  seqInCore      *_seq1;
+  FastAAccessor  *_acc1;
+  uint32          _iid1;
+  uint32          _pos1;
+  uint32          _len1;
+  uint32          _ori1;
+
+  seqInCore      *_seq2;
+  FastAAccessor  *_acc2;
+  uint32          _iid2;
+  uint32          _pos2;
+  uint32          _len2;
+  uint32          _ori2;
+
+  bool            _isDeleted;
+
+  // Our diagonal never, ever changes
+  uint32          _diagonal;
+
+public:
+  // Builds a match from an id string, and for each sequence: the sequence
+  // itself, its iid, the start position, the length and the orientation.
+  //
+  match_s(char *matchId,
+          seqInCore *s1, uint32 i1, uint32 p1, uint32 l1, uint32 o1,
+          seqInCore *s2, uint32 i2, uint32 p2, uint32 l2, uint32 o2) {
+
+    // strncpy() does not terminate the destination when the source is as
+    // long as the limit, and _matchId is later printed with %s.  Copy one
+    // byte less than the buffer holds and terminate explicitly; ids that
+    // do not fit are truncated.
+    //
+    strncpy(_matchId, matchId, sizeof(_matchId) - 1);
+    _matchId[sizeof(_matchId) - 1] = 0;
+
+    _seq1 = s1;
+    _acc1 = new FastAAccessor(_seq1, false);
+    _iid1 = i1;
+    _pos1 = p1;
+    _len1 = l1;
+    _ori1 = o1;
+
+    // The second accessor is created with 'true' when the orientations
+    // differ.  NOTE(review): presumably this selects reverse-complement
+    // access -- confirm against FastAAccessor.
+    //
+    _seq2 = s2;
+    _acc2 = new FastAAccessor(_seq2, (o1 != o2));
+    _iid2 = i2;
+    _pos2 = p2;
+    _len2 = l2;
+    _ori2 = o2;
+
+    _isDeleted = false;
+
+    _acc1->setRange(_pos1, _len1);
+    _acc2->setRange(_pos2, _len2);
+
+    _acc1->setPosition(_pos1);
+    _acc2->setPosition(_pos2);
+
+    // The diagonal is offset by the length of sequence 1 so that it stays
+    // non-negative; for differing orientations the position in sequence 2
+    // is measured from the other end of that sequence.
+    //
+    if (_ori1 == _ori2)
+      _diagonal = _seq1->sequenceLength() - _pos1 + _pos2;
+    else
+      _diagonal = _seq1->sequenceLength() - _pos1 + _seq2->sequenceLength() - (_pos2 + _len2);
+  }
+
+  ~match_s() {
+    delete _acc1;
+    delete _acc2;
+  }
+
+
+  // Compare by diagonal, then by the original position in the first
+  // sequence.
+  //
+  bool operator<(const match_s& r) const {
+    if (_diagonal <  r._diagonal)  return(true);
+    if (_diagonal == r._diagonal)  return(_pos1 < r._pos1);
+    return(false);
+  }
+
+  // Prints a description of the match to 'out', each line prefixed by
+  // 'descr'; with showSeq the bases of both ranges are printed too.
+  //
+  void dump(FILE *out, const char *descr, bool showSeq=false);
+
+
+  // For compatibility
+
+  // Forward 'num' to extendLeft() / extendRight() of both accessors, so the
+  // two ranges always change together.  Callers pass negative values to
+  // shrink the match.
+  //
+  void extendLeft(int32 num) {
+    _acc1->extendLeft(num);
+    _acc2->extendLeft(num);
+  }
+
+  void extendRight(int32 num) {
+    _acc1->extendRight(num);
+    _acc2->extendRight(num);
+  }
+
+  bool isDeleted(void)   { return(_isDeleted); }
+  void setDeleted(void)  { _isDeleted = true;  }
+
+
+  // True if 'm' exists, lies on our diagonal, and begins at or before the
+  // end of our range.  Since we're on the same diagonal, and ungapped, the
+  // choice of testing sequence 1 or 2 is arbitrary.
+  //
+  bool canMergeWith(match_s *m) {
+    return((m != 0L) &&
+           (_diagonal == m->_diagonal) &&
+           (_acc1->getRangeEnd() >= m->_acc1->getRangeBegin()));
+  }
+
+  // Extend us to end where m ends; does nothing if m ends before we do.
+  //
+  void consume(match_s *m) {
+    if (m->_acc1->getRangeEnd() > _acc1->getRangeEnd())
+      extendRight(m->_acc1->getRangeEnd() - _acc1->getRangeEnd());
+  }
+
+  // Current length and start positions, taken from the accessor ranges.
+  //
+  uint32     len(void)   { return(_acc1->getRangeEnd() - _acc1->getRangeBegin()); }
+  uint32     pos1(void)  { return(_acc1->getRangeBegin()); }
+  uint32     pos2(void)  { return(_acc2->getRangeBegin()); }
+  seqInCore *seq1(void)  { return(_seq1); }
+  seqInCore *seq2(void)  { return(_seq2); }
+};
+
+#endif // MATCH_H
diff --git a/atac-driver/matchExtender/matchExtender-dump.C b/atac-driver/matchExtender/matchExtender-dump.C
new file mode 100644
index 0000000..3611688
--- /dev/null
+++ b/atac-driver/matchExtender/matchExtender-dump.C
@@ -0,0 +1,125 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applied Biosystems
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Dan Fasulo
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "match.H"
+
+
+// Appends to 'out' the bases X yields for the 'margin' positions before the
+// current position, a ':', the next 'len' bases, another ':' and 'margin'
+// more bases.  A blank is written wherever X reports an invalid position.
+// The text is NUL terminated; X is left wherever the walk ended.
+//
+static
+void
+appendFlankedRange(char *out, FastAAccessor &X, uint32 margin, uint32 len) {
+  uint32 i;
+
+  for (i=0; i<margin; i++)
+    --X;
+
+  for (i=0; i<margin; i++, ++X)
+    if (X.isValid())
+      *out++ = *X;
+    else
+      *out++ = ' ';
+
+  *out++ = ':';
+
+  for (i=0; i<len; i++, ++X)
+    if (X.isValid())
+      *out++ = *X;
+    else
+      *out++ = ' ';
+
+  *out++ = ':';
+
+  for (i=0; i<margin; i++, ++X)
+    if (X.isValid())
+      *out++ = *X;
+    else
+      *out++ = ' ';
+
+  *out = 0;
+}
+
+
+// Writes a two line summary of the match to 'out', each line prefixed by
+// 'descr': range begin and length, accessor position and sequence length for
+// both sequences, plus the diagonal and a "reversed" marker on the second
+// line.  If 'showSeq' is set, the bases of both ranges are printed as well,
+// with five flanking bases on either side separated by ':'.
+//
+// The accessor positions are saved and restored around the sequence walk.
+//
+void
+match_s::dump(FILE *out, const char *descr, bool showSeq) {
+
+  // A string literal must be separated from the format macro by white space:
+  // C++11 otherwise reads the macro name as a user-defined-literal suffix.
+  //
+  fprintf(out, "%s: ID:%s range1:" uint32FMT "," uint32FMT " _pos=" uint32FMT " (seqlen=" uint32FMT ")\n",
+          descr, _matchId,
+          _acc1->getRangeBegin(), _acc1->getRangeLength(), _acc1->_pos, _seq1->sequenceLength());
+  fprintf(out, "%s ID:%s range2:" uint32FMT "," uint32FMT " _pos=" uint32FMT " (seqlen=" uint32FMT ") diag:" uint32FMT " %s\n",
+          descr, _matchId,
+          _acc2->getRangeBegin(), _acc2->getRangeLength(), _acc2->_pos, _seq2->sequenceLength(),
+          _diagonal, (_ori1 != _ori2) ? "reversed" : "");
+
+  if (showSeq == false)
+    return;
+
+  FastAAccessor &A = *_acc1;
+  FastAAccessor &B = *_acc2;
+
+  // Save the position of the accessors
+  //
+  uint32 acc1pos = A._pos;
+  uint32 acc2pos = B._pos;
+
+  A.setPosition(A.getRangeBegin());
+  B.setPosition(B.getRangeBegin());
+
+  // Both lines show the length of range 1; the two ranges are resized
+  // together, so the same number of bases is printed for each sequence.
+  //
+  uint32  margin = 5;
+  uint32  len    = _acc1->getRangeEnd() - _acc1->getRangeBegin();
+  char   *seq    = new char [len + margin + margin + 32];
+
+  strcpy(seq, ">>> ");
+  appendFlankedRange(seq + strlen(seq), A, margin, len);
+  fprintf(out, "%s\n", seq);
+
+  strcpy(seq, (_ori1 == _ori2) ? ">>> " : "<<< ");
+  appendFlankedRange(seq + strlen(seq), B, margin, len);
+  fprintf(out, "%s\n", seq);
+
+  delete [] seq;
+
+  // Restore positions.  The raw member is written back, exactly as it was
+  // read above, rather than going through setPosition().
+  //
+  _acc1->_pos = acc1pos;
+  _acc2->_pos = acc2pos;
+}
+
+
+
+
+
+
+
+
+
diff --git a/atac-driver/matchExtender/matchExtender-func.C b/atac-driver/matchExtender/matchExtender-func.C
new file mode 100644
index 0000000..0de2058
--- /dev/null
+++ b/atac-driver/matchExtender/matchExtender-func.C
@@ -0,0 +1,551 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <vector>
+#include <algorithm>
+
+#include "bio++.H"
+#include "atac.H"
+#include "match.H"
+
+using namespace std;
+
+
+extern uint32 minEndRunLen;
+extern uint32 maxMMBlock;
+extern uint32 minBlockSep;
+extern double minIdentity;
+extern uint32 maxNbrSep;
+extern uint32 maxNbrPathMM;
+
+
+//#define DEBUG_TRACE
+//#define DEBUG_TRIMTOPERCENT
+//#define DEBUG_EXTEND
+//#define DEBUG_EXTEND_CONSUME
+//#define DEBUG_EXTEND_BACK
+//#define DEBUG_EXTEND_FORWARD
+
+
+
+// Return true if c1 and c2 are identities, false otherwise.
+//
+// Both letters must have an entry other than 0xff in letterToBits, and the
+// pair must be marked in IUPACidentity.
+//
+// The letters are converted to unsigned char before being used as table
+// indices: where plain char is signed, a byte of 0x80 or above would
+// otherwise become a negative index.
+// NOTE(review): this assumes letterToBits has an entry for every unsigned
+// char value and marks bytes of 0x80 and above with 0xff -- confirm.
+//
+bool
+isIdentity(char c1, char c2) {
+  unsigned char  u1 = (unsigned char)c1;
+  unsigned char  u2 = (unsigned char)c2;
+
+  return((letterToBits[u1] != 0xff) &&
+         (letterToBits[u2] != 0xff) &&
+         IUPACidentity[u1][u2]);
+}
+
+
+
+// Trims match 'midx' to its longest sub-block in which at least the fraction
+// 'pct' of the aligned bases are identities.
+//
+// Every start offset is tried.  For each one the block is grown a base at a
+// time while a running count of identities is kept, and the longest length
+// that still meets the threshold is remembered.  Among equally long blocks
+// the one with the smallest start wins.
+//
+// Returns true if the match was shortened, false if it was left untouched.
+//
+bool
+trim_to_pct(vector<match_s *>& matches, uint32 midx, double pct) {
+#ifdef DEBUG_TRACE
+  fprintf(stderr, "trim_to_pct()\n");
+#endif
+
+  uint32   best_start = 0;
+  uint32   best_len   = 0;
+  match_s *m          = matches[midx];
+
+  FastAAccessor &A = *m->_acc1;
+  FastAAccessor &B = *m->_acc2;
+
+  A.setPosition(m->pos1());
+  B.setPosition(m->pos2());
+
+#ifdef DEBUG_TRIMTOPERCENT
+  //m->dump(stderr, "TrimToPercent", false);
+#endif
+
+  // For all starting positions:
+  //
+  // The loop stops early once no more than best_len bases remain after
+  // 'start', because no longer block can begin there.
+  //
+  for (uint32 start=0;
+       (start < m->len()) && (m->len() - start > best_len);
+       ++start) {
+    uint32 best_run_len = 0;
+    uint32 sum          = 0;
+
+    A.setPosition(m->pos1() + start);
+    B.setPosition(m->pos2() + start);
+
+    // And all ending positions:
+    //
+    // Count the identities seen so far, and remember the longest length
+    // that meets the threshold.
+    //
+    for (uint32 len = 1; start + len <= m->len(); ++len) {
+      char c1 = *A;
+      char c2 = *B;
+
+      // We just extend the last result by one, rather than recompute
+      // the whole value for our new range (start, len).
+      //
+      if (isIdentity(c1, c2))
+        sum++;
+
+      // 'len' only grows, so any length that meets the threshold is the
+      // longest one seen so far for this start.
+      //
+      if (sum >= pct * len)
+        best_run_len = len;
+
+      ++A;
+      ++B;
+    }
+
+    // Special case: if the whole string is okay, don't check any
+    // subranges
+    //
+    if ((start == 0) && (best_run_len == m->len()))
+      return(false);
+
+    // If we've just found a longer subrange, remember it.
+    //
+    if (best_run_len > best_len) {
+      best_start = start;
+      best_len   = best_run_len;
+    }
+  }
+
+  if (best_len < m->len()) {
+#ifdef DEBUG_TRIMTOPERCENT
+    fprintf(stderr, "============================================================\n");
+    fprintf(stderr, "Trimming to substring with start=" uint32FMT " and len=" uint32FMT " for percent identity\n",
+            best_start, best_len);
+    m->dump(stderr, "BEFORE", true);
+#endif
+
+    // Order matters: the left trim is applied first, so the m->len() read
+    // on the next line is already reduced by best_start, and the difference
+    // to best_len is exactly what remains to be removed on the right.
+    //
+    m->extendLeft(-(int32)best_start);
+    m->extendRight(-(int32)(m->len() - best_len));
+
+#ifdef DEBUG_TRIMTOPERCENT
+    m->dump(stderr, "AFTER", true);
+    fprintf(stderr, "============================================================\n");
+#endif
+
+    return(true);
+  }
+
+  return(false);
+}
+
+
+// Extends match 'midx' to the left, one base at a time, comparing only
+// positions greater than 'min_start_pos' in sequence 1.
+//
+// Bases are added to the match only once a run of minEndRunLen consecutive
+// identities has been seen; until then they are held as "pending".  The walk
+// gives up when more than maxMMBlock mismatches have accumulated; the
+// mismatch count is cleared whenever a run of identities reaches minBlockSep.
+// If the walk instead stops at the end of a sequence or at 'min_start_pos',
+// the pending bases are added to the match.
+//
+void
+extend_match_backward(vector<match_s *>& matches,
+                      uint32 midx,
+                      uint32 min_start_pos) {
+#ifdef DEBUG_TRACE
+  fprintf(stderr, "extend_match_backward()-- min_start_pos=" uint32FMT "\n", min_start_pos);
+#endif
+
+  // Assumes when traveling backwards that we will never run into
+  // another match (otherwise, that match would have been forward
+  // extended previously).
+
+  uint32   num_recent_mismatches = 0;
+  match_s *m                     = matches[midx];
+  uint32   good_run_len          = m->len();   // the match itself counts as a good run
+  uint32   num_pending           = 0;
+
+  FastAAccessor &A = *m->_acc1;
+  FastAAccessor &B = *m->_acc2;
+
+  A.setPosition(m->_acc1->getRangeBegin());
+  B.setPosition(m->_acc2->getRangeBegin());
+
+  // Decrement, instead of subtract one from the position above, to
+  // avoid any issues with overflow (e.g., 0 - 1).
+  //
+  --A;
+  --B;
+
+  while ((A.getPosition() > min_start_pos) && A.isValid() && B.isValid()) {
+    char c1 = *A;
+    char c2 = *B;
+
+    if (isIdentity(c1, c2)) {
+      good_run_len++;
+
+      // If we've gone long enough, erase our mismatch record
+      //
+      if (good_run_len == minBlockSep)  // 20 by default
+        num_recent_mismatches = 0;
+
+      // If we're in the middle of a long good run, add the character
+      // to the match (minEndRunLen is 10 by default).
+      //
+      // Otherwise, if we just made the minimum extension length, add
+      // all of the pending characters.
+      //
+      // Otherwise, this character is pending.  Pending characters are
+      // still added after the loop if we ran out of sequence.
+      //
+      if (good_run_len > minEndRunLen) {
+        m->extendLeft(1);
+      } else if (good_run_len == minEndRunLen) {
+        m->extendLeft(num_pending + 1);
+        num_pending = 0;
+      } else {
+        num_pending++;
+      }
+    } else {
+      good_run_len = 0;
+      num_pending++;
+      num_recent_mismatches++;
+      if (num_recent_mismatches > maxMMBlock)  // 3 by default
+        break;
+    }
+
+    --A;
+    --B;
+  }
+
+  // If we hit the end of the sequence, and are good, do extension
+  //
+  //  if ((A.getPosition() == min_start_pos) || (B.getPosition() == 0))
+  //
+  if (!A.isValid() || !B.isValid() || (A.getPosition() <= min_start_pos))
+    m->extendLeft(num_pending);
+
+
+  // The sequences are identified by their iid here; match_s has no _id1 or
+  // _id2 member, so the print below uses _iid1 and _iid2.
+  //
+#ifdef DEBUG_EXTEND_BACK
+  fprintf(stderr, "extend_back()-- M u %s . " uint32FMT " " uint32FMT " " uint32FMT " 1 " uint32FMT " " uint32FMT " " uint32FMT " 1\n",
+          m->_matchId,
+          m->_iid1, m->_acc1->getRangeBegin(), m->_acc1->getRangeLength(),
+          m->_iid2, m->_acc2->getRangeBegin(), m->_acc2->getRangeLength());
+#endif
+}
+
+
+// Decides whether 'src' can be joined to the later match 'dest' using the
+// "at most k mismatches" rule.
+//
+// Returns true if 'dest' starts inside 'src', or if 'dest' starts no more
+// than maxNbrSep bases after the end of 'src' and the walk over the gap ends
+// with at most maxNbrPathMM mismatches.  The walk also ends, with the
+// mismatches counted so far, if either accessor becomes invalid.
+//
+// The positions of the accessors of 'src' are changed by the walk.
+//
+bool
+can_reach_nearby_match(match_s *src, match_s *dest) {
+#ifdef DEBUG_TRACE
+  fprintf(stderr, "can_reach_nearby_match()\n");
+#endif
+
+  uint32  src_end = src->pos1() + src->len();
+
+  // All positions are unsigned, so the gap below would wrap around to a
+  // huge value if dest began before the end of src, and the overlapping
+  // pair would be rejected as too far apart.  There is no gap to cross in
+  // that case; report it as reachable, in agreement with canMergeWith().
+  //
+  if (dest->pos1() < src_end)
+    return(true);
+
+  if (dest->pos1() - src_end > maxNbrSep)  // 100
+    return(false);
+
+#if 0
+  src->dump(stderr, "src:");
+  dest->dump(stderr, "dst:");
+#endif
+
+  FastAAccessor &A = *src->_acc1;
+  FastAAccessor &B = *src->_acc2;
+
+  // Go to the last base of the range, then step onto the first base after
+  // it.
+  //
+  A.setPosition(A.getRangeEnd() - 1);
+  B.setPosition(B.getRangeEnd() - 1);
+
+  ++A;
+  ++B;
+
+  uint32 num_mismatch = 0;
+
+  while ((num_mismatch <= maxNbrPathMM) &&  // 5
+         (A.getPosition() < dest->pos1()) &&
+         (A.isValid()) &&
+         (B.isValid())) {
+    if (!isIdentity(*A, *B))
+      num_mismatch++;
+
+    ++A;
+    ++B;
+  }
+
+#if 0
+  fprintf(stderr, "num_mismatch=%d pos: %d %d valid: A:%d B:%d\n",
+          num_mismatch, A.getPosition(), dest->pos1(), A.isValid(), B.isValid());
+#endif
+
+  return(num_mismatch <= maxNbrPathMM);  // 5
+}
+
+
+
+// Extends match 'midx' to the right, one base at a time.
+//
+// Stops and returns true if we hit the next match ('target', which may be
+// null).  Returns false if more than maxMMBlock mismatches accumulate or if
+// the end of either sequence is reached first.
+//
+bool
+extend_match_forward(vector<match_s *>& matches, uint32 midx, match_s *target) {
+#ifdef DEBUG_TRACE
+ fprintf(stderr, "extend_match_forward()\n");
+#endif
+
+ match_s *m = matches[midx];
+ uint32 num_recent_mismatches = 0;
+ uint32 num_pending = 0;
+
+ // The existing match counts as a run of identities of its own length.
+ uint32 good_run_len = (int) m->len();
+
+ FastAAccessor &A = *m->_acc1;
+ FastAAccessor &B = *m->_acc2;
+
+#ifdef DEBUG_EXTEND_FORWARD
+ fprintf(stderr, "extend_match_forward()-- A:%4d-%4d B:%4d-%4d\n",
+ A.getRangeBegin(), A.getRangeLength(),
+ B.getRangeBegin(), B.getRangeLength());
+#endif
+
+ // Set our position to the last valid base in the range, then move
+ // to the next one.
+ //
+ A.setPosition(A.getRangeEnd() - 1);
+ B.setPosition(B.getRangeEnd() - 1);
+
+ ++A;
+ ++B;
+
+ while (A.isValid() && B.isValid()) {
+ char c1 = *A;
+ char c2 = *B;
+
+ if (isIdentity(c1, c2)) {
+ good_run_len++;
+
+ //fprintf(stderr, "extend-forward %c %c\n", c1, c2);
+
+ // A run of minBlockSep identities clears the mismatch count.
+ //
+ if (good_run_len == minBlockSep)
+ num_recent_mismatches = 0;
+
+ // If not enough good characters yet, increase the length
+ // pending. We used to check for hitting the end of the
+ // sequence here.
+ //
+ // Otherwise, if we have just made the minimum good run length,
+ // do the extension.
+ //
+ // Otherwise, if we're above the minimum good length, extend by
+ // another character.
+ //
+ if (good_run_len < minEndRunLen) {
+ num_pending++;
+ } else if (good_run_len == minEndRunLen) {
+ m->extendRight(num_pending + 1);
+ num_pending = 0;
+ } else if (good_run_len > minEndRunLen) {
+ m->extendRight(1);
+ }
+
+ // If we've run into (and possibly over) another seed match,
+ // return so the main loop can consume and restart. Note that
+ // this is only tested after an identity.
+ //
+ if (m->canMergeWith(target))
+ return(true);
+ } else {
+ // A mismatch ends the current run; the base stays pending.
+ good_run_len = 0;
+ num_pending++;
+ num_recent_mismatches++;
+
+ if (num_recent_mismatches > maxMMBlock)
+ return(false);
+ }
+
+ ++A;
+ ++B;
+ }
+
+ // If we've got a short good run but have hit the end of
+ // a sequence, do extension.
+ //
+ if ((!A.isValid() || !B.isValid()) && (good_run_len < minEndRunLen))
+ m->extendRight(num_pending);
+
+#ifdef DEBUG_EXTEND_FORWARD
+ fprintf(stderr, "extend_match_forward(finish)-- A:%4d-%4d B:%4d-%4d\n",
+ A.getRangeBegin(), A.getRangeLength(),
+ B.getRangeBegin(), B.getRangeLength());
+#endif
+
+ return(false);
+}
+
+
+
+
+
+
+
+#ifdef DEBUG_EXTEND
+// Debug aid: prints the current ranges of 'm' as a match line that starts
+// with 'tag'.  The sequences are identified by their iid; match_s has no
+// _id1 or _id2 member.
+//
+static
+void
+debug_print_match(const char *tag, match_s *m) {
+  fprintf(stderr, "%s u %s . " uint32FMT " " uint32FMT " " uint32FMT " 1 " uint32FMT " " uint32FMT " " uint32FMT " 1\n",
+          tag, m->_matchId,
+          m->_iid1, m->_acc1->getRangeBegin(), m->_acc1->getRangeLength(),
+          m->_iid2, m->_acc2->getRangeBegin(), m->_acc2->getRangeLength());
+}
+#endif
+
+
+// Extends all matches that share the diagonal of matches[diag_start].  The
+// vector is expected to hold the matches of one diagonal next to each other,
+// ordered by position, starting at 'diag_start'.
+//
+// Pass one extends every match backward, never past the end of the match
+// before it.  Pass two works forward: each live match first tries to reach
+// the next live match with the mismatch-count rule, then with base-by-base
+// extension; a match that is reached is consumed and marked deleted, and the
+// same match is tried again.  When neither works the match is trimmed to the
+// minIdentity threshold and the next match is processed.
+//
+// Returns the index of the first match after this diagonal.
+//
+uint32
+extend_matches_on_diagonal(vector<match_s *>& matches, uint32 diag_start) {
+#ifdef DEBUG_TRACE
+  fprintf(stderr, "extend_matches_on_diagonal()\n");
+#endif
+
+  uint32   diag_id  = matches[diag_start]->_diagonal;
+  uint32   idx;
+  uint32   prev_end = 0;
+  match_s *m;
+  match_s *next_m   = NULL;
+
+  // Back extend each match as far as possible (but never over the
+  // preceding match)
+  //
+  for (idx = diag_start;
+       (idx < matches.size()) && (matches[idx]->_diagonal == diag_id);
+       ++idx) {
+
+    m = matches[idx];
+
+#ifdef DEBUG_EXTEND_BACK
+    m->dump(stderr, "Before back extension:", true);
+#endif
+
+    extend_match_backward(matches, idx, prev_end);
+
+#ifdef DEBUG_EXTEND_BACK
+    m->dump(stderr, "After back extension:", true);
+#endif
+
+#ifdef DEBUG_EXTEND
+    debug_print_match("1M", m);
+#endif
+
+    prev_end = m->pos1() + m->len();
+
+    // Positions are unsigned, so a start that moved below zero shows up as
+    // a value beyond the end of the sequence.
+    //
+    if ((m->pos1() > m->seq1()->sequenceLength()) || (m->pos2() > m->seq2()->sequenceLength()))
+      m->dump(stderr, "NEGATIVE after back extend!\n", true), abort();
+  }
+
+
+  // Now forward extend each match
+
+
+  idx = diag_start;
+  while ((idx < matches.size()) &&
+         (matches[idx]->_diagonal == diag_id)) {
+
+    if (matches[idx]->isDeleted()) {
+      idx++;
+      continue;
+    }
+
+    m = matches[idx];
+
+#ifdef DEBUG_EXTEND
+    debug_print_match("2M", m);
+#endif
+
+    // Find the next match on this diagonal that has not been consumed.
+    //
+    next_m = 0L;
+
+    for (uint32 next_idx=idx+1; ((next_idx < matches.size()) &&
+                                 (matches[next_idx]->_diagonal == diag_id) &&
+                                 (next_m == 0L)); next_idx++)
+      if (matches[next_idx]->isDeleted() == false)
+        next_m = matches[next_idx];
+
+    // First, try to reach the next match with the simple "maximum of
+    // k mismatches" rule.  If we made it, consume the next match and
+    // start the loop again with the same match (now extended)
+    //
+    if (next_m && can_reach_nearby_match(m, next_m)) {
+#ifdef DEBUG_EXTEND_CONSUME
+      m->dump(stderr, "I can_reach_nearby_match and extend this", true);
+      next_m->dump(stderr, "with this", true);
+#endif
+      m->consume(next_m);
+      next_m->setDeleted();
+#ifdef DEBUG_EXTEND_CONSUME
+      m->dump(stderr, "Extended through next match via neighbor search:", true);
+#endif
+      continue;
+    }
+
+    // Otherwise, try to make it to the next match with the
+    // character-at-a-time extension rules.  If we make it, restart
+    // the loop with the same match (now extended).  Otherwise, trim
+    // the extended match as necessary and move on to the next
+    // match.
+    //
+    if (extend_match_forward(matches, idx, next_m)) {
+#ifdef DEBUG_EXTEND_CONSUME
+      m->dump(stderr, "I extend_match_forward and extend this", true);
+      next_m->dump(stderr, "with this", true);
+#endif
+      m->consume(next_m);
+      next_m->setDeleted();
+#ifdef DEBUG_EXTEND_CONSUME
+      m->dump(stderr, "Extended through next match via forward extension:", true);
+#endif
+      continue;
+    }
+
+#ifdef DEBUG_EXTEND
+    //m->dump(stderr, "Failed to make next match.  Final extended version:", true);
+    debug_print_match("3M", m);
+#endif
+
+    // Didn't make it, so trim and move on
+    //
+    if (trim_to_pct(matches, idx, minIdentity)) {
+#ifdef DEBUG_EXTEND_TRIMMING
+      m->dump(stderr, "After trimming:", true);
+#endif
+    } else {
+#ifdef DEBUG_EXTEND_TRIMMING
+      fprintf(stderr, "No trimming done.\n");
+#endif
+    }
+
+#ifdef DEBUG_EXTEND
+    debug_print_match("4M", m);
+
+    if ((m->pos1() > m->seq1()->sequenceLength()) || (m->pos2() > m->seq2()->sequenceLength()))
+      m->dump(stderr, "NEGATIVE after forward extend!", true), abort();
+
+    fprintf(stderr, "\n==============\n\n");
+#endif
+
+    ++idx;
+  }
+
+  return idx;
+}
+
+
+
diff --git a/atac-driver/matchExtender/matchExtender.C b/atac-driver/matchExtender/matchExtender.C
new file mode 100644
index 0000000..54e8c7c
--- /dev/null
+++ b/atac-driver/matchExtender/matchExtender.C
@@ -0,0 +1,214 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <vector>
+#include <algorithm>
+using namespace std;
+
+#include "atac.H"
+#include "match.H"
+#include "bio++.H"
+#include "seqCache.H"
+
+
+// Tunable extension parameters and their defaults. main() in this file
+// overrides them from the lowercase options -e, -b, -s, -i, -p and -d.
+// NOTE(review): the trailing notes show uppercase letters and a
+// /matchExtender... name; presumably these are the corresponding settings
+// of the calling driver -- confirm.
+//
+uint32 minEndRunLen = 10; // -E /matchExtenderMinEndRunLen
+uint32 maxMMBlock = 3; // -B /matchExtenderMaxMMBlock
+uint32 minBlockSep = 20; // -S /matchExtenderMinBlockSep
+double minIdentity = 0.95; // -I /matchExtenderMinIdentity
+uint32 maxNbrSep = 100; // -P /matchExtenderMaxNbrSep
+uint32 maxNbrPathMM = 5; // -D /matchExtenderMaxNbrPathMM
+
+
+bool trim_to_pct(vector<match_s *>& matches, uint32 midx, double pct);
+void extend_match_backward(vector<match_s *>& matches, uint32 midx, uint32 min_start_pos);
+bool can_reach_nearby_match(match_s *src, match_s *dest);
+bool extend_match_forward(vector<match_s *>& matches, uint32 midx, match_s *target);
+uint32 extend_matches_on_diagonal(vector<match_s *>& matches, uint32 diag_start);
+
+
+// Ordering used to sort vectors of match pointers: compares the matches
+// pointed to with match_s::operator<.
+//
+// The call operator is const and returns bool, so the comparator can also be
+// used through a const reference and its result needs no conversion.
+//
+class MatchCompare {
+public:
+  bool operator()(const match_s *m1, const match_s *m2) const {
+    return(*m1 < *m2);
+  }
+};
+
+
+
+// Loads the next group of matches that share one (iid1, iid2) pair.
+//
+// On entry 'm' is either null (first call, or the stream is exhausted) or
+// the first match of the group, left over from the previous call.  On
+// return 'm' is the first match of the following group, or null when the
+// stream has ended.
+//
+// Matches whose two orientation flags agree go to fwdMatches, all others to
+// revMatches; both vectors are emptied first and sorted with MatchCompare
+// afterwards.  The match_s objects are allocated here and must be deleted
+// by the caller.
+//
+// Returns false if there was nothing left to read, true otherwise.
+//
+bool
+readMatches(atacFileStreamMerge &AF,
+            atacMatch *&m,
+            seqCache *C1,
+            seqCache *C2,
+            vector<match_s *> &fwdMatches,
+            vector<match_s *> &revMatches) {
+
+  fwdMatches.clear();
+  revMatches.clear();
+
+  // A null 'm' means either "never read" or "end of stream"; one more read
+  // tells the two apart.
+  //
+  if (m == 0L)
+    m = AF.nextMatch('x');
+  if (m == 0L)
+    return(false);
+
+  const uint32  groupIID1 = m->iid1;
+  const uint32  groupIID2 = m->iid2;
+
+  seqInCore *groupSeq1 = C1->getSequenceInCore(groupIID1);
+  seqInCore *groupSeq2 = C2->getSequenceInCore(groupIID2);
+
+  for (;
+       (m != 0L) && (m->iid1 == groupIID1) && (m->iid2 == groupIID2);
+       m = AF.nextMatch('x')) {
+    vector<match_s *> &bucket = (m->fwd1 == m->fwd2) ? fwdMatches : revMatches;
+
+    bucket.push_back(new match_s(m->matchuid,
+                                 groupSeq1, m->iid1, m->pos1, m->len1, m->fwd1,
+                                 groupSeq2, m->iid2, m->pos2, m->len2, m->fwd2));
+  }
+
+  if (!fwdMatches.empty())
+    sort(fwdMatches.begin(), fwdMatches.end(), MatchCompare());
+
+  if (!revMatches.empty())
+    sort(revMatches.begin(), revMatches.end(), MatchCompare());
+
+  return(true);
+}
+
+
+
+
+// matchExtender: reads matches from the given ATAC files, extends and merges
+// them per sequence pair and diagonal, and writes the surviving matches to
+// stdout after the header.
+//
+// The options -e, -b, -s, -i, -p and -d each take one value and override the
+// corresponding global parameter.  Every other argument is added as an input
+// file.  The usage text is printed, and the program exits with status 1, if
+// an option is missing its value or no input file was given.
+//
+int
+main(int argc, char *argv[]) {
+  bool                 fail     = false;
+  uint32               numFiles = 0;
+
+  atacMatch           *m = 0L;
+  atacFileStreamMerge  AF;
+
+  int arg=1;
+  while (arg < argc) {
+    char *opt   = argv[arg];
+    bool  isOpt = ((opt[0] == '-') &&
+                   (opt[1] != 0) &&
+                   (opt[2] == 0) &&
+                   (strchr("ebsipd", opt[1]) != 0L));
+
+    if (isOpt == false) {
+      // Anything that is not one of our options is an input file.
+      AF.addFile(opt);
+      numFiles++;
+
+    } else if (arg + 1 >= argc) {
+      // Do not read past the end of argv for the value.
+      fprintf(stderr, "%s: option %s requires a value\n", argv[0], opt);
+      fail = true;
+
+    } else {
+      char *val = argv[++arg];
+
+      switch (opt[1]) {
+        case 'e':  minEndRunLen = strtouint32(val, 0L);  break;
+        case 'b':  maxMMBlock   = strtouint32(val, 0L);  break;
+        case 's':  minBlockSep  = strtouint32(val, 0L);  break;
+        case 'i':  minIdentity  = atof(val);             break;
+        case 'p':  maxNbrSep    = strtouint32(val, 0L);  break;
+        case 'd':  maxNbrPathMM = strtouint32(val, 0L);  break;
+      }
+    }
+
+    arg++;
+  }
+
+  if (numFiles == 0)
+    fail = true;
+
+  if (fail) {
+    fprintf(stderr, "usage: %s [options] header.atac matches.atac ... > matches.atac\n", argv[0]);
+    fprintf(stderr, " -e <int> matchExtenderMinEndRunLen, 10\n");
+    fprintf(stderr, " -b <int> matchExtenderMaxMMBlock, 3\n");
+    fprintf(stderr, " -s <int> matchExtenderMinBlockSep, 20\n");
+    fprintf(stderr, " -i <float> matchExtenderMinIdentity, 0.95\n");
+    fprintf(stderr, " -p <int> matchExtenderMaxNbrSep, 100\n");
+    fprintf(stderr, " -d <int> matchExtenderMaxNbrPathMM, 5\n");
+    exit(1);
+  }
+
+  AF.writeHeader(stdout);
+
+  seqCache *C1 = new seqCache(AF.assemblyFileA(), 1, false);
+  seqCache *C2 = new seqCache(AF.assemblyFileB(), 1, false);
+
+  C1->loadAllSequences();
+
+  vector<match_s *> fwdMatches;
+  vector<match_s *> revMatches;
+
+  while (readMatches(AF, m, C1, C2, fwdMatches, revMatches)) {
+
+    // Each call handles one diagonal and returns the index at which the
+    // next diagonal starts.
+    //
+    uint32 diag_start = 0;
+    while (diag_start < fwdMatches.size())
+      diag_start = extend_matches_on_diagonal(fwdMatches, diag_start);
+
+    diag_start = 0;
+    while (diag_start < revMatches.size())
+      diag_start = extend_matches_on_diagonal(revMatches, diag_start);
+
+
+    // Dump and destroy all the matches.  Matches consumed by a neighbor are
+    // marked deleted and are not written.  A string literal must be
+    // separated from the format macro by white space: C++11 otherwise reads
+    // the macro name as a user-defined-literal suffix.
+    //
+    for (uint32 i=0; i<fwdMatches.size(); i++) {
+      if (!fwdMatches[i]->isDeleted())
+        fprintf(stdout, "M u %s . %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " 1\n",
+                fwdMatches[i]->_matchId,
+                AF.labelA(), fwdMatches[i]->_iid1, fwdMatches[i]->_acc1->getRangeBegin(), fwdMatches[i]->_acc1->getRangeLength(),
+                AF.labelB(), fwdMatches[i]->_iid2, fwdMatches[i]->_acc2->getRangeBegin(), fwdMatches[i]->_acc2->getRangeLength());
+      delete fwdMatches[i];
+    }
+
+    for (uint32 i=0; i<revMatches.size(); i++) {
+      if (!revMatches[i]->isDeleted())
+        fprintf(stdout, "M u %s . %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " -1\n",
+                revMatches[i]->_matchId,
+                AF.labelA(), revMatches[i]->_iid1, revMatches[i]->_acc1->getRangeBegin(), revMatches[i]->_acc1->getRangeLength(),
+                AF.labelB(), revMatches[i]->_iid2, revMatches[i]->_acc2->getRangeBegin(), revMatches[i]->_acc2->getRangeLength());
+      delete revMatches[i];
+    }
+
+    fwdMatches.clear();
+    revMatches.clear();
+  }
+
+  delete C1;
+  delete C2;
+
+  return(0);
+}
diff --git a/atac-driver/mismatchCounter/Make.include b/atac-driver/mismatchCounter/Make.include
new file mode 100644
index 0000000..54c4117
--- /dev/null
+++ b/atac-driver/mismatchCounter/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+#
+# Build description for the mismatchCounter executable.
+# NOTE(review): $/ is presumably set by the including makefile to this
+# fragment's directory -- confirm against the top-level Make.rules.
+
+# Resolved paths of the library directories this tool depends on.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# The executable built in this directory and its single C++ source.
+$/.CXX_EXES := $/mismatchCounter
+$/.CXX_SRCS := $/mismatchCounter.C
+
+# File patterns listed for cleanup in this directory.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# The executable depends on its object and the four static libraries.
+$/mismatchCounter: $/mismatchCounter.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path used for this directory's
+# dependency (.d) and object (.o) files.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/mismatchCounter/mismatchCounter.C b/atac-driver/mismatchCounter/mismatchCounter.C
new file mode 100644
index 0000000..be9984b
--- /dev/null
+++ b/atac-driver/mismatchCounter/mismatchCounter.C
@@ -0,0 +1,208 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "bio++.H"
+#include "seqCache.H"
+#include "atac.H"
+
+#define ANNOTATE
+#define EXTRAMATCHES
+
+// Generates a histogram of the exact match block sizes
+ // Counts the global number of mismatches
+// Annotates each match with the number of mismatches
+// Checks for identities outside matches
+
+
+ // Records one run of consecutive exact matches in the histogram.
+ //
+ //   blockHistogram - counters indexed by run length; main() allocates
+ //                    8 * 1024 * 1024 of them, so valid indices are
+ //                    0 .. 8M-1.
+ //   blockMatches   - length of the run that just ended.
+ //
+ // Runs too long to have their own counter are accumulated in slot 0
+ // (which also receives the zero-length runs).  The bound has to be >=
+ // rather than >: a run of exactly 8 * 1024 * 1024 bases would otherwise
+ // increment the element one past the end of the array.
+ //
+ void
+ updateExactBlockHistogram(uint32 *blockHistogram, uint32 blockMatches) {
+
+   if (blockMatches >= 8 * 1024 * 1024)
+     blockHistogram[0]++;
+   else
+     blockHistogram[blockMatches]++;
+ }
+
+
+
+ // Reads an atac mapping from stdin, walks every match base by base and
+ // counts the positions at which the two sequences are not identical.
+ // The total number of bases examined and the total number of mismatches
+ // are reported on stderr.  A histogram of exact-match run lengths is
+ // accumulated as well, but writing it out is currently disabled.
+ //
+ // Returns 0 on success; exits with status 1 on an unknown option.
+ //
+ int
+ main(int argc, char *argv[]) {
+
+   // Option parsing.  The four switches are accepted but none of them
+   // changes behavior yet; anything else prints the usage and exits.
+   //
+   int arg=1;
+   while (arg < argc) {
+     if (strcmp(argv[arg], "-h") == 0) {
+       // Generate a histogram of exact-match lengths
+     } else if (strcmp(argv[arg], "-a") == 0) {
+       // Annotate each match with the percent error, compute
+       // the global percent error.
+     } else if (strcmp(argv[arg], "-e") == 0) {
+       // Generate a histogram of the percent error in each match
+     } else if (strcmp(argv[arg], "-c") == 0) {
+       // Check the edges of each match to ensure there isn't a match
+     } else {
+       fprintf(stderr, "usage: %s [-h exact-match-histogram] [-a] [-e error-histogram] [-c]\n", argv[0]);
+       fprintf(stderr, " -h: histogram of the length of the exact match blocks\n");
+       fprintf(stderr, " -a: annotate each match with the percent error, write to stdout\n");
+       fprintf(stderr, " -e: histogram of the error rate of each match\n");
+       fprintf(stderr, " -c: check that the next base on each side is a mismatch\n");
+       exit(1);
+     }
+     arg++;
+   }
+
+   uint32 globalSequence = 0;
+   uint32 globalMismatches = 0;
+   uint32 blockMatches = 0;
+   uint32 *blockHistogram = new uint32 [8 * 1024 * 1024];
+
+   for (uint32 x=0; x<8*1024*1024; x++)
+     blockHistogram[x] = 0;
+
+   atacFile AF("-");
+   atacMatchList &ML = *AF.matches();
+
+   // One sequence cache per assembly.  iid2 of a match indexes assembly B,
+   // so C2 has to be opened on assemblyFileB(); opening assembly A for
+   // both would compare sequence A against itself.
+   //
+   seqCache *C1 = new seqCache(AF.assemblyFileA(), 1, false);
+   seqCache *C2 = new seqCache(AF.assemblyFileB(), 1, false);
+
+   for (uint32 mi=0; mi<ML.numberOfMatches(); mi++) {
+     atacMatch *m = ML.getMatch(mi);
+
+     seqInCore *S1 = C1->getSequenceInCore(m->iid1);
+     seqInCore *S2 = C2->getSequenceInCore(m->iid2);
+
+     // The second accessor is flagged when the two sides of the match
+     // have different orientations.
+     FastAAccessor A1(S1, false);
+     FastAAccessor A2(S2, (m->fwd1 != m->fwd2));
+
+     A1.setRange(m->pos1, m->len1);
+     A2.setRange(m->pos2, m->len2);
+
+     uint32 localMismatches = 0;
+
+ #ifdef EXTRAMATCHES
+     uint32 extraMatchesL = 0;
+     uint32 extraMatchesR = 0;
+
+     // Check for matches on either side of the region.
+
+     A1.setPosition(m->pos1);
+     A2.setPosition(m->pos2);
+     --A1;
+     --A2;
+     while (A1.isValid() &&
+            A2.isValid() &&
+            (letterToBits[(int)*A1] != 0xff)&&
+            (letterToBits[(int)*A2] != 0xff) &&
+            IUPACidentity[(int)*A1][(int)*A2]) {
+       extraMatchesL++;
+       --A1;
+       --A2;
+     }
+
+     A1.setPosition(m->pos1 + m->len1 - 1);
+     A2.setPosition(m->pos2 + m->len2 - 1);
+     ++A1;
+     ++A2;
+     while (A1.isValid() &&
+            A2.isValid() &&
+            (letterToBits[(int)*A1] != 0xff)&&
+            (letterToBits[(int)*A2] != 0xff) &&
+            IUPACidentity[(int)*A1][(int)*A2]) {
+       extraMatchesR++;
+       ++A1;
+       ++A2;
+     }
+
+     // WARN if we found extra identities
+
+ #if 0
+     if (extraMatchesL + extraMatchesR > 0) {
+       A1.setPosition(m->pos1);
+       A2.setPosition(m->pos2);
+
+       chomp(inLine);
+       fprintf(stderr, "WARNING: found "uint32FMT" extra matches to the left and "uint32FMT" extra matches to the right in %s\n",
+               extraMatchesL, extraMatchesR, inLine);
+
+ #if 0
+       for (uint32 ii=0; ii<m->len1; ii++, ++A1)
+         fprintf(stdout, "%c", *A1);
+       fprintf(stdout, "\n");
+
+       for (uint32 ii=0; ii<m->len1; ii++, ++A2)
+         fprintf(stdout, "%c", *A2);
+       fprintf(stdout, "\n");
+ #endif
+     }
+ #endif
+
+ #endif // EXTRAMATCHES
+
+
+     A1.setPosition(m->pos1);
+     A2.setPosition(m->pos2);
+     for (uint32 ii=0; ii<m->len1; ii++, ++A1, ++A2) {
+
+       // A position counts as a match only when both letters are known
+       // to letterToBits and the IUPAC table calls them identical.
+       // Evaluate that once; both tallies below use it.
+       //
+       bool isMatch = ((letterToBits[(int)*A1] != 0xff) &&
+                       (letterToBits[(int)*A2] != 0xff) &&
+                       IUPACidentity[(int)*A1][(int)*A2]);
+
+       // Count global matches / mismatches
+       //
+       globalSequence++;
+       if (!isMatch) {
+         globalMismatches++;
+         localMismatches++;
+       }
+
+       // Histogram of exact match block lengths
+       //
+       if (isMatch) {
+         blockMatches++;
+       } else {
+         updateExactBlockHistogram(blockHistogram, blockMatches);
+         blockMatches = 0;
+       }
+     }
+
+     // Finish off stuff: the run still open at the end of the match.
+     //
+     updateExactBlockHistogram(blockHistogram, blockMatches);
+     blockMatches = 0;
+
+     // If annotate, emit a new record.
+   }
+
+
+   // Report stuff
+   //
+   fprintf(stderr, "globalSequence = "uint32FMT"\n", globalSequence);
+   fprintf(stderr, "globalMismatches = "uint32FMT"\n", globalMismatches);
+
+ #if 0
+   FILE *O = fopen("MismatchCounter.block.histogram.out", "w");
+   for (uint32 i=0; i<8 * 1024 * 1024; i++)
+     fprintf(O, uint32FMT" "uint32FMT"\n", i, blockHistogram[i]);
+   fclose(O);
+ #endif
+
+   // Release what was allocated above.
+   delete [] blockHistogram;
+   delete C1;
+   delete C2;
+
+   return(0);
+ }
diff --git a/atac-driver/relabel.pl b/atac-driver/relabel.pl
new file mode 100644
index 0000000..d95238e
--- /dev/null
+++ b/atac-driver/relabel.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+
+# Reads an atac file, relabels the sequence names (e.g., WGSA:4) with
+# the defline ID's.
+
+use strict;
+
+ # Returns the list of sequence names for a fasta file, one entry per
+ # defline, in file order.  Each name is the first word after the '>'.
+ #
+ # $_[0] is the fasta file name.  Everything from a literal ".fasta"
+ # onwards is stripped to form the base name.  Deflines are read from
+ # "base.deflines" or "base.fasta.deflines" when one of them exists;
+ # otherwise "base.deflines" is created from "base.fasta" first.
+ #
+ # Dies if a needed file cannot be opened.  Lines of the deflines file
+ # that do not look like a defline are reported on STDERR and skipped.
+ #
+ sub readDeflines ($) {
+   my $file = $_[0];
+
+   # The dot is escaped so that only a real ".fasta" suffix is removed.
+   if ($file =~ m/^(.*)\.fasta/) {
+     $file = $1;
+   }
+
+   if (-e "$file.deflines") {
+     $file = "$file.deflines";
+   } elsif (-e "$file.fasta.deflines") {
+     $file = "$file.fasta.deflines";
+   } else {
+     print STDERR "Dang, gotta grep the deflines!\n";
+
+     # Keep every line containing '>' (what grep '>' selected), but do
+     # it without a shell so odd file names are safe and a failure is
+     # reported instead of leaving an empty deflines file behind.
+     open(my $in, "<", "$file.fasta") or die "Failed to open '$file.fasta'\n";
+     open(my $out, ">", "$file.deflines") or die "Failed to open '$file.deflines' for output\n";
+     while (my $l = <$in>) {
+       print $out $l if ($l =~ m/>/);
+     }
+     close($out) or die "Failed to write '$file.deflines'\n";
+     close($in);
+
+     $file = "$file.deflines";
+   }
+
+   my @nameA;
+
+   #print STDERR "$file\n";
+
+   open(Z, "<", $file) or die "Failed to open '$file'\n";
+   while (my $n = <Z>) {
+     if ($n =~ m/^\>\s*(\S+)\s*/) {
+       push @nameA, $1;
+     } else {
+       chomp $n;
+       print STDERR "Failed to match defline '$n'\n";
+     }
+   }
+   close(Z);
+
+   return(@nameA);
+ }
+
+
+ # Main program.
+ #
+ # usage: relabel.pl [-A fastaA] [-B fastaB] atacfile
+ #
+ # Copies atacfile to atacfile.uids, replacing the "label:index" sequence
+ # names in columns 4 and 8 of every match ("M") line with the name of the
+ # defline at that index.  The names come from -A / -B, and are replaced
+ # by the files named on assemblyFile1= / assemblyFile2= lines when those
+ # are seen in the input.
+ #
+ my $file = shift @ARGV;
+ my @nameA;
+ my @nameB;
+
+ if ($file eq "-A") {
+   @nameA = readDeflines(shift @ARGV);
+   $file = shift @ARGV;
+ }
+ if ($file eq "-B") {
+   @nameB = readDeflines(shift @ARGV);
+   $file = shift @ARGV;
+ }
+
+ open(F, "< $file") or die "Failed to open '$file' for input\n";
+ open(G, "> $file.uids") or die "Failed to open '$file.uids' for output\n";
+
+ while (<F>) {
+   if (m/assemblyFile1=(.*)$/) {
+     @nameA = readDeflines($1);
+     print STDERR "num nameA = ", scalar(@nameA), "\n";
+   }
+   if (m/assemblyFile2=(.*)$/) {
+     @nameB = readDeflines($1);
+     print STDERR "num nameB = ", scalar(@nameB), "\n";
+   }
+
+   if (m/^M\s/) {
+     my @v = split '\s+', $_;
+
+     # Column 4 names the sequence in assembly A.
+     if ($v[4] =~ m/^\w+:(\d+)$/) {
+       if (defined($nameA[$1])) {
+         $v[4] = $nameA[$1];
+       } else {
+         die "Didn't find nameA for $1\n";
+       }
+     } else {
+       die "Didn't match v[4] = $v[4]\n";
+     }
+
+     # Column 8 names the sequence in assembly B; a failure here is
+     # reported against nameB so the message points at the right file.
+     if ($v[8] =~ m/^\w+:(\d+)$/) {
+       if (defined($nameB[$1])) {
+         $v[8] = $nameB[$1];
+       } else {
+         die "Didn't find nameB for $1\n";
+       }
+     } else {
+       die "Didn't match v[8] = $v[8]\n";
+     }
+
+     # Special case stuff....  Some well-known defline names are
+     # shortened to a four-letter prefix plus the number.
+     #
+     if ($v[4] =~ m/^Chr(\d+)$/) {
+       $v[4] = "mchr$1";
+     } elsif ($v[4] =~ m/^Chr(\d+)_random$/) {
+       $v[4] = "mchr${1}r";
+     } elsif ($v[4] =~ m/^SCAFFOLD(\d+)$/) {
+       $v[4] = "bscf$1";
+     } elsif ($v[4] =~ m/^Contig(\d+)$/) {
+       $v[4] = "wscf$1";
+     } elsif ($v[4] =~ m/^chr(\d+)$/) {
+       $v[4] = "hchr$1";
+     }
+
+     if ($v[8] =~ m/^SCAFFOLD(\d+)$/) {
+       $v[8] = "bscf$1";
+     } elsif ($v[8] =~ m/^Contig(\d+)$/) {
+       $v[8] = "wscf$1";
+     } elsif ($v[8] =~ m/^chr(\d+)$/) {
+       $v[8] = "hchr$1";
+     }
+
+     # Match lines are rewritten with single spaces between the fields.
+     my $line = join " ", @v;
+     print G "$line\n";
+   } else {
+     # Everything that is not a match line is copied through unchanged.
+     print G $_;
+   }
+ }
+
+ close(G);
+ close(F);
diff --git a/atac-driver/rewriteUIDs.pl b/atac-driver/rewriteUIDs.pl
new file mode 100644
index 0000000..546170f
--- /dev/null
+++ b/atac-driver/rewriteUIDs.pl
@@ -0,0 +1,81 @@
+#!/usr/bin/perl
+
+# Reads an atac file with atac-format IDs, writes an atac file with
+# UIDs (the first word in the defline). This is the last step in the
+# normal atac pipeline.
+
+use strict;
+
+ # usage: rewriteUIDs.pl atacfile
+ #
+ # Pass 1 reads the four header lines naming the two assemblies, pass 2
+ # reads the deflines of both fasta files, and pass 3 prints the atac
+ # file to stdout with columns 4 and 8 of every match line replaced by
+ # the first word of the corresponding defline.
+ #
+ my $atacfile = shift @ARGV;
+
+ die "usage: $0 atacfile\n" if (!defined($atacfile));
+
+ my $seqA;
+ my $tagA;
+ my %uidA;
+
+ my $seqB;
+ my $tagB;
+ my %uidB;
+
+ my $iid;
+
+ # Scan for the assembly file names and ids.  The scan stops as soon as
+ # all four have been seen, and also at end of file, so an input without
+ # those headers reaches the error below rather than reading forever.
+ #
+ open(F, "< $atacfile") or die "Failed to open '$atacfile'\n";
+ while (<F>) {
+   $seqA = $1 if (m/^\/assemblyFile1=(.*)$/);
+   $tagA = $1 if (m/^\/assemblyId1=(.*)$/);
+   $seqB = $1 if (m/^\/assemblyFile2=(.*)$/);
+   $tagB = $1 if (m/^\/assemblyId2=(.*)$/);
+
+   last if (defined($seqA) && defined($tagA) && defined($seqB) && defined($tagB));
+ }
+ close(F);
+
+ if (!defined($seqA) || !defined($tagA) || !defined($seqB) || !defined($tagB)) {
+   die "Something fishy. Didn't find seqs or tags in '$atacfile'.\n";
+ }
+
+ # Map "tag:index" to the first word of the index'th defline, assembly A.
+ #
+ $iid = 0;
+ open(F, "< $seqA") or die "Failed to open '$seqA'\n";
+ while (<F>) {
+   if (m/^>(\S+)\s*.*$/) {
+     #chomp;
+     #print STDERR "$tagA:$iid -> $_\n";
+     $uidA{"$tagA:$iid"} = $1;
+     $iid++;
+   }
+ }
+ close(F);
+
+
+ # The same for assembly B.
+ #
+ $iid = 0;
+ open(F, "< $seqB") or die "Failed to open '$seqB'\n";
+ while (<F>) {
+   if (m/^>(\S+)\s*.*$/) {
+     #chomp;
+     #print STDERR "$tagB:$iid -> $_\n";
+     $uidB{"$tagB:$iid"} = $1;
+     $iid++;
+   }
+ }
+ close(F);
+
+
+ # From here on print separates list elements with a space and ends
+ # every record with a newline.
+ $, = " ";
+ $\ = "\n";
+
+ open(F, "< $atacfile") or die "Failed to open '$atacfile'\n";
+ while (<F>) {
+   chomp $_;
+
+   my @v = split '\s+', $_;
+
+   if (m/^M/) {
+     die "Didn't find uidA for $v[4]\n" if (!defined($uidA{$v[4]}));
+     die "Didn't find uidB for $v[8]\n" if (!defined($uidB{$v[8]}));
+
+     $v[4] = $uidA{$v[4]};
+     $v[8] = $uidB{$v[8]};
+     print @v;
+   } else {
+     print $_;
+   }
+ }
+ close(F);
diff --git a/atac-driver/run-comparison.pl b/atac-driver/run-comparison.pl
new file mode 100644
index 0000000..6d98b85
--- /dev/null
+++ b/atac-driver/run-comparison.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/perl
+
+# Runs the full assembly-to-assembly mapping comparison.
+#
+# Takes two ATAC-format mapping files and:
+# shifts 1bp gaps to the 3' end
+# compute the assembly annotation
+#
+# Generate statistics:
+# sum and histogram of the different annotations
+# sum and histogram of the disagreement
+#
+# number of matches in each
+# number of scaffolds mapped to multiple chromosomes
+#
+# Nx (0 <= x <= 100)
+#
+# histogram of match length
+# histogram of run length
+# histogram of clump length
+#
+
+ # Locations of the tools this driver runs.
+ my $bin = "/bioinfo/assembly/walenz/hummap2/src/genomics/atac-driver";
+ my $overlap = "$bin/alignOverlap/overlap";
+ my $gapShifter = "$bin/gapShifter/gapShifter";
+ my $mismatchCounter = "$bin/mismatchCounter/mismatchCounter";
+
+ if (scalar(@ARGV) != 2) {
+   print STDERR "usage: $0 <amap-mapping> <atac-mapping>\n";
+   exit(1);
+ }
+
+ my $amap = shift @ARGV;
+ my $atac = shift @ARGV;
+ my $dir = "COMPARE";
+ my $tmp = "COMPARE"; # /tmp on assembly-a is pathetic
+
+ # If we're given relative paths, make them absolute
+ #
+ my $pwd = `pwd`; chomp $pwd;
+
+ $amap = "$pwd/$amap" if ($amap !~ m!^/!);
+ $atac = "$pwd/$atac" if ($atac !~ m!^/!);
+
+
+ # We run stuff in the $dir directory, set that up
+ #
+ system("mkdir $dir") if (! -d "$dir");
+ system("ln -s $amap $dir/amap.atac") if (! -e "$dir/amap.atac");
+ system("ln -s $atac $dir/atac.atac") if (! -e "$dir/atac.atac");
+
+
+ # Each step below is skipped when its output file already exists.
+ #
+ if (! -e "$dir/amap.shift.atac") {
+   print STDERR "GAP SHIFTER NEEDS assembly-a, so that the whole genome can be loaded!\n";
+   system("$gapShifter -g 1 < $dir/amap.atac > $dir/amap.shift.atac") and die "Failed to shift amap.\n";
+ }
+
+ if (! -e "$dir/atac.shift.atac") {
+   print STDERR "GAP SHIFTER NEEDS assembly-a, so that the whole genome can be loaded!\n";
+   system("$gapShifter -g 1 < $dir/atac.atac > $dir/atac.shift.atac") and die "Failed to shift atac.\n";
+ }
+
+
+ # Run overlap
+ #
+ if (! -e "$dir/overlap.map1annotation") {
+   system("$overlap $dir/amap.shift.atac $dir/atac.shift.atac $dir/overlap") and die "Failed to overlap.\n";
+ }
+
+
+ # Reads the annotation output of overlap.C and adds percent identity
+ # for each match.
+ #
+ # This is done by first converting the overlap.C output into atac
+ # format (one file for each mapping), running mismatchCounter on each
+ # of those files, then merging the results together.
+ #
+ # The guard has to test the file inside $dir, which is where the result
+ # is written; testing the bare name would redo the merge on every run.
+ #
+ if (! -e "$dir/overlap.map1annotation.identity") {
+   if ((-e "$tmp/c.otoi") && (-e "$tmp/d.otoi")) {
+     print STDERR "Using $tmp/c.otoi and $tmp/d.otoi\n";
+   } else {
+     overlapToAtac("$dir/overlap.map1annotation", "$tmp/a.otoi", "$tmp/b.otoi");
+
+     print STDERR "Counting mismatches.\n";
+
+     system("$mismatchCounter < $tmp/a.otoi > $tmp/c.otoi") and die "Failed mismatchCounter on A.\n";
+     system("$mismatchCounter < $tmp/b.otoi > $tmp/d.otoi") and die "Failed mismatchCounter on B.\n";
+   }
+
+   open(A, "< $tmp/c.otoi") or die "Failed to open $tmp/c.otoi\n";
+   open(B, "< $tmp/d.otoi") or die "Failed to open $tmp/d.otoi\n";
+   open(M, "< $dir/overlap.map1annotation") or die "Failed to open $dir/overlap.map1annotation\n";
+   open(O, "> $dir/overlap.map1annotation.identity") or die "Failed to open $dir/overlap.map1annotation.identity\n";
+
+   my $a;
+   my $b;
+   my $m;
+
+   print STDERR "Merging results.\n";
+
+   # The three inputs are read in lock step: one annotation line, and
+   # the next non-header line of each mismatchCounter output.
+   #
+   while (!eof(A) && !eof(B) && !eof(M)) {
+     $m = <M>; chomp $m;
+
+     # Skip any ATAC headers in A and B
+     do { $a = <A>; chomp $a; } while ($a =~ m/^\//);
+     do { $b = <B>; chomp $b; } while ($b =~ m/^\//);
+
+     # Identity defaults to 0.0 for lines that carry no annotation.
+     my ($aid, $abeg, $alen, $aori, $amis, $aident) = (undef, undef, undef, undef, undef, "0.0");
+     my ($bid, $bbeg, $blen, $bori, $bmis, $bident) = (undef, undef, undef, undef, undef, "0.0");
+
+     if ($a =~ m/HUREF:(\d+)\s+(\d+)\s+(\d+)\s+(-*\d+)\s+>\s+\/mismatches=(\d+)\s+\/identity=(\d+\.\d+)/) {
+       $aid = $1;
+       $abeg = $2;
+       $alen = $3;
+       $aori = $4;
+       $amis = $5;
+       $aident = $6;
+     } elsif ($a !~ m/^M\sm\s/) {
+       print "Anope $a\n";
+     }
+
+     if ($b =~ m/HUREF:(\d+)\s+(\d+)\s+(\d+)\s+(-*\d+)\s+>\s+\/mismatches=(\d+)\s+\/identity=(\d+\.\d+)/) {
+       $bid = $1;
+       $bbeg = $2;
+       $blen = $3;
+       $bori = $4;
+       $bmis = $5;
+       $bident = $6;
+     } elsif ($b !~ m/^M\sm\s/) {
+       print "Bnope $b\n";
+     }
+
+     # Keep only the last seven characters of the padded identity.
+     $aident = substr(" $aident", -7);
+     $bident = substr(" $bident", -7);
+
+     if ($m =~ m/^(.*\]\s+\d+\s+\(.*\))\s+(\d+\s+\(.*\))/) {
+       print O "$1 $aident $2 $bident\n";
+     } else {
+       print "Mnope $m\n";
+       exit(1);
+     }
+   }
+
+   close(O) or die "Failed to write $dir/overlap.map1annotation.identity\n";
+   close(M);
+   close(B);
+   close(A);
+ }
+
+
+
+
+# Reads the annotation output of overlap.C and writes two atac format
+# files, one for map1, one for map2. There is a 1-1 map between
+# lines, unmapped B35 (either because B35 was unmapped by both
+# mappings, or unmapped by the other mapping) are noted in 'm'
+# matches ("M m UID . B35LC:xxxxx beg len 1")
+#
+ # overlapToAtac(infile, outfile1, outfile2)
+ #
+ # Converts every annotation line of infile that has the expected shape
+ # into one atac match line in each output file; other lines are ignored.
+ # A side whose larger coordinate is zero is written as an "M m" line
+ # without the HUREF columns.  Dies if any of the three files cannot be
+ # opened.
+ #
+ sub overlapToAtac {
+   my ($infile, $outfile1, $outfile2) = @_;
+
+   open(I, "< $infile") or die "Can't open '$infile' for reading.\n";
+   open(O1, "> $outfile1") or die "Can't open '$outfile1' for writing.\n";
+   open(O2, "> $outfile2") or die "Can't open '$outfile2' for writing.\n";
+
+   print STDERR "Converting $infile -> $outfile1 and $outfile2\n";
+
+   # Both outputs start with the same four header lines.
+   foreach my $out (\*O1, \*O2) {
+     print {$out} "/assemblyFile1=MERYL/B35LC.fasta\n";
+     print {$out} "/assemblyFile2=MERYL/HUREF2.fasta\n";
+     print {$out} "/assemblyId1=B35LC\n";
+     print {$out} "/assemblyId2=HUREF2\n";
+   }
+
+   my $id = 0;
+
+   while (<I>) {
+     my @f = m/^.\s+(\d+):(\d+)-(\d+)\[\s*\d+\]\s(\d+)\s\(\s*(\d+):\s*(\d+)-\s*(\d+)\)\s(\d+)\s\(\s*(\d+):\s*(\d+)-\s*(\d+)\)\s*$/;
+     next if (scalar(@f) == 0);
+
+     # Fields 3 and 7 (the two counts before the parentheses) are not
+     # used in the output.
+     my ($id1, $b1, $e1, undef, $id2a, $b2a, $e2a, undef, $id2b, $b2b, $e2b) = @f;
+
+     # The length is taken before the leading zeros are removed.
+     my $l1 = $e1 - $b1;
+
+     $b1 =~ s/^0+//;
+     $b1 = 0 if ($b1 == 0); # fix for blowing away all the zeros
+
+     foreach my $side ([\*O1, $id2a, $b2a, $e2a], [\*O2, $id2b, $b2b, $e2b]) {
+       my ($out, $id2, $lo, $hi) = @$side;
+       my $ori = 1;
+
+       # Reversed coordinates mean a reverse-oriented match.
+       if ($hi < $lo) {
+         ($lo, $hi) = ($hi, $lo);
+         $ori = -1;
+       }
+       my $len = $hi - $lo;
+
+       if ($hi > 0) {
+         print {$out} "M u $id . B35LC:$id1 $b1 $l1 1 HUREF:$id2 $lo $len $ori\n";
+       } else {
+         print {$out} "M m $id . B35LC:$id1 $b1 $l1 1\n";
+       }
+     }
+
+     $id++;
+   }
+
+   close(O1);
+   close(O2);
+   close(I);
+ }
diff --git a/atac-driver/run-length-histogram.pl b/atac-driver/run-length-histogram.pl
new file mode 100644
index 0000000..a321ccb
--- /dev/null
+++ b/atac-driver/run-length-histogram.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+
+# Reads a list of numbers on stdin, computes a (blocked) histogram.
+#
+# If there are two numbers per line, they are assumed to be
+# a begin-end pair.
+
+# grep "M u " ATAC/atac.shift.atac | cut -d' ' -f 7 | perl run-length-histogram.pl > atac.histogram
+# grep "M u " ATAC/box2.shift.atac | cut -d' ' -f 7 | perl run-length-histogram.pl > box2.histogram
+
+ # Bucket counts, indexed by value / $blocksize.
+ my @histogram;
+ my $blocksize = 1000;
+
+ while (<STDIN>) {
+   s/^\s+//;
+   s/\s$//;
+   s/\s+/ /;
+
+   my @vals = split '\s+', $_;
+   my $val;
+
+   if (scalar(@vals) == 1) {
+     # A single number is the length itself.
+     $val = $vals[0];
+   } elsif (scalar(@vals) == 2) {
+     # Two numbers are a begin-end pair, in either order.  This branch
+     # has to test for 2; testing for 1 again makes it unreachable.
+     $val = $vals[1] - $vals[0];
+     $val = $vals[0] - $vals[1] if ($val < 0);
+   } else {
+     # Anything else is counted in the first bucket.
+     $val = 0;
+   }
+
+   # Array subscripts truncate toward zero; int() makes that explicit.
+   $val = int($val / $blocksize);
+   $histogram[$val]++;
+ }
+
+ # One bucket more than was used is printed, so the output always ends
+ # with an empty bucket.  Unused buckets are printed as 0.
+ my $max = scalar(@histogram) + 1;
+ for (my $i=0; $i<$max; $i++) {
+   $histogram[$i] = 0 if ($histogram[$i] == 0);
+   print "$i $histogram[$i]\n";
+ }
+
diff --git a/atac-driver/run-length-n50.pl b/atac-driver/run-length-n50.pl
new file mode 100644
index 0000000..dde089a
--- /dev/null
+++ b/atac-driver/run-length-n50.pl
@@ -0,0 +1,52 @@
+#!/usr/bin/perl
+
+# Reads a list of numbers on stdin, computes the n50.
+#
+# If there are two numbers per line, they are assumed to be
+# a begin-end pair.
+
+# grep "M u " ATAC/atac.shift.atac | cut -d' ' -f 7 | perl n50.pl 3076782067
+# grep "M u " ATAC/box2.shift.atac | cut -d' ' -f 7 | perl n50.pl 3076782067
+
+ # All lengths read from stdin.
+ my @values;
+
+ while (<STDIN>) {
+   s/^\s+//;
+   s/\s$//;
+   s/\s+/ /;
+
+   my @vals = split '\s+', $_;
+   my $val;
+
+   if (scalar(@vals) == 1) {
+     # A single number is the length itself.
+     $val = $vals[0];
+   } elsif (scalar(@vals) == 2) {
+     # Two numbers are a begin-end pair, in either order.  This branch
+     # has to test for 2; testing for 1 again makes it unreachable.
+     $val = $vals[1] - $vals[0];
+     $val = $vals[0] - $vals[1] if ($val < 0);
+   } else {
+   }
+
+   push @values, $val;
+ }
+
+ # The total length is the optional first argument, or else the sum of
+ # the lengths that were read.
+ my $totalLength = 0;
+
+ if (scalar(@ARGV) > 0) {
+   $totalLength = int($ARGV[0]);
+ } else {
+   foreach my $v (@values) {
+     $totalLength += $v;
+   }
+ }
+
+ @values = sort { $a <=> $b } @values;
+
+ # For each percentage, add up the lengths in ascending order until the
+ # sum reaches that share of the total, and report the last length added.
+ for (my $nvalue = 1; $nvalue <= 100; $nvalue += 1) {
+   my $limit = $nvalue * $totalLength / 100;
+   my $iter = 0;
+   my $sum = 0;
+
+   while (($sum < $limit) && ($iter < scalar(@values))) {
+     $sum += $values[$iter++];
+   }
+
+   print STDOUT "$nvalue $limit : $values[$iter-1]\n";
+ }
diff --git a/atac-driver/runatac.pl b/atac-driver/runatac.pl
new file mode 100644
index 0000000..e619ca8
--- /dev/null
+++ b/atac-driver/runatac.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+
+ # usage: runatac.pl [-genomedir path] -1 id1 -2 id2
+ #
+ # Runs briatac.pl on the two assemblies, then atacdriver.sh and
+ # clumpMaker on the result.  Both command lines are built up front and
+ # handed to the shell; a non-zero exit from either one is fatal.
+ #
+ my $src = "/bioinfo/assembly/walenz/src/genomics";
+ my $bindir = "$src/linux64/bin";
+
+ my ($genomeDir, $id1, $id2) = ("/bioinfo/assembly/walenz/GENOMES", undef, undef);
+
+ # Each option is recognized by its prefix and takes the next word as
+ # its value; the first pattern that matches wins, unknown words are
+ # dropped.
+ my @options = ([qr/^-g/, \$genomeDir],
+                [qr/^-1/, \$id1],
+                [qr/^-2/, \$id2]);
+
+ while (scalar(@ARGV) > 0) {
+   my $word = shift @ARGV;
+
+   foreach my $opt (@options) {
+     my ($pattern, $target) = @$opt;
+     if ($word =~ $pattern) {
+       $$target = shift @ARGV;
+       last;
+     }
+   }
+ }
+
+ if (!defined($id1) || !defined($id2)) {
+   die "usage: $0 [-genomedir path] -1 id1 -2 id2\n";
+ }
+
+ die "No bin dir?\n" if (! -e $bindir);
+ die "No bin?\n" if (! -x "$bindir/snapper2");
+
+ # A relative genome directory is taken relative to $PWD.
+ if ($genomeDir !~ m!^/!) {
+   $genomeDir = "$ENV{'PWD'}/$genomeDir";
+ }
+
+ my $name = "${id1}vs${id2}";
+
+ my $cmd = join("",
+                "perl $src/atac-driver/briatac.pl ",
+                " -dir $name ",
+                " -id1 $id1 -id2 $id2 ",
+                " -genomedir $genomeDir ",
+                " -meryldir $genomeDir ",
+                " -bindir $bindir ",
+                " -merylthreads 4 ",
+                " -numsegments 2 ",
+                " -numthreads 4 ",
+                " -samespecies");
+ print "$cmd\n";
+ system($cmd) and die "Failed to briatac.pl!\n";
+
+
+ # Second stage, run inside the directory the first stage created.
+ $cmd = join("",
+             "cd $name && ln -s $name.k20.u1.f20.g0.matches.sorted.extended $name.atac && ",
+             "time sh $src/atac/atacdriver.sh $name.atac && ",
+             "grep ^M $name.atac.ckpLast | cut -d' ' -f 1-12 | sort -k5,5 -k6n > $name.atac.ckpLast.sorted && ",
+             "$src/atac-driver/clumpMaker/clumpMaker -c 5000 -2 -S -f $name.atac.ckpLast.sorted > $name.atac.ckpLast.clumps");
+ system($cmd) and die "Failed to atacdriver.sh!\n";
+
+
+
+
diff --git a/atac-driver/statsGenerator/Make.include b/atac-driver/statsGenerator/Make.include
new file mode 100644
index 0000000..3abe5f2
--- /dev/null
+++ b/atac-driver/statsGenerator/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+
+ # Directories of the libraries this tool uses, resolved to absolute paths.
+ # NOTE(review): "$/" looks like the path prefix of the directory being
+ # processed, supplied by the including Makefile -- confirm in Make.rules.
+ LIBUTL/ :=$(realpath $/../../libutil/)/
+ LIBBIO/ :=$(realpath $/../../libbio/)/
+ LIBSEQ/ :=$(realpath $/../../libseq/)/
+ LIBATAC/ :=$(realpath $/../libatac/)/
+
+ # The executable and source file this directory contributes.
+ $/.CXX_EXES := $/statsGenerator
+ $/.CXX_SRCS := $/statsGenerator.C
+
+ # File patterns listed for cleanup (presumably consumed by a clean target).
+ $/.CLEAN :=$/*.o $/*~ $/core
+
+ # The executable depends on its object file and the four static libraries.
+ $/statsGenerator: $/statsGenerator.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+ # Add the library directories to the include path when building this
+ # directory's .d and .o files.
+ $(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/statsGenerator/stats-to-xls.pl b/atac-driver/statsGenerator/stats-to-xls.pl
new file mode 100644
index 0000000..9631488
--- /dev/null
+++ b/atac-driver/statsGenerator/stats-to-xls.pl
@@ -0,0 +1,551 @@
+#!/usr/bin/perl
+
+ # Converts the output of statsGenerator into a nice Excel spreadsheet.
+
+use strict;
+use lib "/home/bwalenz/linux/lib/perl5/site_perl/5.8.0";
+use Spreadsheet::WriteExcel;
+use Spreadsheet::WriteExcel::Big;
+
+ if (scalar(@ARGV) != 1) {
+   die "usage: $0 stats-prefix\n";
+ }
+
+ my $prefix = shift @ARGV;
+
+ ################################################################################
+ #
+ # First, suck in the big ugly stdout from statsGenerator.
+ #
+ if (! -e "$prefix.out") {
+   die "I looked for the stdout from statsGenerator in '$prefix.out', but didn't find it.\n";
+ }
+
+ my $workbook = Spreadsheet::WriteExcel::Big->new("$prefix.xls");
+ my $summary = $workbook->add_worksheet("Summary");
+
+ # Cell formats.  Each format has to be configured through its own
+ # object; settings applied to another format's object leave the new
+ # format at the workbook defaults.
+ #
+ # Plain numbers.
+ my $format = $workbook->add_format();
+ $format->set_size(10);
+ $format->set_color('black');
+ $format->set_num_format(1);
+
+ # Floating point values: same look, no number format.
+ my $formatFP = $workbook->add_format();
+ $formatFP->set_size(10);
+ $formatFP->set_color('black');
+
+ # Section and column headings.
+ my $format_heading = $workbook->add_format();
+ $format_heading->set_size(10);
+ $format_heading->set_bold();
+ $format_heading->set_color('black');
+ $format_heading->set_num_format(1);
+
+ # Row labels.
+ my $format_label = $workbook->add_format();
+ $format_label->set_size(10);
+ $format_label->set_bold();
+ $format_label->set_color('black');
+ $format_label->set_num_format(1);
+
+ # Explanatory comments next to a value.
+ my $format_comment = $workbook->add_format();
+ $format_comment->set_size(10);
+ $format_comment->set_bold();
+ $format_comment->set_color('black');
+ $format_comment->set_num_format(1);
+
+ $summary->set_column(0, 2, 20);
+ $summary->set_column(3, 3, 30);
+
+my %stats; # scratch space
+
+ # Parse "$prefix.out" and fill in the Summary sheet (and one chromosome
+ # coverage sheet per MATCHES / RUNS section).  Each section is recognized
+ # by its title line; the lines that follow it are then read in a fixed
+ # order, and any line that does not have the expected form is fatal.
+ # The histogram() sub reads from the same handle F.
+ #
+ open(F, "< $prefix.out");
+ while (!eof(F)) {
+ $_ = <F>;
+
+ if (m/^\s*$/) {
+ # Nop;
+ } if (m/^SEQUENCE$/) {
+ # Summary rows 1-3: the lengths of the two input sequences.
+ $summary->write(1, 0, "Input Sequences", $format_heading);
+ $summary->write(2, 0, "totalLength", $format_label);
+ $summary->write(2, 3, "all letters, including N", $format_comment);
+ $summary->write(3, 0, "totalLength", $format_label);
+ $summary->write(3, 3, "ACGT only", $format_comment);
+
+ $_ = <F>;
+ if (m/totalLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+#\s+all\s+letters/) {
+ $summary->write(0, 1, $1, $format_heading);
+ $summary->write(0, 2, $3, $format_heading);
+
+ # remember which column is for which assembly
+ $stats{$1} = 1;
+ $stats{$3} = 2;
+
+ # and which assembly is in which column
+ $stats{1} = $1;
+ $stats{2} = $3;
+
+ $summary->write(2, 1, "$2", $format);
+ $summary->write(2, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/totalLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+#\s+ACGT/) {
+ $summary->write(3, 1, "$2", $format);
+ $summary->write(3, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+ } if (m/^TANDEM REPEATS in (.*)$/) {
+ # Summary rows 5-8.  The column is looked up from the assembly name
+ # recorded while parsing the SEQUENCE section.
+ my $asm = $1;
+
+ $summary->write(5, 0, "Tandem Repeats", $format_heading);
+ $summary->write(6, 0, "number", $format_label);
+ $summary->write(7, 0, "totalLength", $format_label);
+ $summary->write(8, 0, "coveredLength", $format_label);
+
+ $_ = <F>;
+ if (m/numberOfItems\s+(\d+)/) {
+ $summary->write(6, $stats{$asm}, "$1", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/totalLength\s+(\d+)\s+#\s+sum\s+of\s+lengths/) {
+ $summary->write(7, $stats{$asm}, "$1", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/coveredLength\s+(\d+)\s+#\s+sequence\s+covered/) {
+ $summary->write(8, $stats{$asm}, "$1", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+ } if (m/^MATCHES IN RUNS$/) {
+ # Summary rows 10-12.
+ $summary->write(10, 0, "Matches in Runs", $format_heading);
+ $summary->write(11, 0, "runMissingFull", $format_label);
+ $summary->write(11, 3, "covered by a run, not by a match, including N", $format_comment);
+ $summary->write(12, 0, "runMissingACGT", $format_label);
+ $summary->write(12, 3, "covered by a run, not by a match, ACGT only", $format_comment);
+
+ $_ = <F>;
+ if (m/runMissingFull\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+#\s+sequence\s+in\s+run,\s+not\s+covered,\s+including\s+N/) {
+ $summary->write(11, 1, "$2", $format);
+ $summary->write(11, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ # NOTE(review): this line is matched as "runMissingFull" although it
+ # fills the runMissingACGT row -- confirm against statsGenerator's output.
+ $_ = <F>;
+ if (m/runMissingFull\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+#\s+sequence\s+in\s+run,\s+not\s+covered,\s+ACGT\s+only/) {
+ $summary->write(12, 1, "$2", $format);
+ $summary->write(12, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+ } if ((m/^MATCHES$/) || (m/^RUNS$/)) {
+ # The two sections have the same layout; they differ only in the first
+ # Summary row used and in the name of the coverage sheet.
+ my $begin;
+ my $chrcov;
+
+ if (m/^MATCHES$/) {
+ $begin = 14;
+ $summary->write($begin, 0, "Matches", $format_heading);
+ $chrcov = $workbook->add_worksheet("Chr Cov Match");
+ } else {
+ $begin = 26;
+ $summary->write($begin, 0, "Runs", $format_heading);
+ $chrcov = $workbook->add_worksheet("Chr Cov Run");
+ }
+
+ $chrcov->set_column(0, 0, 10);
+ $chrcov->set_column(1, 6, 20);
+
+ $summary->write($begin+1, 0, "number", $format_label);
+ $summary->write($begin+2, 0, "totalLength", $format_label);
+ $summary->write($begin+3, 1, "histogram", $format_label);
+ $summary->write($begin+4, 2, "histogram", $format_label);
+
+ $summary->write($begin+5, 0, "coveredLengthFull", $format_label);
+ $summary->write($begin+6, 0, "coveredLengthACGT", $format_label);
+ $summary->write($begin+7, 0, "coveredLengthNonACGT", $format_label);
+ $summary->write($begin+8, 1, "histogram", $format_label);
+ $summary->write($begin+9, 2, "histogram", $format_label);
+
+ $_ = <F>;
+ if (m/numberOfItems\s+(\d+)/) {
+ $summary->write($begin+1, 1, "$1", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/matchLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s#\sSum\s+of\s+lengths/) {
+ $summary->write($begin+2, 1, "$2", $format);
+ $summary->write($begin+2, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ # Each call reads one histogram summary line from F.
+ histogram($begin+3);
+ histogram($begin+4);
+
+ # Three coveredLength lines follow, matched by the same pattern; they go
+ # to the Full, ACGT and NonACGT rows in the order they are read.
+ $_ = <F>;
+ if (m/coveredLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s/) {
+ $summary->write($begin+5, 1, "$2", $format);
+ $summary->write($begin+5, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/coveredLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s/) {
+ $summary->write($begin+6, 1, "$2", $format);
+ $summary->write($begin+6, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ $_ = <F>;
+ if (m/coveredLength\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s/) {
+ $summary->write($begin+7, 1, "$2", $format);
+ $summary->write($begin+7, 2, "$4", $format);
+ } else {
+ die "Parse error $_\n";
+ }
+
+ histogram($begin+8);
+ histogram($begin+9);
+
+ # chromosome covered
+
+ $chrcov->write(0, 1, "all sequence covered", $format_heading);
+ $chrcov->write(0, 2, "all sequence length", $format_heading);
+ $chrcov->write(0, 3, "percent covered", $format_heading);
+
+ $chrcov->write(0, 4, "ACGT sequence covered", $format_heading);
+ $chrcov->write(0, 5, "ACGT sequence length", $format_heading);
+ $chrcov->write(0, 6, "percent covered", $format_heading);
+
+ for (my $i=1; $i<23; $i++) {
+ $chrcov->write($i, 0, "Chr$i", $format_heading);
+ }
+ $chrcov->write(23, 0, "ChrX", $format_heading);
+ $chrcov->write(24, 0, "ChrY", $format_heading);
+ $chrcov->write(25, 0, "ChrMT", $format_heading);
+ $chrcov->write(26, 0, "ChrUn", $format_heading);
+
+ # One sheet row per chrCoveredLength line.  $i is the zero-based sheet
+ # row, $j the same row as numbered in the cell formulas.  The percentages
+ # in the input are not copied; they are recomputed by formula.  The first
+ # line that does not match ends the loop and is not looked at again.
+ $_ = <F>;
+ while (m/chrCoveredLength\[\s*(\d+)\]\s+\S+\s+(\d+)\s+(\d+)\s+(\d+.\d+)%\s+(\d+)\s+(\d+)\s+(\d+.\d+)%\s+/) {
+ my $i = $1 + 1;
+ my $j = $1 + 2;
+
+ $chrcov->write($i, 1, "$2", $format);
+ $chrcov->write($i, 2, "$3", $format);
+ $chrcov->write($i, 3, "=B$j / C$j * 100", $formatFP);
+ $chrcov->write($i, 4, "$5", $format);
+ $chrcov->write($i, 5, "$6", $format);
+ $chrcov->write($i, 6, "=E$j / F$j * 100", $formatFP);
+
+ $_ = <F>;
+ }
+ }
+ }
+ close(F);
+
+
+
+ # histogram(row)
+ #
+ # Reads the next line from F into $_, which must be a histogram summary
+ # line, and writes its item count, average and standard deviation into
+ # columns 4-6 of the given Summary row, with a comment in column 3.
+ # Dies on any other line.
+ #
+ sub histogram ($) {
+   my ($row) = @_;
+
+   $_ = <F>;
+   unless (m/histogram\s+\S+\s+(\d+)\s+items\s+(\d+.\d+)\s+average\s+(\d+.\d+)\s+std.dev./) {
+     die "Parse error $_\n";
+   }
+
+   my ($items, $average, $stddev) = ($1, $2, $3);
+
+   $summary->write($row, 3, "items, average, std.dev.", $format_comment);
+   $summary->write($row, 4, "$items", $format);
+   $summary->write($row, 5, "$average", $formatFP);
+   $summary->write($row, 6, "$stddev", $formatFP);
+ }
+
+
+################################################################################
+#
+# Nx
+#
+ # The Nx sheet: one row per line of the two .Nx files, which are read
+ # in lock step and must list the same index on every line.  The index
+ # is used as the row number.
+ #
+ my $nx = $workbook->add_worksheet("Nx");
+
+ $nx->write(0, 1, "matches", $format_heading);
+ $nx->write(0, 2, "runs", $format_heading);
+
+ # A missing input is fatal; otherwise the sheet would silently come out
+ # empty.
+ open(A, "< $prefix-matches.Nx") or die "Failed to open '$prefix-matches.Nx'\n";
+ open(B, "< $prefix-runs.Nx") or die "Failed to open '$prefix-runs.Nx'\n";
+ while (!eof(A)) {
+   my $matchLine = <A>;
+   my $runLine = <B>;
+
+   die "Nx error: '$prefix-runs.Nx' has fewer lines than '$prefix-matches.Nx'\n" if (!defined($runLine));
+
+   my ($ai, $an) = split '\s+', $matchLine;
+   my ($bi, $bn) = split '\s+', $runLine;
+   die "Nx error: ai=$ai != bi=$bi\n" if ($ai != $bi);
+   $nx->write($ai, 0, $ai);
+   $nx->write($ai, 1, $an);
+   $nx->write($ai, 2, $bn);
+ }
+ close(B);
+ close(A);
+
+
+################################################################################
+#
+# Histograms of lengths
+#
+#
+ # Three histogram sheets.  On each, row 0 holds the column headings and
+ # dumpHistogram() fills columns 1-4 from the four files, in the order
+ # the files are listed -- so the file order has to follow the headings.
+ # All file names are derived from $prefix.
+ #
+ my $sheet = $workbook->add_worksheet("Matches Histogram");
+ $sheet->set_column(0, 4, 25);
+ $sheet->write(0, 1, "$stats{1} length", $format_heading);
+ $sheet->write(0, 2, "$stats{2} length", $format_heading);
+ $sheet->write(0, 3, "$stats{1} covered N", $format_heading);
+ $sheet->write(0, 4, "$stats{2} covered N", $format_heading);
+
+ dumpHistogram($sheet,
+               "$prefix-matches.AmatchLength.histogramdat",
+               "$prefix-matches.BmatchLength.histogramdat",
+               "$prefix-matches.AcoveredN.histogramdat",
+               "$prefix-matches.BcoveredN.histogramdat");
+
+
+ $sheet = $workbook->add_worksheet("Runs Histogram");
+ $sheet->set_column(0, 4, 25);
+ $sheet->write(0, 1, "$stats{1} length", $format_heading);
+ $sheet->write(0, 2, "$stats{2} length", $format_heading);
+ $sheet->write(0, 3, "$stats{1} covered N", $format_heading);
+ $sheet->write(0, 4, "$stats{2} covered N", $format_heading);
+
+ dumpHistogram($sheet,
+               "$prefix-runs.AmatchLength.histogramdat",
+               "$prefix-runs.BmatchLength.histogramdat",
+               "$prefix-runs.AcoveredN.histogramdat",
+               "$prefix-runs.BcoveredN.histogramdat");
+
+ # This sheet groups the columns by assembly: full and ACGT for the
+ # first, then full and ACGT for the second.
+ $sheet = $workbook->add_worksheet("Run Missing Histogram");
+ $sheet->set_column(0, 4, 25);
+ $sheet->write(0, 1, "$stats{1} full missing", $format_heading);
+ $sheet->write(0, 2, "$stats{1} ACGT missing", $format_heading);
+ $sheet->write(0, 3, "$stats{2} full missing", $format_heading);
+ $sheet->write(0, 4, "$stats{2} ACGT missing", $format_heading);
+
+ dumpHistogram($sheet,
+               "$prefix.ARunMissingFull.histogramdat",
+               "$prefix.ARunMissingACGT.histogramdat",
+               "$prefix.BRunMissingFull.histogramdat",
+               "$prefix.BRunMissingACGT.histogramdat");
+
+
+
+# dumpHistogram($sheet, @files)
+#
+# Writes a set of histogram files into $sheet.  Each line of a file is a
+# "range value" pair.  Column 0 receives the ranges of the first file, and
+# column N (N >= 1) receives the values of the N-th file.  Row 0 is left for
+# headings written by the caller.
+#
+# Only rows up to the last non-zero entry found in any file are written,
+# followed by one extra row holding the last line of each file.
+#
+# Dies if a file cannot be opened, or if a file's ranges differ from those of
+# the first file.
+#
+sub dumpHistogram {
+    my $sheet = shift @_;
+    my @files = @_;
+    my $col   = 1;
+    my @range;
+
+    return if (scalar(@files) == 0);
+
+    # I can't seem to find any way of deleting a cell once it's
+    # written (as opposed to simply clearing the cell).  We want
+    # to know what the maximum value in any histogram is, so
+    # we can stop writing after that point.
+    #
+    my $maxIdx = 0;
+
+    foreach my $f (@files) {
+        open(my $fh, "<", $f) or die "Failed to open dumpHistogram1 '$f': $!\n";
+        my @lines = <$fh>;
+        close($fh);
+
+        # Don't use the last line in the file - this is the number
+        # of things bigger than the max, we always report this.
+        pop @lines;
+
+        my $idx = 0;
+        foreach my $l (@lines) {
+            my ($r, $v) = split '\s+', $l;
+            $maxIdx = $idx if ($v > 0) && ($maxIdx < $idx);
+            $idx++;
+        }
+    }
+
+    # Read the range from the first file -- we'll check that all the other files
+    # use the same range.
+    #
+    my $idx = 0;
+    open(my $rangeFile, "<", $files[0]) or die "Failed to open dumpHistogram2 '$files[0]': $!\n";
+    while (<$rangeFile>) {
+        my ($r, $v) = split '\s+', $_;
+        $range[$idx] = $r;
+        $sheet->write($idx+1, 0, "$r", $format);
+        $idx++;
+        last if ($idx > $maxIdx);
+    }
+
+    # Skip to the final line; its range label goes in the row after the data.
+    my $lastRange;
+    while (<$rangeFile>) {
+        my ($r, $v) = split '\s+', $_;
+        $lastRange = $r;
+    }
+    $sheet->write($idx+1, 0, "$lastRange", $format) if (defined($lastRange));
+    close($rangeFile);
+
+    foreach my $f (@files) {
+        $idx = 0;
+
+        open(my $fh, "<", $f) or die "Failed to open dumpHistogram3 '$f': $!\n";
+        while (<$fh>) {
+            my ($r, $v) = split '\s+', $_;
+            die "range error in file '$f' at idx $idx; $range[$idx] != $r\n" if ($range[$idx] != $r);
+            $sheet->write($idx+1, $col, "$v", $format);
+            $idx++;
+            last if ($idx > $maxIdx);
+        }
+
+        # The value on the final line is always reported, in the extra row.
+        my $lastVal;
+        while (<$fh>) {
+            my ($r, $v) = split '\s+', $_;
+            $lastVal = $v;
+        }
+        $sheet->write($idx+1, $col, "$lastVal", $format) if (defined($lastVal));
+        close($fh);
+
+        $col++;
+    }
+}
+
+
+
+################################################################################
+#
+# By chromosome histograms of lengths
+#
+
+# Three sheets, each with one column per chromosome file chr00 .. chr25.
+# File chrNN goes to column NN+1, so chr00 sits under the "Chr1" heading.
+#
+# NOTE(review): MappedByChromosome() in statsGenerator.C only processes the
+# first 24 sequences, so the chr24 and chr25 files may not exist -- confirm
+# before relying on the last two columns.
+#
+$sheet = $workbook->add_worksheet("Matches Chr Histogram");
+$sheet->set_column(0, 32, 12);
+
+for (my $i=1; $i<23; $i++) {
+    $sheet->write(0, $i, "Chr$i ACGT", $format_heading);
+}
+$sheet->write(0, 23, "ChrX ACGT", $format_heading);
+$sheet->write(0, 24, "ChrY ACGT", $format_heading);
+$sheet->write(0, 25, "ChrMT ACGT", $format_heading);
+$sheet->write(0, 26, "ChrUn ACGT", $format_heading);
+
+dumpHistogram($sheet,
+              map { sprintf("%s-matches.chr%02dacgt.histogramdat", $prefix, $_) } (0 .. 25));
+
+
+$sheet = $workbook->add_worksheet("Runs Chr ACGT Histogram");
+$sheet->set_column(0, 32, 12);
+
+for (my $i=1; $i<23; $i++) {
+    $sheet->write(0, $i, "Chr$i ACGT", $format_heading);
+}
+$sheet->write(0, 23, "ChrX ACGT", $format_heading);
+$sheet->write(0, 24, "ChrY ACGT", $format_heading);
+$sheet->write(0, 25, "ChrMT ACGT", $format_heading);
+$sheet->write(0, 26, "ChrUn ACGT", $format_heading);
+
+dumpHistogram($sheet,
+              map { sprintf("%s-runs.chr%02dacgt.histogramdat", $prefix, $_) } (0 .. 25));
+
+
+$sheet = $workbook->add_worksheet("Runs Chr Full Histogram");
+$sheet->set_column(0, 32, 12);
+
+for (my $i=1; $i<23; $i++) {
+    $sheet->write(0, $i, "Chr$i Full", $format_heading);
+}
+$sheet->write(0, 23, "ChrX FULL", $format_heading);
+$sheet->write(0, 24, "ChrY FULL", $format_heading);
+$sheet->write(0, 25, "ChrMT FULL", $format_heading);
+$sheet->write(0, 26, "ChrUn FULL", $format_heading);
+
+dumpHistogram($sheet,
+              map { sprintf("%s-runs.chr%02dfull.histogramdat", $prefix, $_) } (0 .. 25));
+
+
diff --git a/atac-driver/statsGenerator/statsGenerator.C b/atac-driver/statsGenerator/statsGenerator.C
new file mode 100644
index 0000000..b66f123
--- /dev/null
+++ b/atac-driver/statsGenerator/statsGenerator.C
@@ -0,0 +1,755 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005, 2006 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+// Compute some simple statistics on a set of matches
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#include "atac.H"
+#include "util++.H"
+#include "bio++.H"
+#include "seqCache.H"
+
+// When true, histogram::dump() and histogram::plot() return without writing
+// anything, and NxOfMapped() skips its gnuplot step.
+bool noHistogramPlots = true;
+
+
+// qsort() comparison callback that orders uint32 values from largest to
+// smallest.
+int
+uint32compare(const void *a, const void *b) {
+  const uint32 lhs = *static_cast<const uint32 *>(a);
+  const uint32 rhs = *static_cast<const uint32 *>(b);
+
+  if (lhs == rhs)
+    return(0);
+
+  return((lhs < rhs) ? 1 : -1);
+}
+
+
+// Fixed-width histogram of lengths.  Every value added is also remembered
+// (truncated to 32 bits) so that show() can report the mean and standard
+// deviation.
+//
+//   blockSize -- width of one bucket
+//   maxSize   -- largest value that is bucketed; larger values are only
+//                counted in an overflow total
+//
+class histogram {
+public:
+  histogram(uint64 blockSize, uint64 maxSize) {
+    _b = blockSize;
+    _m = maxSize;
+    _l = 0;
+
+    uint64 buckets = maxSize / blockSize + 1;
+
+    _h = new uint32 [buckets];
+    for (uint64 i=0; i<buckets; i++)
+      _h[i] = 0;
+
+    _eLen = 0;
+    _eMax = 10240;
+    _e    = new uint32 [_eMax];
+  };
+  ~histogram() {
+    delete [] _h;
+    delete [] _e;
+  };
+
+  // Record one value.
+  void add(uint64 x) {
+    if (_eLen >= _eMax) {
+      _eMax *= 2;
+      uint32 *e = new uint32 [_eMax];
+      memcpy(e, _e, sizeof(uint32) * _eLen);
+      delete [] _e;
+      _e = e;
+    }
+    _e[_eLen++] = (uint32)x;
+
+    if (x > _m)
+      _l++;
+    else
+      _h[x/_b]++;
+  };
+
+  // Print the item count, mean and standard deviation to stdout.  An empty
+  // histogram reports zero for both statistics instead of dividing by zero.
+  void show(char const *label) {
+    double average = 0;
+    double stddev  = 0;
+
+    if (_eLen > 0) {
+      for (uint32 i=0; i<_eLen; i++)
+        average += _e[i];
+      average /= _eLen;
+
+      for (uint32 i=0; i<_eLen; i++)
+        stddev += (_e[i] - average) * (_e[i] - average);
+      stddev = sqrt(stddev / _eLen);
+    }
+
+    fprintf(stdout, "histogram %s " uint32FMT " items %8.3f average %8.3f std.dev.\n",
+            label, _eLen, average, stddev);
+  };
+
+  // Write "bucketStart count" lines to <prefix>.<label>.histogramdat, then a
+  // final ">max count" line for the overflow total.  Does nothing when
+  // noHistogramPlots is set.
+  void dump(char const *prefix, char const *label) {
+    if (noHistogramPlots)
+      return;
+
+    char filename[1024];
+    snprintf(filename, sizeof(filename), "%s.%s.histogramdat", prefix, label);
+
+    FILE *out = fopen(filename, "w");
+    if (out == 0L) {
+      fprintf(stderr, "Failed to open '%s' for writing\n", filename);
+      return;
+    }
+
+    // Inclusive bound: add() places values equal to _m in bucket _m/_b.
+    for (uint64 i=0; i<=_m / _b; i++)
+      fprintf(out, uint64FMT " " uint32FMT "\n", i * _b, _h[i]);
+    fprintf(out, ">" uint64FMT " " uint32FMT "\n", _m, _l);
+
+    fclose(out);
+  }
+
+  // Write a gnuplot script for the file produced by dump() and run gnuplot on
+  // it.  Does nothing when noHistogramPlots is set or no bucket past the
+  // first is occupied.
+  void plot(char const *prefix, char const *label) {
+
+    if (noHistogramPlots)
+      return;
+
+    // Find max's of the data
+    uint64 maxx = 0;
+    uint64 maxy = 0;
+
+    for (uint64 i=0; i<=_m / _b; i++) {
+      if (_h[i] > 0)
+        maxx = i * _b;
+      if (maxy < _h[i])
+        maxy = _h[i];
+    }
+
+    if ((maxx == 0) || (maxy == 0))
+      return;
+
+    char filename[1024];
+    snprintf(filename, sizeof(filename), "%s.%s.histogram.gnuplot", prefix, label);
+
+    FILE *out = fopen(filename, "w");
+    if (out == 0L) {
+      fprintf(stderr, "Failed to open '%s' for writing\n", filename);
+      return;
+    }
+
+    fprintf(out, "set terminal postscript color\n");
+    fprintf(out, "set output \"%s.%s.histogram.ps\"\n", prefix, label);
+    fprintf(out, "set xlabel \"length bp\"\n");
+    fprintf(out, "set ylabel \"number of matches\"\n");
+    fprintf(out, "plot [0:" uint64FMT "][0:" uint64FMT "] \"%s.%s.histogramdat\" using 2 with lines\n",
+            maxx, maxy, prefix, label);
+    fprintf(out, "set output \"%s.%s.histogram.closeup.ps\"\n", prefix, label);
+    fprintf(out, "plot [0:" uint64FMT "][0:" uint64FMT "] \"%s.%s.histogramdat\" using 2 with lines\n",
+            maxx/10, maxy, prefix, label);
+    fprintf(out, "quit\n");
+    fclose(out);
+
+    // The buffer is reused to hold the shell command.
+    snprintf(filename, sizeof(filename), "gnuplot < %s.%s.histogram.gnuplot", prefix, label);
+    if (system(filename))
+      fprintf(stderr, "Failed to execute '%s'\n", filename);
+  };
+
+
+private:
+  // Owns _h and _e, so copying would double-delete; declared, never defined.
+  histogram(histogram const &);
+  histogram &operator=(histogram const &);
+
+  uint64   _b;  // blockSize
+  uint64   _m;  // maximum element size
+  uint32   _l;  // number of things bigger than _m
+  uint32  *_h;  // the histogram
+  uint32   _eMax;
+  uint32   _eLen;
+  uint32  *_e;  // the elements -- for computing the stats;
+};
+
+
+
+// Sum of getSequenceLength() over every sequence in S.
+static
+uint64
+sumSequenceLengths(seqCache *S) {
+  uint64 total = 0;
+
+  for (uint32 i=0; i<S->getNumberOfSequences(); i++)
+    total += S->getSequenceLength(i);
+
+  return(total);
+}
+
+
+// Number of letters in S that letterToBits does not map to 0xff.
+static
+uint64
+countACGT(seqCache *S) {
+  uint64 total = 0;
+
+  for (uint32 i=0; i<S->getNumberOfSequences(); i++) {
+    seqInCore *seq = S->getSequenceInCore(i);
+    char      *s   = seq->sequence();
+
+    // Index as unsigned char: a plain char may be negative.
+    for (uint32 j=0; j<seq->sequenceLength(); j++)
+      if (letterToBits[(unsigned char)s[j]] != 0xff)
+        total++;
+  }
+
+  return(total);
+}
+
+
+// Compute the total gapped and ungapped length of the input
+// sequences, and print both to stdout.  AF is used only for the
+// labels of the two assemblies.
+//
+void
+totalLength(atacFile &AF, seqCache *A, seqCache *B) {
+  uint64 length1 = sumSequenceLengths(A);
+  uint64 length2 = sumSequenceLengths(B);
+
+  fprintf(stdout, "totalLength %s " uint64FMT " %s " uint64FMT " # all letters, including N\n",
+          AF.labelA(), length1,
+          AF.labelB(), length2);
+
+  length1 = countACGT(A);
+  length2 = countACGT(B);
+
+  fprintf(stdout, "totalLength %s " uint64FMT " %s " uint64FMT " # ACGT only\n",
+          AF.labelA(), length1,
+          AF.labelB(), length2);
+}
+
+
+
+
+// Count the letters covered by the intervals in il that letterToBits does not
+// map to 0xff.
+//
+//   il     -- intervals in the chained coordinate space described by offset;
+//             merged in place by this call
+//   offset -- offset[s] is where sequence s starts in that space
+//   A      -- cache holding the sequences
+//
+// The sequence index only ever advances, so this presumably relies on the
+// merged intervals being in increasing order -- confirm against intervalList.
+//
+uint64
+tandemRepeatACGTLength(intervalList<uint64> &il,
+                       uint64               *offset,
+                       seqCache             *A) {
+
+  // s -- the sequence
+  // i -- the interval list index
+
+  il.merge();
+  uint64 length = 0;
+  uint64 unknown[256] = {0};
+  for (uint32 i=0, s=0; i<il.numberOfIntervals(); i++) {
+    while ((offset[s + 1]) <= il.lo(i))
+      s++;
+
+    char *S = A->getSequenceInCore(s)->sequence();
+
+    uint64 lo = il.lo(i) - offset[s];
+    uint64 hi = il.hi(i) - offset[s];
+
+    for (uint64 j=lo; j < hi; j++) {
+      // A plain char may be negative, which would index before both tables.
+      unsigned char ch = (unsigned char)S[j];
+
+      if (letterToBits[ch] != 0xff)
+        length++;
+      else
+        unknown[ch]++;
+    }
+  }
+
+  //fprintf(stderr, "tandemRepeatACGTLength: "uint64FMT"\n", length);
+  //for (uint32 i=0; i<256; i++)
+  //  if (unknown[i] > 0)
+  //    fprintf(stderr, "tandemRepeatACGTLength["uint32FMT"] = "uint64FMT" (%c)\n", i, unknown[i], i);
+
+  return(length);
+}
+
+
+// Build a table of start positions that lays the sequences of F end to end
+// in one coordinate space.  The first sequence starts at 1000000 and each
+// following one starts one position past the end of the previous sequence.
+// The table has one more entry than there are sequences; the caller must
+// delete [] it.
+uint64 *
+buildOffset(seqCache *F) {
+  uint32   numSeqs = F->getNumberOfSequences();
+  uint64  *table   = new uint64 [numSeqs + 1];
+  uint64   next    = 1000000;
+
+  for (uint32 s=0; s<numSeqs; s++) {
+    table[s] = next;
+    next     = next + F->getSequenceLength(s) + 1;
+  }
+
+  table[numSeqs] = next;
+
+  return(table);
+}
+
+
+// Print the tandem repeat report for one assembly.
+//
+//   label    -- assembly name used in the heading
+//   features -- tandem repeat intervals; merged in place by this call
+//   matched  -- match intervals in the same coordinate space
+//   offset   -- start of each sequence in that space
+//   S        -- cache holding the sequences
+//
+static
+void
+reportTandemRepeats(char const           *label,
+                    intervalList<uint64> &features,
+                    intervalList<uint64> &matched,
+                    uint64               *offset,
+                    seqCache             *S) {
+  intervalList<uint64>  inMatches;
+
+  fprintf(stdout, "\nTANDEM REPEATS in %s\n", label);
+  fprintf(stdout, "numberOfItems " uint64FMT "\n", (uint64)features.numberOfIntervals());
+  fprintf(stdout, "totalLength " uint64FMT " # sum of lengths of all features\n", features.sumOfLengths());
+  features.merge();
+  fprintf(stdout, "numberOfItems " uint64FMT " # after merging overlapping regions\n", (uint64)features.numberOfIntervals());
+  fprintf(stdout, "coveredLength " uint64FMT " # sequence covered by a feature, including N\n", features.sumOfLengths());
+  fprintf(stdout, "coveredLength " uint64FMT " # sequence covered by a feature, ACGT only\n", tandemRepeatACGTLength(features, offset, S));
+  inMatches.intersect(features, matched);
+  fprintf(stdout, "numberOfItems " uint64FMT " # after merging overlapping regions, only in matches\n", (uint64)inMatches.numberOfIntervals());
+  fprintf(stdout, "inMatches " uint64FMT " # sequence covered by a feature and in a match, including N\n", inMatches.sumOfLengths());
+  fprintf(stdout, "inMatches " uint64FMT " # sequence covered by a feature and in a match, ACGT only\n", tandemRepeatACGTLength(inMatches, offset, S));
+}
+
+
+// Report how much of each assembly is covered by "tr" features, and how much
+// of that coverage also lies inside a match.
+//
+//   featuresA, featuresB -- streams of features for assemblies A and B
+//   AF                   -- supplies the matches and the assembly labels
+//   A, B                 -- caches holding the sequences
+//
+void
+tandemRepeatStats(atacFileStream   &featuresA,
+                  atacFileStream   &featuresB,
+                  atacFile         &AF,
+                  seqCache         *A,
+                  seqCache         *B) {
+  intervalList<uint64>  ifa, ifb;
+  intervalList<uint64>  ima, imb;
+
+  atacMatchList &matches = *AF.matches();
+
+  uint64 *offset1 = buildOffset(A);
+  uint64 *offset2 = buildOffset(B);
+
+  // ifa, ifb are intervalLists, storing the intervals labeled as
+  // tandem repeats.  They are using the offset[] to encode the
+  // entire sequence as one consecutive string.
+  //
+  atacFeature *f = 0L;
+  while ((f = featuresA.nextFeature("tr")) != 0L)
+    ifa.add(offset1[f->iid] + f->pos, f->len);
+  while ((f = featuresB.nextFeature("tr")) != 0L)
+    ifb.add(offset2[f->iid] + f->pos, f->len);
+
+  // ima, imb, like if?, encode the matches in one string.
+  //
+  for (uint32 m=0; m<matches.numberOfMatches(); m++) {
+    ima.add(offset1[matches[m]->iid1] + (uint64)matches[m]->pos1, (uint64)matches[m]->len1);
+    imb.add(offset2[matches[m]->iid2] + (uint64)matches[m]->pos2, (uint64)matches[m]->len2);
+  }
+
+  reportTandemRepeats(AF.labelA(), ifa, ima, offset1, A);
+  reportTandemRepeats(AF.labelB(), ifb, imb, offset2, B);
+
+  delete [] offset1;
+  delete [] offset2;
+}
+
+
+
+// Print length statistics for one list of matches: the item count, the sum of
+// match lengths on each assembly, a length histogram per assembly, and the
+// amount of sequence covered with and without N.
+//
+//   AF      -- supplies labels and the sequences used to build the offsets
+//   matches -- the list to summarise
+//   A, B    -- caches holding the sequences, used for the ACGT-only count
+//   prefix  -- prefix for the histogram files
+//
+void
+mappedLengths(atacFile &AF, atacMatchList &matches, seqCache *A, seqCache *B, char *prefix) {
+  histogram  h1(100, 1000000);
+  histogram  h2(100, 1000000);
+
+  // For the coverage to work correctly, we need to either have one
+  // intervalList per input sequence, or build a table of the chained
+  // sequence positions.
+  //
+  uint64 *offset1 = buildOffset(AF.fastaA());
+  uint64 *offset2 = buildOffset(AF.fastaB());
+
+  intervalList<uint64>  intervalA;
+  intervalList<uint64>  intervalB;
+
+  for (uint32 m=0; m<matches.numberOfMatches(); m++) {
+    intervalA.add(offset1[matches[m]->iid1] + (uint64)matches[m]->pos1, (uint64)matches[m]->len1);
+    intervalB.add(offset2[matches[m]->iid2] + (uint64)matches[m]->pos2, (uint64)matches[m]->len2);
+
+    h1.add(matches[m]->len1);
+    h2.add(matches[m]->len2);
+  }
+
+  fprintf(stdout, "numberOfItems " uint64FMT "\n", (uint64)matches.numberOfMatches());
+
+  // Before merging, so overlapping matches are counted once per match.
+  fprintf(stdout, "matchLength %s " uint64FMT " %s " uint64FMT " # Sum of lengths of sequence in matches\n",
+          AF.labelA(), (uint64)intervalA.sumOfLengths(),
+          AF.labelB(), (uint64)intervalB.sumOfLengths());
+
+  h1.show("AmatchLength");
+  h2.show("BmatchLength");
+  h1.dump(prefix, "AmatchLength");  h1.plot(prefix, "AmatchLength");
+  h2.dump(prefix, "BmatchLength");  h2.plot(prefix, "BmatchLength");
+
+  intervalA.merge();
+  intervalB.merge();
+
+  fprintf(stdout, "coveredLength %s " uint64FMT " %s " uint64FMT " # sequence covered by a match, including N\n",
+          AF.labelA(), (uint64)intervalA.sumOfLengths(),
+          AF.labelB(), (uint64)intervalB.sumOfLengths());
+
+  fprintf(stdout, "coveredLength %s " uint64FMT " %s " uint64FMT " # sequence covered by a match, ACGT only (new)\n",
+          AF.labelA(), tandemRepeatACGTLength(intervalA, offset1, A),
+          AF.labelB(), tandemRepeatACGTLength(intervalB, offset2, B));
+
+  delete [] offset1;
+  delete [] offset2;
+}
+
+
+
+// Generate an Nx table in <prefix>.Nx: for each n in 1..99, the length of the
+// match at which the running sum of match lengths, taken longest first,
+// reaches n percent of the genome size.
+//
+//   AF         -- supplies the sequences when genomeSize selects A or B
+//   matches    -- the list whose len1 values are used
+//   genomeSize -- 0 uses the total length of A, 1 the total length of B, and
+//                 any other value is used as the size itself
+//   prefix     -- prefix for the output files
+//
+// When noHistogramPlots is false, a gnuplot script is also written and run.
+//
+void
+NxOfMapped(atacFile       &AF,
+           atacMatchList  &matches,
+           uint64          genomeSize,
+           char           *prefix) {
+
+  uint32   numMatches = matches.numberOfMatches();
+  uint32  *n50        = new uint32 [numMatches];
+
+  for (uint32 i=0; i<numMatches; i++)
+    n50[i] = matches[i]->len1;
+
+  // Compute the total length of the sequence
+  uint64 totalLength = 0;
+  switch (genomeSize) {
+    case 0:
+      for (uint32 i=0; i<AF.fastaA()->getNumberOfSequences(); i++)
+        totalLength += AF.fastaA()->getSequenceLength(i);
+      break;
+    case 1:
+      for (uint32 i=0; i<AF.fastaB()->getNumberOfSequences(); i++)
+        totalLength += AF.fastaB()->getSequenceLength(i);
+      break;
+    default:
+      totalLength = genomeSize;
+      break;
+  }
+
+  // Sort the n50 list of lengths
+  qsort(n50, numMatches, sizeof(uint32), uint32compare);
+
+  char filename[1024];
+  snprintf(filename, sizeof(filename), "%s.Nx", prefix);
+
+  FILE *out = fopen(filename, "w");
+  if (out == 0L) {
+    fprintf(stderr, "Failed to open '%s' for writing\n", filename);
+    delete [] n50;
+    return;
+  }
+
+  // The limit only grows with n, so the running sum carries over from one
+  // percentile to the next instead of being recomputed from the start.
+  //
+  uint64 iter = 0;
+  uint64 sum  = 0;
+
+  for (uint64 n=1; n<100; n++) {
+    uint64 limit = totalLength / 100 * n;
+
+    while ((sum < limit) && (iter < numMatches))
+      sum += n50[iter++];
+
+    // iter is zero when the limit is zero (total length below 100) or there
+    // are no matches; report the longest match, or 0 if there is none.
+    uint32 nx = 0;
+    if (iter > 0)
+      nx = n50[iter-1];
+    else if (numMatches > 0)
+      nx = n50[0];
+
+    fprintf(out, uint64FMT " " uint32FMT "\n", n, nx);
+  }
+
+  fclose(out);
+
+  // Now plot it.
+  //
+  if (noHistogramPlots == false) {
+    snprintf(filename, sizeof(filename), "%s.Nx.gnuplot", prefix);
+    out = fopen(filename, "w");
+    if (out == 0L) {
+      fprintf(stderr, "Failed to open '%s' for writing\n", filename);
+      delete [] n50;
+      return;
+    }
+    fprintf(out, "set terminal postscript color\n");
+    fprintf(out, "set output \"%s.Nx.ps\"\n", prefix);
+    fprintf(out, "set xlabel \"N\"\n");
+    fprintf(out, "set ylabel \"match length\"\n");
+    fprintf(out, "plot \"%s.Nx\" using 2 with lines\n", prefix);
+    fclose(out);
+    snprintf(filename, sizeof(filename), "gnuplot < %s.Nx.gnuplot", prefix);
+    if (system(filename))
+      fprintf(stderr, "Failed to execute '%s'\n", filename);
+  }
+
+  delete [] n50;
+}
+
+
+// Computes the percentage of each chromosome (assumes chromosomes are A)
+// that is mapped, with and without N's.  Only the first 24 sequences of A are
+// used; a warning is printed if there are more.
+//
+//   AF      -- supplies the label of assembly A
+//   matches -- the list to summarise
+//   A       -- cache holding the chromosome sequences
+//   B       -- unused here
+//   prefix  -- prefix for the per-chromosome histogram files
+//
+void
+MappedByChromosome(atacFile      &AF,
+                   atacMatchList &matches,
+                   seqCache      *A,
+                   seqCache      *B,
+                   char          *prefix) {
+
+  uint32         maxIID1 = A->getNumberOfSequences();
+  intervalList<uint64>  *il1full;
+  intervalList<uint64>  *il1acgt;
+  histogram    **hist1full;
+  histogram    **hist1acgt;
+
+  if (A->getNumberOfSequences() > 24) {
+    fprintf(stderr, "WARNING: too many sequences to be chromosomes, only using the first 24.\n");
+    maxIID1 = 24;
+  }
+
+  // We could cache this when we compute the totalLength() above
+  uint64   *nonNlength = new uint64 [maxIID1+1];
+  for (uint32 i=0; i<maxIID1; i++) {
+    seqInCore  *S = A->getSequenceInCore(i);
+    char       *s = S->sequence();
+    nonNlength[i] = 0;
+    for (uint32 j=0; j<S->sequenceLength(); j++)
+      if (letterToBits[(unsigned char)s[j]] != 0xff)
+        nonNlength[i]++;
+  }
+
+  il1full = new intervalList<uint64> [maxIID1 + 1];
+  il1acgt = new intervalList<uint64> [maxIID1 + 1];
+
+  hist1full = new histogram * [maxIID1 + 1];
+  hist1acgt = new histogram * [maxIID1 + 1];
+
+  for (uint32 i=0; i<maxIID1; i++) {
+    hist1full[i] = new histogram(100, 1000000);
+    hist1acgt[i] = new histogram(100, 1000000);
+  }
+
+  for (uint32 m=0; m<matches.numberOfMatches(); m++) {
+    uint32 iid = matches[m]->iid1;
+
+    if (iid >= maxIID1)
+      continue;
+
+    il1full[iid].add(matches[m]->pos1, matches[m]->len1);
+    hist1full[iid]->add(matches[m]->len1);
+
+    seqInCore  *Sa = A->getSequenceInCore(iid);
+    char       *sa = Sa->sequence() + matches[m]->pos1;
+
+    // Length of the current block of letters that are not 0xff.
+    uint32      length = 0;
+
+    for (uint32 j=0; j<matches[m]->len1; j++) {
+      bool invalid = (letterToBits[(unsigned char)sa[j]] == 0xff);
+
+      if (!invalid)
+        length++;
+
+      if (length && invalid) {  // Last time we were ACGT, this time not.
+        il1acgt[iid].add(matches[m]->pos1 + j - length, length);
+        hist1acgt[iid]->add(length);
+        length = 0;
+      }
+    }
+
+    // A block that runs to the end of the match.
+    if (length) {
+      il1acgt[iid].add(matches[m]->pos1 + matches[m]->len1 - length, length);
+      hist1acgt[iid]->add(length);
+    }
+  }
+
+  for (uint32 c=0; c<maxIID1; c++) {
+    uint64  fullCov = il1full[c].sumOfLengths();
+    uint64  fullLen = A->getSequenceLength(c);
+    uint64  acgtCov = il1acgt[c].sumOfLengths();
+    uint64  acgtLen = nonNlength[c];
+
+    // Report 0% rather than dividing by an empty length.
+    double  fullPct = (fullLen > 0) ? (100.0 * fullCov / fullLen) : 0.0;
+    double  acgtPct = (acgtLen > 0) ? (100.0 * acgtCov / acgtLen) : 0.0;
+
+    fprintf(stdout, "chrCoveredLength[" uint32FMTW(2) "] %s " uint64FMT " " uint64FMT " %6.2f%% " uint64FMT " " uint64FMT " %6.2f%% # seqCov, totalSeq for both ALL and ACGTonly\n",
+            c, AF.labelA(),
+            fullCov, fullLen, fullPct,
+            acgtCov, acgtLen, acgtPct);
+  }
+
+  for (uint32 c=0; c<maxIID1; c++) {
+    char label[1024];
+
+    snprintf(label, sizeof(label), "chr" uint32FMTW(02) "full", c);
+    hist1full[c]->dump(prefix, label);
+    hist1full[c]->plot(prefix, label);
+
+    snprintf(label, sizeof(label), "chr" uint32FMTW(02) "acgt", c);
+    hist1acgt[c]->dump(prefix, label);
+    hist1acgt[c]->plot(prefix, label);
+  }
+
+  delete [] il1full;
+  delete [] il1acgt;
+  for (uint32 i=0; i<maxIID1; i++) {
+    delete hist1full[i];
+    delete hist1acgt[i];
+  }
+  delete [] hist1full;
+  delete [] hist1acgt;
+  delete [] nonNlength;
+}
+
+
+
+
+
+
+// Split the region [beg, beg+len) of S into maximal blocks of letters that
+// letterToBits does not map to 0xff.  Each block is added to IL as an
+// interval and to HI as a length.  Either IL or HI may be null, in which case
+// it is skipped.
+//
+void
+statsInACGT(seqInCore            *S,
+            uint32                beg,
+            uint32                len,
+            intervalList<uint64> *IL,
+            histogram            *HI) {
+  char     *s      = S->sequence() + beg;
+  uint32    length = 0;
+
+  for (uint32 j=0; j<len; j++) {
+    // Index as unsigned char: a plain char may be negative.
+    bool invalid = (letterToBits[(unsigned char)s[j]] == 0xff);
+
+    if (!invalid)
+      length++;
+
+    if (length && invalid) {  // Last time we were ACGT, this time not.
+      if (IL)  IL->add(beg + j - length, length);
+      if (HI)  HI->add(length);
+      length = 0;
+    }
+  }
+
+  // A block that runs to the end of the region.
+  if (length) {
+    if (IL)  IL->add(beg + len - length, length);
+    if (HI)  HI->add(length);
+  }
+}
+
+
+
+
+// Computes the amount of sequence that lies between consecutive matches of
+// the same run (same parentuid) and is therefore unmapped, with and without
+// N.  Prints the totals and dumps four histograms of the gap lengths.
+//
+//   AF     -- supplies the matches and the assembly labels
+//   A, B   -- caches holding the sequences
+//   prefix -- prefix for the histogram files
+//
+void
+unmappedInRuns(atacFile &AF, seqCache *A, seqCache *B, char *prefix) {
+
+  atacMatchList &matches = *AF.matches();
+
+  // We must sort by the location and not the parentID; when we
+  // stream through, we check that the pair of matches are in the
+  // same parent.
+  //
+  atacMatchOrder  MO(matches);
+  MO.sortA();
+
+  intervalList<uint64>  il1full, il2full;
+  intervalList<uint64>  il1acgt, il2acgt;
+
+  histogram     hist1full(100, 1000000), hist2full(100, 1000000);
+  histogram     hist1acgt(100, 1000000), hist2acgt(100, 1000000);
+
+  for (uint32 i=1; i<MO.numberOfMatches(); i++) {
+    if (strcmp(MO[i-1]->parentuid, MO[i]->parentuid) != 0)
+      continue;
+
+    // On the A side the gap always runs from the end of the previous match
+    // to the start of this one.
+    uint32  l1 = MO[i-1]->pos1 + MO[i-1]->len1;
+    uint32  r1 = MO[i]->pos1;
+    uint32  l2, r2;
+
+    if (MO[i]->fwd2 == 1) {
+      l2 = MO[i-1]->pos2 + MO[i-1]->len2;
+      r2 = MO[i]->pos2;
+    } else {
+      // Reversed on B: this match comes before the previous one.
+      l2 = MO[i]->pos2 + MO[i]->len2;
+      r2 = MO[i-1]->pos2;
+    }
+
+    // If the two matches overlap there is no gap.  Without this the unsigned
+    // subtraction wraps around to a huge length.
+    uint32  gap1 = (r1 > l1) ? (r1 - l1) : 0;
+    uint32  gap2 = (r2 > l2) ? (r2 - l2) : 0;
+
+    il1full.add(l1, gap1);
+    il2full.add(l2, gap2);
+
+    hist1full.add(gap1);
+    hist2full.add(gap2);
+
+    statsInACGT(A->getSequenceInCore(MO[i]->iid1),
+                l1,
+                gap1,
+                &il1acgt,
+                &hist1acgt);
+    statsInACGT(B->getSequenceInCore(MO[i]->iid2),
+                l2,
+                gap2,
+                &il2acgt,
+                &hist2acgt);
+  }
+
+  // Dump the stats
+
+  fprintf(stdout, "runMissingFull %s " uint64FMT " %s " uint64FMT " # sequence in run, not covered, including N\n",
+          AF.labelA(), (uint64)il1full.sumOfLengths(),
+          AF.labelB(), (uint64)il2full.sumOfLengths());
+  fprintf(stdout, "runMissingFull %s " uint64FMT " %s " uint64FMT " # sequence in run, not covered, ACGT only\n",
+          AF.labelA(), (uint64)il1acgt.sumOfLengths(),
+          AF.labelB(), (uint64)il2acgt.sumOfLengths());
+
+  hist1full.dump(prefix, "ARunMissingFull");
+  hist1full.plot(prefix, "ARunMissingFull");
+
+  hist2full.dump(prefix, "BRunMissingFull");
+  hist2full.plot(prefix, "BRunMissingFull");
+
+  hist1acgt.dump(prefix, "ARunMissingACGT");
+  hist1acgt.plot(prefix, "ARunMissingACGT");
+
+  hist2acgt.dump(prefix, "BRunMissingACGT");
+  hist2acgt.plot(prefix, "BRunMissingACGT");
+}
+
+
+
+// Entry point.  Parses the command line, loads both assemblies into memory
+// and prints statistics for the matches, runs and clumps of the atac file.
+// Every option takes a value; an option given without one, or an unknown
+// option, prints the usage message and exits with status 1.
+//
+int
+main(int argc, char **argv) {
+  uint64  genomeSize   = 0;
+  char   *atacFileName = 0L;
+  char   *prefix       = 0L;
+  char   *trFile1      = 0L;
+  char   *trFile2      = 0L;
+  char    prefixFull[1024];
+  bool    error        = false;
+
+  int arg=1;
+  while (arg < argc) {
+    // Without this check, a trailing "-g" would dereference argv[argc].
+    bool hasValue = (arg + 1 < argc);
+
+    if        ((strcmp(argv[arg], "-g") == 0) && hasValue) {
+      ++arg;
+      if        (argv[arg][0] == 'A') {
+        genomeSize = 0;
+      } else if (argv[arg][0] == 'B') {
+        genomeSize = 1;
+      } else {
+        genomeSize = strtouint64(argv[arg], 0L);
+      }
+    } else if ((strcmp(argv[arg], "-a") == 0) && hasValue) {
+      atacFileName = argv[++arg];
+    } else if ((strcmp(argv[arg], "-p") == 0) && hasValue) {
+      prefix = argv[++arg];
+    } else if ((strcmp(argv[arg], "-ta") == 0) && hasValue) {
+      trFile1 = argv[++arg];
+    } else if ((strcmp(argv[arg], "-tb") == 0) && hasValue) {
+      trFile2 = argv[++arg];
+    } else {
+      error = true;
+    }
+    arg++;
+  }
+
+  if (!atacFileName || !prefix || error) {
+    fprintf(stderr, "usage: %s -a <file.atac> -p <outprefix> [-ta trfile] [-tb trfile] [-g {A | B | g}]\n", argv[0]);
+    fprintf(stderr, " -a read input from 'file.atac'\n");
+    fprintf(stderr, " -p write stats to files prefixed with 'outprefix'\n");
+    fprintf(stderr, " -g use a genome size of g for the Nx computation, defaults to\n");
+    fprintf(stderr, " the length of the A sequence. Or use the actual length\n");
+    fprintf(stderr, " of sequence A or B.\n");
+    fprintf(stderr, " -ta read tandem repeats for A from trfile\n");
+    fprintf(stderr, " -tb read tandem repeats for B from trfile\n");
+    exit(1);
+  }
+
+  atacFile       AF(atacFileName);
+  atacMatchList &matches = *AF.matches();
+  atacMatchList &runs    = *AF.runs();
+  atacMatchList &clumps  = *AF.clumps();
+
+  // We end up using sequences a lot here, so just bite it and load them in a cache.
+  //
+  seqCache  *A = new seqCache(AF.assemblyFileA(), 0, true);
+  seqCache  *B = new seqCache(AF.assemblyFileB(), 0, true);
+
+  A->loadAllSequences();
+  B->loadAllSequences();
+
+  fprintf(stdout, "\nSEQUENCE\n");
+  totalLength(AF, A, B);
+
+  // Tandem repeat statistics need a feature file for both assemblies.
+  if (trFile1 && trFile2) {
+    atacFileStream  tr1(trFile1);
+    atacFileStream  tr2(trFile2);
+    tandemRepeatStats(tr1, tr2, AF, A, B);
+  }
+
+  // XXX unmappedInRuns only works on runs, and if we have clumps in
+  // the input it fails.
+  //
+  if ((runs.numberOfMatches() > 0) && (clumps.numberOfMatches() == 0)) {
+    fprintf(stdout, "\nMATCHES IN RUNS\n");
+    unmappedInRuns(AF, A, B, prefix);
+  }
+
+  if (matches.numberOfMatches() > 0) {
+    fprintf(stdout, "\nMATCHES\n");
+    snprintf(prefixFull, sizeof(prefixFull), "%s-matches", prefix);
+    mappedLengths(AF, matches, A, B, prefixFull);
+    NxOfMapped(AF, matches, genomeSize, prefixFull);
+    MappedByChromosome(AF, matches, A, B, prefixFull);
+  }
+
+  if (runs.numberOfMatches() > 0) {
+    fprintf(stdout, "\nRUNS\n");
+    snprintf(prefixFull, sizeof(prefixFull), "%s-runs", prefix);
+    mappedLengths(AF, runs, A, B, prefixFull);
+    NxOfMapped(AF, runs, genomeSize, prefixFull);
+    MappedByChromosome(AF, runs, A, B, prefixFull);
+  }
+
+  if (clumps.numberOfMatches() > 0) {
+    fprintf(stdout, "\nCLUMPS\n");
+    snprintf(prefixFull, sizeof(prefixFull), "%s-clumps", prefix);
+    mappedLengths(AF, clumps, A, B, prefixFull);
+    NxOfMapped(AF, clumps, genomeSize, prefixFull);
+    MappedByChromosome(AF, clumps, A, B, prefixFull);
+  }
+
+  delete A;
+  delete B;
+
+  return(0);
+}
diff --git a/atac-driver/test/uf-test-1f.atac b/atac-driver/test/uf-test-1f.atac
new file mode 100644
index 0000000..3ae3626
--- /dev/null
+++ b/atac-driver/test/uf-test-1f.atac
@@ -0,0 +1,10 @@
+! format atac 1.0
+
+#
+# contained and kill
+#
+M u 001f . A:0 30 90 1 B:0 10 90 1
+M u 666a . A:0 40 10 1 B:0 900 10 1
+M u 666b . A:0 900 20 1 B:0 40 20 1
+M u 666c . A:0 80 10 1 B:0 920 10 1
+M u 666d . A:0 920 10 1 B:0 90 10 1
diff --git a/atac-driver/test/uf-test-1r.atac b/atac-driver/test/uf-test-1r.atac
new file mode 100644
index 0000000..d4e75e2
--- /dev/null
+++ b/atac-driver/test/uf-test-1r.atac
@@ -0,0 +1,10 @@
+! format atac 1.0
+
+#
+# contained and kill, reverse
+#
+M u 001f . A:0 30 90 1 B:0 10 90 -1
+M u 666a . A:0 40 10 1 B:0 900 10 1
+M u 666b . A:0 900 20 1 B:0 10 20 1
+M u 666c . A:0 80 10 1 B:0 920 10 1
+M u 666d . A:0 920 10 1 B:0 70 10 1
diff --git a/atac-driver/test/uf-test-2.atac b/atac-driver/test/uf-test-2.atac
new file mode 100644
index 0000000..06b1bc0
--- /dev/null
+++ b/atac-driver/test/uf-test-2.atac
@@ -0,0 +1,16 @@
+! format atac 1.0
+
+# Left, right edges, forward
+#
+M u 002f . A:2 10 30 1 B:2 30 30 1
+M u 003f . A:2 60 60 1 B:2 50 60 1
+M u 004f . A:2 100 50 1 B:2 120 50 1
+M u 005f . A:2 160 40 1 B:2 160 40 1
+M u 006f . A:2 210 50 1 B:2 180 50 1
+
+# Left, right edges, reverse
+#
+M u 010r . A:3 10 70 1 B:3 30 70 -1
+M u 011r . A:3 60 60 1 B:3 140 60 -1
+M u 012r . A:3 140 90 1 B:3 210 90 -1
+M u 013r . A:3 240 120 1 B:3 240 120 -1
diff --git a/atac-driver/test/uf-test-3.atac b/atac-driver/test/uf-test-3.atac
new file mode 100644
index 0000000..819fbe1
--- /dev/null
+++ b/atac-driver/test/uf-test-3.atac
@@ -0,0 +1,27 @@
+! format atac 1.0
+
+#
+# edge effects
+#
+# anchored on the A sequence, second match on the inside, third match on the outside.
+#
+# anchored on A, i, o
+# anchored on A, o, i
+# anchored on B, i, o
+# anchored on B, o, i
+
+M u 011 . A:0 400 200 1 B:0 400 200 1
+M u 012 . A:0 400 100 1 B:0 200 100 1
+M u 013 . A:0 600 100 1 B:0 800 100 1
+#
+M u 014 . A:1 400 200 1 B:1 400 200 1
+M u 015 . A:1 300 100 1 B:1 200 100 1
+M u 016 . A:1 500 100 1 B:1 800 100 1
+#
+M u 017 . A:2 400 200 1 B:2 400 200 1
+M u 018 . A:2 200 100 1 B:2 400 100 1
+M u 019 . A:2 800 100 1 B:2 600 100 1
+#
+M u 020 . A:3 400 200 1 B:3 400 200 1
+M u 021 . A:3 200 100 1 B:3 300 100 1
+M u 022 . A:3 800 100 1 B:3 500 100 1
diff --git a/atac-driver/uniqueFilter/Make.include b/atac-driver/uniqueFilter/Make.include
new file mode 100644
index 0000000..2771d0c
--- /dev/null
+++ b/atac-driver/uniqueFilter/Make.include
@@ -0,0 +1,16 @@
+# -*- makefile -*-
+#
+# Build description for the uniqueFilter executable.
+# NOTE(review): "$/" looks like the directory of this fragment, set by the
+# including makefile -- confirm against Make.rules.
+
+# Absolute paths, with a trailing slash, of the libraries this program uses.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBSEQ/ :=$(realpath $/../../libseq/)/
+LIBATAC/ :=$(realpath $/../libatac/)/
+
+# The C++ executable built here and its source file.
+$/.CXX_EXES := $/uniqueFilter
+$/.CXX_SRCS := $/uniqueFilter.C
+
+# Files listed for cleaning.
+$/.CLEAN :=$/*.o $/*~ $/core
+
+# uniqueFilter depends on its object file and the four static libraries.
+$/uniqueFilter: $/uniqueFilter.o \
+ ${LIBATAC/}libatac.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for this directory's
+# dependency and object files.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBATAC/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/atac-driver/uniqueFilter/uniqueFilter.C b/atac-driver/uniqueFilter/uniqueFilter.C
new file mode 100644
index 0000000..824b867
--- /dev/null
+++ b/atac-driver/uniqueFilter/uniqueFilter.C
@@ -0,0 +1,842 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "util++.H"
+#include "atac.H"
+
+// Kaz Kylheku <kaz at ashi.footprints.net> library.
+#include "kazlib/dict.h"
+#include "kazlib/except.h"
+#include "kazlib/hash.h"
+#include "kazlib/list.h"
+#include "kazlib/sfx.h"
+
+// Filters out matches that have non-unique pieces. Does not discard
+// the whole match, but just trims out the non-unique section.
+//
+// Original implementation in Python by Clark Mobarry:
+//
+// sort the matches in X
+// apply the mask to the X
+// sort the matches in Y
+// apply the mask to the Y
+// output the matches
+//
+// if we keep the coverage intervals in core, we get around sorting
+// the matches. how big can they be -- especially if we only keep
+// things with > 1 coverage! But, we also can't use an elegant
+// algorithm for trimming/splitting.
+
+
+// We can abuse this to subtract matches from other matches. The
+// operation done for removing non-unique is to first find any
+// overlapping intervals in the set, then subtract those from the
+// input. If we instead just find all intervals in a set of matches,
+// and subtract those from the input, we get subtraction.
+
+
+// Reads the input, builds an interval list of the regions
+// that have coverage > 1.
+//
+// coverage1_s is one end point of a match on one sequence.
+// findCoverageIntervals() creates two per match and sequence: an
+// increment of +1 at the match start and -1 at start + length.
+// Sorted with sortCoverage1, a running sum of the increments gives
+// the coverage between consecutive end points.
+//
+struct coverage1_s {
+ // iid of the sequence the end point lies on
+ uint32 axis;
+ // position of the end point on that sequence
+ uint32 position;
+ // +1 where a match begins, -1 where it ends
+ int increment;
+};
+
+// A run of constant coverage on one sequence.  offsetsToCoverage()
+// builds these as [beg, end) with end = beg + length, and the
+// intersection tests below treat end as exclusive.  Also used as the
+// scratch query key when searching a coverageIntervals list.
+//
+struct coverage2_s {
+ // iid of the sequence the interval lies on
+ uint32 axis;
+ // first position of the interval
+ uint32 beg;
+ // one past the last position of the interval
+ uint32 end;
+ // number of matches covering the interval (0 in query keys)
+ uint32 coverage;
+};
+
+// Working copy of one match (or of a piece split from it) while it is
+// being trimmed.  The '1' fields are filled from the iid1/pos1/len1/fwd1
+// members of an atacMatch, the '2' fields from iid2/pos2/len2/fwd2.
+// A piece with len1 == 0 or len2 == 0 has been discarded.
+//
+struct match_s {
+ uint32 iid1, pos1, len1, ori1;
+ uint32 iid2, pos2, len2, ori2;
+};
+
+// Comparison function for coverage1_s end points; each argument points
+// to a pointer to the record.  Orders by axis, then position, both
+// ascending.  At the same position the larger increment sorts first, so
+// a match start (+1) comes before a match end (-1).
+//
+int
+sortCoverage1(const void *a, const void *b) {
+  const coverage1_s *lhs = *((const coverage1_s * const *)a);
+  const coverage1_s *rhs = *((const coverage1_s * const *)b);
+
+  if (lhs->axis != rhs->axis)
+    return((lhs->axis < rhs->axis) ? -1 : 1);
+
+  if (lhs->position != rhs->position)
+    return((lhs->position < rhs->position) ? -1 : 1);
+
+  if (lhs->increment != rhs->increment)
+    return((lhs->increment > rhs->increment) ? -1 : 1);
+
+  return(0);
+}
+
+
+// Orders coverage2_s intervals by axis, then by begin position; the end
+// position and the coverage are deliberately ignored, which is enough
+// for an interval list.  Each argument points to a pointer to the
+// interval.
+//
+int
+sortCoverage2(const void *a, const void *b) {
+  const coverage2_s *lhs = *((const coverage2_s * const *)a);
+  const coverage2_s *rhs = *((const coverage2_s * const *)b);
+
+  if (lhs->axis != rhs->axis)
+    return((lhs->axis < rhs->axis) ? -1 : 1);
+
+  if (lhs->beg != rhs->beg)
+    return((lhs->beg < rhs->beg) ? -1 : 1);
+
+  return(0);
+}
+
+
+// The same ordering as sortCoverage2 (axis, then begin position), for
+// callers that pass pointers to the intervals themselves instead of
+// pointers to pointers.
+//
+int
+sortCoverage3(const void *a, const void *b) {
+  const coverage2_s *lhs = (const coverage2_s *)a;
+  const coverage2_s *rhs = (const coverage2_s *)b;
+
+  if (lhs->axis != rhs->axis)
+    return((lhs->axis < rhs->axis) ? -1 : 1);
+
+  if (lhs->beg != rhs->beg)
+    return((lhs->beg < rhs->beg) ? -1 : 1);
+
+  return(0);
+}
+
+
+
+
+// An interval list, searchable
+//
+// Intervals are kept in a kazlib dictionary created with sortCoverage2
+// as its comparison function.  The coverage2_s record is the key; the
+// nodes carry no data.  Nodes and keys are obtained from palloc(), and
+// the destructor calls pfree().
+//
+// NOTE(review): stock kazlib passes the key pointers themselves to the
+// comparison function, whereas sortCoverage2 dereferences its arguments
+// twice (sortCoverage3 is the single-dereference form).  Confirm against
+// the bundled kazlib which of the two belongs here.
+//
+// NOTE(review): dict_free() releases nodes through the dictionary's own
+// node allocator, but the nodes here come from palloc().  Confirm the
+// two agree before relying on the destructor.
+//
+class coverageIntervals {
+public:
+  dict_t      *_il;
+  dict_load_t  _load;
+
+private:
+  // Allocate a dictionary node that carries a key but no data.
+  //
+  dnode_t *newNode(void) {
+    dnode_t *node = (dnode_t *)palloc(sizeof(dnode_t));
+
+    dnode_init(node, 0L);
+
+    return(node);
+  }
+
+  // Allocate and fill the key describing one interval.
+  //
+  coverage2_s *newKey(int axis, int beg, int end, int coverage) {
+    coverage2_s *cov = (coverage2_s *)palloc(sizeof(coverage2_s));
+
+    cov->axis     = axis;
+    cov->beg      = beg;
+    cov->end      = end;
+    cov->coverage = coverage;
+
+    return(cov);
+  }
+
+public:
+  coverageIntervals() {
+    _il = dict_create(DICTCOUNT_T_MAX, sortCoverage2);
+  }
+
+  ~coverageIntervals() {
+    dict_free(_il);
+    pfree();
+  }
+
+
+  // We want to return the first node that is before our thing.  Our
+  // comparison tests the start position.  If there is no first node
+  // before our thing, return the next node.  Returns 0L when the list
+  // holds neither.
+  //
+  dnode_t *lookup(void *thing) {
+    dnode_t *it = dict_upper_bound(_il, thing);
+
+    if (it == 0L)
+      it = dict_lower_bound(_il, thing);
+
+    return(it);
+  }
+
+
+  // Insert one interval at its sorted place in the tree.
+  //
+  void addInterval(int axis, int beg, int end, int coverage) {
+    dnode_t     *node = newNode();
+    coverage2_s *cov  = newKey(axis, beg, end, coverage);
+
+    dict_insert(_il, node, (void *)cov);
+  }
+
+
+  // Bulk loading: beginLoad(), then loadInterval() once per interval,
+  // then endLoad().  offsetsToCoverage() feeds the intervals in sorted
+  // order.
+  //
+  void beginLoad(void) {
+    dict_load_begin(&_load, _il);
+  }
+
+  void endLoad(void) {
+    dict_load_end(&_load);
+  }
+
+  void loadInterval(int axis, int beg, int end, int coverage) {
+    dnode_t     *node = newNode();
+    coverage2_s *cov  = newKey(axis, beg, end, coverage);
+
+    dict_load_next(&_load, node, (void *)cov);
+  }
+};
+
+
+
+
+// Convert a sorted queue of coverage1_s end points into intervals of
+// constant coverage.
+//
+//   minCov  only intervals covered at least this many times are kept
+//   I       queue of end points, already sorted with sortCoverage1
+//   L       interval list that receives the result (bulk loaded)
+//
+// Walks the end points keeping a running coverage count.  The stretch
+// between the previous end point and the current one is emitted when
+// its coverage is at least minCov and it is not empty.  Exits with an
+// error if the events are inconsistent: coverage left over when the
+// axis changes, or a decrement while the coverage is already zero.
+//
+void
+offsetsToCoverage(uint32 minCov, bigQueue *I, coverageIntervals *L) {
+  uint32  axis     = ~uint32ZERO;
+  uint32  position = ~uint32ZERO;
+  uint32  coverage = 0;
+  uint64  covered  = 0;
+
+  L->beginLoad();
+
+  speedCounter  D(" %8.0f matches treed (%8.2f matches/sec)\r", 1, 511, false);
+
+  while (I->next()) {
+    coverage1_s *cov1 = (coverage1_s *)I->get();
+
+    if ((cov1->axis != axis) && (coverage != 0)) {
+      fprintf(stderr, "Sorting error -- have coverage at the end of an axis.\n");
+      exit(1);
+    }
+
+    // Distance from the previous end point.  Only meaningful while
+    // coverage is non-zero, which implies both are on the same axis.
+    //
+    int length = cov1->position - position;
+
+    if ((coverage >= minCov) && (length > 0)) {
+      D.tick();
+      L->loadInterval(axis, position, position+length, coverage);
+      covered += length;
+    }
+
+    // Occasionally, we get stung by insisting to use unsigned
+    // numbers.  This is one of them: catch the decrement before it
+    // wraps the counter, and report the end point that caused it.
+    //
+    if ((coverage == 0) && (cov1->increment == -1)) {
+      fprintf(stderr, "Sorting error -- have negative coverage (axis=" uint32FMT " position=" uint32FMT ")!\n",
+              cov1->axis, cov1->position);
+      exit(1);
+    }
+
+    coverage += cov1->increment;
+    axis      = cov1->axis;
+    position  = cov1->position;
+  }
+
+  D.finish();
+  L->endLoad();
+
+  fprintf(stderr, "offsetsToCoverage()-- Found " uint64FMT " bases at coverage " uint32FMT " or greater.\n",
+          covered, minCov);
+}
+
+
+
+
+
+// Allocate and fill one coverage end point.  Exits if malloc() fails.
+//
+static
+coverage1_s *
+newCoverageEvent(uint32 axis, uint32 position, int increment) {
+  coverage1_s *ev = (coverage1_s *)malloc(sizeof(coverage1_s));
+
+  if (ev == 0L) {
+    fprintf(stderr, "findCoverageIntervals()-- out of memory.\n");
+    exit(1);
+  }
+
+  ev->axis      = axis;
+  ev->position  = position;
+  ev->increment = increment;
+
+  return(ev);
+}
+
+
+// Build the coverage interval lists for both sequences of every 'u'
+// match in an atac file.
+//
+//   fileName  atac file to read
+//   minCov    minimum coverage an interval needs to be kept
+//   Fint      receives the intervals on the first sequence (iid1)
+//   Rint      receives the intervals on the second sequence (iid2)
+//
+// Each match contributes a +1 event at its start and a -1 event at
+// start + length on each sequence.  The events are sorted and handed
+// to offsetsToCoverage().
+//
+// NOTE(review): the events are handed to bigQueue::add() and never
+// freed here; presumably the queue copies or owns them -- confirm
+// against bigQueue.
+//
+void
+findCoverageIntervals(char const *fileName,
+                      uint32 minCov,
+                      coverageIntervals *Fint,
+                      coverageIntervals *Rint) {
+  bigQueue F(sortCoverage1, 0L, 0L, 0L, sizeof(coverage1_s), 128, 0L);
+  bigQueue R(sortCoverage1, 0L, 0L, 0L, sizeof(coverage1_s), 128, 0L);
+
+  // Read the input file, building a bigQueue of the interval offsets
+  //
+  atacFileStream AF(fileName);
+
+  for (atacMatch *m = AF.nextMatch('u'); m; m = AF.nextMatch('u')) {
+    coverage1_s *fbeg = newCoverageEvent(m->iid1, m->pos1,            1);
+    coverage1_s *fend = newCoverageEvent(m->iid1, m->pos1 + m->len1, -1);
+    coverage1_s *rbeg = newCoverageEvent(m->iid2, m->pos2,            1);
+    coverage1_s *rend = newCoverageEvent(m->iid2, m->pos2 + m->len2, -1);
+
+    F.add(fbeg);
+    F.add(fend);
+    R.add(rbeg);
+    R.add(rend);
+  }
+
+  // Sort each bigQueue
+  //
+  F.sort();
+  R.sort();
+
+  // Convert the interval offsets into a coverage interval list
+  //
+  offsetsToCoverage(minCov, &F, Fint);
+  offsetsToCoverage(minCov, &R, Rint);
+}
+
+
+
+
+
+
+
+
+
+
+// Sanity check: verify that none of the (trimmed) matches still overlaps
+// an interval in either list.
+//
+//   matches, matchesLen  the pieces to test
+//   Fint                 intervals on the first sequence (iid1)
+//   Rint                 intervals on the second sequence (iid2)
+//   matchNumber          index of the input match, used in messages only
+//
+// Every overlap is reported on stderr and stdout; if any was found the
+// program aborts.
+//
+void
+intersectTest(match_s *matches,
+              uint32 matchesLen,
+              coverageIntervals *Fint,
+              coverageIntervals *Rint,
+              uint32 matchNumber) {
+  bool errors = false;
+
+  for (uint32 i=0; i<matchesLen; i++) {
+    uint32 end1 = matches[i].pos1 + matches[i].len1;
+    uint32 end2 = matches[i].pos2 + matches[i].len2;
+
+    coverage2_s thing;
+
+    // Query the tree for the first interval intersecting iid1
+    //
+    thing.axis     = matches[i].iid1;
+    thing.beg      = matches[i].pos1;
+    thing.end      = end1;
+    thing.coverage = 0;
+    dnode_t *node1 = Fint->lookup(&thing);
+
+    // And the same for iid2
+    //
+    thing.axis     = matches[i].iid2;
+    thing.beg      = matches[i].pos2;
+    thing.end      = end2;
+    thing.coverage = 0;
+    dnode_t *node2 = Rint->lookup(&thing);
+
+    // Keep iterating until the nodes returned from BOTH trees are
+    // empty or after our region.  The two axes advance independently,
+    // so one running out must not end the check of the other.
+    //
+    while (node1 || node2) {
+      const coverage2_s *key1 = 0L, *key2 = 0L;
+
+      bool isect1=false, before1=false;
+      bool isect2=false, before2=false;
+
+      if (node1) {
+        key1 = (const coverage2_s *)dnode_getkey(node1);
+
+        isect1 = ((key1->axis == matches[i].iid1) &&
+                  (matches[i].pos1 < key1->end) &&
+                  (key1->beg < end1));
+
+        before1 = ((key1->axis < matches[i].iid1) ||
+                   ((key1->axis == matches[i].iid1) && (key1->beg < end1)));
+      }
+
+      if (node2) {
+        key2 = (const coverage2_s *)dnode_getkey(node2);
+
+        isect2 = ((key2->axis == matches[i].iid2) &&
+                  (matches[i].pos2 < key2->end) &&
+                  (key2->beg < end2));
+
+        before2 = ((key2->axis < matches[i].iid2) ||
+                   ((key2->axis == matches[i].iid2) && (key2->beg < end2)));
+      }
+
+      if (isect1) {
+        fprintf(stderr, "Got fwd intersection on i=" uint32FMT " matchNumber=" uint32FMT "\n", i, matchNumber);
+        fprintf(stdout, "--" uint32FMT " " uint32FMT " 1 " uint32FMT " " uint32FMT " %d\n",
+                matches[i].pos1, end1,
+                matches[i].pos2, end2, matches[i].ori2 ? 1 : -1);
+        fprintf(stdout, "--key1 beg=" uint32FMT " end=" uint32FMT "\n",
+                key1->beg, key1->end);
+        errors = true;
+      }
+
+      if (isect2) {
+        fprintf(stderr, "Got rev intersection on i=" uint32FMT " matchNumber=" uint32FMT "\n", i, matchNumber);
+        fprintf(stdout, "--" uint32FMT " " uint32FMT " 1 " uint32FMT " " uint32FMT " %d\n",
+                matches[i].pos1, end1,
+                matches[i].pos2, end2, matches[i].ori2 ? 1 : -1);
+        fprintf(stdout, "--key2 beg=" uint32FMT " end=" uint32FMT "\n",
+                key2->beg, key2->end);
+        errors = true;
+      }
+
+      // If we intersected or were before, move to the next, otherwise,
+      // stop
+      //
+      if (isect1 || before1)
+        node1 = dict_next(Fint->_il, node1);
+      else
+        node1 = 0L;
+
+      if (isect2 || before2)
+        node2 = dict_next(Rint->_il, node2);
+      else
+        node2 = 0L;
+    }
+  }
+
+  if (errors)
+    abort();
+}
+
+
+
+
+
+
+
+
+
+// Format strings for debugging output: two 8-wide fields (D08D2), and a
+// key (three fields) followed by the match it is compared against (two
+// pairs of fields).
+//
+// Adjacent literals and macros are separated by whitespace; since C++11 a
+// string literal immediately followed by an identifier is read as a
+// user-defined literal suffix.
+//
+#define D08D2     uint32FMTW(8) " " uint32FMTW(8)
+#define KEY1THING "key1 = " uint32FMTW(8) " " uint32FMTW(8) " " uint32FMTW(8) " thing = " D08D2 " " D08D2 "\n"
+#define KEY2THING "key2 = " uint32FMTW(8) " " uint32FMTW(8) " " uint32FMTW(8) " thing = " D08D2 " " D08D2 "\n"
+
+// Print the command line synopsis on stderr and exit with status 1.
+//
+static
+void
+uniqueFilterUsage(char const *prog) {
+  fprintf(stderr, "usage: %s [-h] [-s subtractFile] -i inputFile\n", prog);
+  fprintf(stderr, "  -s    instead of finding regions to remove by looking\n");
+  fprintf(stderr, "        for duplicated regions in inputFile, load them\n");
+  fprintf(stderr, "        from subtractFile.\n");
+  exit(1);
+}
+
+
+// Mark a piece as discarded; pieces of zero length are skipped by the
+// trimming loops and never printed.
+//
+static
+void
+invalidateMatch(match_s &m) {
+  m.pos1 = 0;
+  m.len1 = 0;
+  m.pos2 = 0;
+  m.len2 = 0;
+}
+
+
+// Append a piece to the working list, growing the list when it is full.
+// The piece takes its sequence ids and orientations from src and has the
+// same length on both sequences.  src must not refer into the list,
+// because the list may be reallocated.
+//
+static
+void
+appendPiece(match_s *&matches, uint32 &matchesLen, uint32 &matchesMax,
+            const match_s &src, uint32 pos1, uint32 pos2, uint32 len) {
+
+  if (matchesLen >= matchesMax) {
+    uint32   biggerMax = 2 * matchesMax;
+    match_s *bigger    = new match_s [biggerMax];
+
+    memcpy(bigger, matches, sizeof(match_s) * matchesLen);
+
+    delete [] matches;
+
+    matches    = bigger;
+    matchesMax = biggerMax;
+  }
+
+  match_s &piece = matches[matchesLen++];
+
+  piece.iid1 = src.iid1;
+  piece.pos1 = pos1;
+  piece.len1 = len;
+  piece.ori1 = src.ori1;
+
+  piece.iid2 = src.iid2;
+  piece.pos2 = pos2;
+  piece.len2 = len;
+  piece.ori2 = src.ori2;
+}
+
+
+// Remove the interval 'key', given in first-sequence coordinates, from
+// every piece in the list.  The caller guarantees that key is on the
+// same sequence as the pieces.  Four cases per piece: the interval
+// covers it entirely (discard), lies strictly inside it (split in two),
+// covers its begin, or covers its end (trim).  For reverse matches
+// (fwd == false) the begin of the first sequence pairs with the end of
+// the second.  Returns true if any piece was changed.
+//
+static
+bool
+trimOnAxis1(match_s *&matches, uint32 &matchesLen, uint32 &matchesMax,
+            const coverage2_s *key, bool fwd) {
+  bool modified = false;
+
+  // Pieces appended by a split are visited by this same loop; they no
+  // longer touch the interval, so they pass through unchanged.
+  //
+  for (uint32 i=0; i<matchesLen; i++) {
+    const match_s cur = matches[i];
+
+    if (cur.len1 == 0)
+      continue;
+
+    uint32 beg = cur.pos1;
+    uint32 end = cur.pos1 + cur.len1;
+
+    if ((key->beg <= beg) && (end <= key->end)) {
+
+      // Trim the whole thing.
+      //
+      invalidateMatch(matches[i]);
+
+    } else if ((beg < key->beg) && (key->end < end)) {
+
+      // Contained.  Split it into the part left of the interval and
+      // the part right of it, then drop the original.
+      //
+      uint32 leftLen  = key->beg - beg;
+      uint32 rightLen = end - key->end;
+
+      appendPiece(matches, matchesLen, matchesMax, cur,
+                  cur.pos1,
+                  fwd ? cur.pos2 : cur.pos2 + cur.len2 - leftLen,
+                  leftLen);
+
+      appendPiece(matches, matchesLen, matchesMax, cur,
+                  key->end,
+                  fwd ? cur.pos2 + (key->end - cur.pos1) : cur.pos2,
+                  rightLen);
+
+      invalidateMatch(matches[i]);
+
+    } else if ((key->beg <= beg) && (beg < key->end)) {
+
+      // Trim the begin.
+      //
+      uint32 trimLen = key->end - beg;
+
+      matches[i].pos1 += trimLen;
+      matches[i].len1 -= trimLen;
+
+      if (fwd)
+        matches[i].pos2 += trimLen;
+      matches[i].len2 -= trimLen;
+
+    } else if ((key->beg < end) && (end <= key->end)) {
+
+      // Trim the end.
+      //
+      uint32 trimLen = end - key->beg;
+
+      matches[i].len1 -= trimLen;
+
+      if (!fwd)
+        matches[i].pos2 += trimLen;
+      matches[i].len2 -= trimLen;
+
+    } else {
+      continue;
+    }
+
+    modified = true;
+  }
+
+  return(modified);
+}
+
+
+// The counterpart of trimOnAxis1 for an interval given in
+// second-sequence coordinates.  Returns true if any piece was changed.
+//
+static
+bool
+trimOnAxis2(match_s *&matches, uint32 &matchesLen, uint32 &matchesMax,
+            const coverage2_s *key, bool fwd) {
+  bool modified = false;
+
+  for (uint32 i=0; i<matchesLen; i++) {
+    const match_s cur = matches[i];
+
+    if (cur.len2 == 0)
+      continue;
+
+    uint32 beg = cur.pos2;
+    uint32 end = cur.pos2 + cur.len2;
+
+    if ((key->beg <= beg) && (end <= key->end)) {
+
+      // Trim the whole thing.
+      //
+      invalidateMatch(matches[i]);
+
+    } else if ((beg < key->beg) && (key->end < end)) {
+
+      // Contained.  Split it.  The pieces are emitted in first-sequence
+      // order: for a reverse match the part above the interval on the
+      // second sequence is the left part on the first.
+      //
+      uint32 lowLen  = key->beg - beg;
+      uint32 highLen = end - key->end;
+
+      if (fwd) {
+        appendPiece(matches, matchesLen, matchesMax, cur,
+                    cur.pos1,
+                    cur.pos2,
+                    lowLen);
+        appendPiece(matches, matchesLen, matchesMax, cur,
+                    cur.pos1 + key->end - cur.pos2,
+                    key->end,
+                    highLen);
+      } else {
+        appendPiece(matches, matchesLen, matchesMax, cur,
+                    cur.pos1,
+                    key->end,
+                    highLen);
+        appendPiece(matches, matchesLen, matchesMax, cur,
+                    cur.pos1 + cur.pos2 + cur.len2 - key->beg,
+                    cur.pos2,
+                    lowLen);
+      }
+
+      invalidateMatch(matches[i]);
+
+    } else if ((key->beg <= beg) && (beg < key->end)) {
+
+      // Trim the begin.
+      //
+      uint32 trimLen = key->end - beg;
+
+      matches[i].pos2 += trimLen;
+      matches[i].len2 -= trimLen;
+
+      if (fwd)
+        matches[i].pos1 += trimLen;
+      matches[i].len1 -= trimLen;
+
+    } else if ((key->beg < end) && (end <= key->end)) {
+
+      // Trim the end.
+      //
+      uint32 trimLen = end - key->beg;
+
+      matches[i].len1 -= trimLen;
+
+      if (!fwd)
+        matches[i].pos1 += trimLen;
+      matches[i].len2 -= trimLen;
+
+    } else {
+      continue;
+    }
+
+    modified = true;
+  }
+
+  return(modified);
+}
+
+
+// Reads the 'u' matches of the input file, removes from each the parts
+// that fall in a masked interval on either sequence, and prints what is
+// left on stdout.  The masked intervals are the regions of the input
+// covered by two or more matches or, with -s, every region covered by a
+// match in subtractFile.
+//
+int
+main(int argc, char **argv) {
+  char *inputName    = 0L;
+  char *subtractName = 0L;
+
+  for (int arg=1; arg < argc; arg++) {
+    if        ((strcmp(argv[arg], "-i") == 0) && (arg + 1 < argc)) {
+      inputName = argv[++arg];
+    } else if ((strcmp(argv[arg], "-s") == 0) && (arg + 1 < argc)) {
+      subtractName = argv[++arg];
+    } else {
+      uniqueFilterUsage(argv[0]);
+    }
+  }
+
+  if (inputName == 0L)
+    uniqueFilterUsage(argv[0]);
+
+  coverageIntervals *Fint = new coverageIntervals;
+  coverageIntervals *Rint = new coverageIntervals;
+
+  if (subtractName)
+    findCoverageIntervals(subtractName, 1, Fint, Rint);
+  else
+    findCoverageIntervals(inputName, 2, Fint, Rint);
+
+  // The original implementation would then sort the matches in X,
+  // merge the sorted intervals and the sorted matches together, resort
+  // the matches by Y, and merge with Rcov.  That is a lot of work to
+  // avoid keeping two interval lists in memory.
+  //
+  // We build an in-core interval list for both assemblies, and
+  // then stream the matches by it.
+
+  uint32   matchesLen  = 0;
+  uint32   matchesMax  = 1024;
+  match_s *matches     = new match_s [matchesMax];
+  match_s  extent;
+
+  uint32   matchNumber = 0;
+
+  atacFileStream AF(inputName);
+
+  for (atacMatch *m = AF.nextMatch('u'); m != 0L; m = AF.nextMatch('u')) {
+
+    // The match being processed is in [0]; pieces split off while
+    // trimming are appended from [1] on.
+    //
+    matchesLen = 1;
+
+    matches[0].iid1 = m->iid1;
+    matches[0].pos1 = m->pos1;
+    matches[0].len1 = m->len1;
+    matches[0].ori1 = m->fwd1;
+    matches[0].iid2 = m->iid2;
+    matches[0].pos2 = m->pos2;
+    matches[0].len2 = m->len2;
+    matches[0].ori2 = m->fwd2;
+
+    // Save the original, we'll use this to test for intersections.
+    //
+    extent = matches[0];
+
+    bool fwd = (extent.ori1 == extent.ori2);
+
+    // Query each tree for the node to start scanning from.  A scratch
+    // interval is used as the query key.
+    //
+    coverage2_s thing;
+
+    thing.axis     = extent.iid1;
+    thing.beg      = extent.pos1;
+    thing.end      = extent.pos1 + extent.len1;
+    thing.coverage = 0;
+    dnode_t *node1start = Fint->lookup(&thing);
+    dnode_t *node1      = node1start;
+
+    thing.axis     = extent.iid2;
+    thing.beg      = extent.pos2;
+    thing.end      = extent.pos2 + extent.len2;
+    thing.coverage = 0;
+    dnode_t *node2start = Rint->lookup(&thing);
+    dnode_t *node2      = node2start;
+
+    // Keep iterating until the nodes returned from both trees are
+    // empty or after our region.  Whenever a node changes a piece,
+    // both scans restart from the beginning so that pieces split off
+    // earlier are checked against every node too.
+    //
+    while (node1 || node2) {
+      bool before1  = false;
+      bool before2  = false;
+      bool modified = false;
+
+      if (node1) {
+        const coverage2_s *key1 = (const coverage2_s *)dnode_getkey(node1);
+
+        before1 = ((key1->axis < extent.iid1) ||
+                   ((key1->axis == extent.iid1) && (key1->beg < extent.pos1 + extent.len1)));
+
+        // lookup() can hand back a node from an earlier sequence; only
+        // intervals on our own sequence may trim.
+        //
+        if ((key1->axis == extent.iid1) &&
+            (trimOnAxis1(matches, matchesLen, matchesMax, key1, fwd)))
+          modified = true;
+      }
+
+      if (node2) {
+        const coverage2_s *key2 = (const coverage2_s *)dnode_getkey(node2);
+
+        before2 = ((key2->axis < extent.iid2) ||
+                   ((key2->axis == extent.iid2) && (key2->beg < extent.pos2 + extent.len2)));
+
+        if ((key2->axis == extent.iid2) &&
+            (trimOnAxis2(matches, matchesLen, matchesMax, key2, fwd)))
+          modified = true;
+      }
+
+      // If something changed, start over.  Otherwise move to the next
+      // node while we are still before the end of the region, and stop
+      // once we are past it.
+      //
+      if (modified) {
+        node1 = node1start;
+        node2 = node2start;
+      } else {
+        if (before1)
+          node1 = dict_next(Fint->_il, node1);
+        else
+          node1 = 0L;
+
+        if (before2)
+          node2 = dict_next(Rint->_il, node2);
+        else
+          node2 = 0L;
+      }
+    }
+
+    // Nobody should be outside the extent
+    //
+    for (uint32 i=0; i<matchesLen; i++) {
+      if ((matches[i].len1 > 0) && (matches[i].len2 > 0)) {
+        if ((matches[i].pos1 < extent.pos1) ||
+            (matches[i].pos1 + matches[i].len1 > extent.pos1 + extent.len1) ||
+            (matches[i].pos2 < extent.pos2) ||
+            (matches[i].pos2 + matches[i].len2 > extent.pos2 + extent.len2)) {
+          fprintf(stderr, "match " uint32FMT " is outside the extent!\n", i);
+          abort();
+        }
+      }
+    }
+
+    // Print out all the modified matches
+    //
+    for (uint32 i=0; i<matchesLen; i++) {
+      if ((matches[i].len1 > 0) && (matches[i].len2 > 0)) {
+        fprintf(stdout, "M %s %s." uint32FMT " . %s " uint32FMT " " uint32FMT " 1 %s " uint32FMT " " uint32FMT " %d\n",
+                m->matchuid, m->parentuid, i,
+                AF.labelA(), matches[i].pos1, matches[i].len1,
+                AF.labelB(), matches[i].pos2, matches[i].len2, matches[i].ori2 ? 1 : -1);
+      }
+    }
+
+    // Check that the modified matches do not intersect anything in
+    // the tree.
+    //
+    intersectTest(matches, matchesLen, Fint, Rint, matchNumber);
+
+    matchNumber++;
+  }
+
+  // Fint and Rint stay allocated until the process exits, as before.
+  //
+  delete [] matches;
+
+  return(0);
+}
diff --git a/configure.sh b/configure.sh
new file mode 100755
index 0000000..b27fd8a
--- /dev/null
+++ b/configure.sh
@@ -0,0 +1,382 @@
+#!/bin/sh
+
+# The build system lives in a 'build' directory, either beside this
+# directory or inside it.  Unless a Makefile is already here, link
+# Make.rules and Makefile from the first of the two that has one.
+#
+if [ ! -e Makefile ] ; then
+  builddir=""
+
+  for candidate in ../build build ; do
+    if [ -e $candidate/Makefile ] ; then
+      builddir=$candidate
+      break
+    fi
+  done
+
+  if [ -z "$builddir" ] ; then
+    echo "ERROR: Couldn't find the Makefile!"
+    exit 1
+  fi
+
+  ln -s $builddir/Make.rules .
+  ln -s $builddir/Makefile .
+fi
+
+
+# If no target, try to figure out one based on uname.  This defaults to
+# the optimized target below.  "debug" and "profile" are accepted in
+# place of a target (e.g., "./configure.sh debug"); they select the
+# matching variant of the detected target.
+#
+# opts is reset here so a value inherited from the environment cannot
+# leak into the target name.  The machine name is always quoted: on some
+# systems it contains a space ("Power Macintosh").
+#
+target=$1
+opts=""
+
+case "$target" in
+  debug|profile)
+    opts="-$target"
+    target=""
+    ;;
+esac
+
+if [ -z "$target" ] ; then
+  machine=`uname -m`
+
+  case `uname` in
+    Darwin)
+      case "$machine" in
+        "Power Macintosh") target="Darwin-ppc$opts" ;;
+        x86_64)            target="Darwin-amd64$opts" ;;
+        *)                 target="Darwin-i386$opts" ;;
+      esac
+      ;;
+    FreeBSD)
+      case "$machine" in
+        amd64) target="FreeBSD-amd64$opts" ;;
+        *)     target="FreeBSD-i386$opts" ;;
+      esac
+      ;;
+    AIX)
+      target="AIX$opts"
+      ;;
+    OSF1)
+      target="OSF1$opts"
+      ;;
+    Linux)
+      case "$machine" in
+        x86_64) target="Linux-amd64$opts" ;;
+        ia64)   target="Linux-ia64$opts" ;;
+        *)      target="Linux-i686$opts" ;;
+      esac
+      ;;
+    SunOS)
+      target="solaris$opts"
+      ;;
+    *)
+      echo "ERROR: Unknown uname of `uname` -- try manual configuration."
+      exit 1
+      ;;
+  esac
+fi
+
+
+#
+# Look for the python headers.  We don't need the libraries.  This is
+# used by atac-driver/chainer only.
+#
+# $PYTHON is quoted everywhere: when no interpreter is found it is empty
+# (or a multi-word message from 'which'), and an unquoted test would
+# then wrongly report python as found.
+#
+
+PYTHON=${PYTHON:-`which python`}
+
+if [ -z "$PYTHON" ] || [ ! -x "$PYTHON" ] ; then
+  echo "WARNING: Python program not found at '$PYTHON'.  Try setting environment variable PYTHON to the location of the python interpreter."
+  WITHOUT_ATAC="atac-driver/ seatac/"
+else
+  echo "Python executable found in '$PYTHON'"
+
+  # Ask the interpreter where its headers are.  The sysconfig module is
+  # tried first; interpreters without it fall back to distutils.  Both
+  # queries use the function form of print so they run under Python 2
+  # and Python 3.
+  #
+  CFLAGS_PYTHON=`"$PYTHON" -c "import sysconfig; print(sysconfig.get_paths()['include'])" 2>/dev/null`
+
+  if [ -z "$CFLAGS_PYTHON" ] ; then
+    CFLAGS_PYTHON=`"$PYTHON" -c "from distutils import sysconfig; print(sysconfig.get_python_inc())"`
+  fi
+
+  if [ -z "$CFLAGS_PYTHON" ] || [ ! -d "$CFLAGS_PYTHON" ] ; then
+    echo "WARNING: Python development environment not found."
+    WITHOUT_ATAC="atac-driver/ seatac/"
+  else
+    echo "Python libraries found in '$CFLAGS_PYTHON'"
+  fi
+fi
+
+if [ ! -z "$WITHOUT_ATAC" ] ; then
+  echo "WARNING: Will not build ATAC."
+fi
+
+
+#
+# Decide on compilers to use.  Unfortunately, all the options are tuned for gcc/g++.
+# In particular, -m64 and -W* and -f* aren't liked by Intel compilers.
+#
+# CC and CXX from the environment are kept when set and non-empty, even
+# when they contain spaces (e.g. CC="gcc -m64").
+#
+
+CC=${CC:-gcc}
+CXX=${CXX:-g++}
+
+#
+# Emit architecture specific configurations.
+#
+
+case $target in
+ Darwin-i386|Darwin-amd64)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# OS-X, optimized
+#
+CC := $CC
+SHLIB_FLAGS := -dynamiclib -undefined dynamic_lookup
+CFLAGS_COMPILE := -Ofast -fPIC -m64 -fmessage-length=0 -D_REENTRANT -D_THREAD_SAFE -Wall -Wno-char-subscripts
+CLDFLAGS := -m64
+CLIBS :=
+CXX := $CXX
+CXXFLAGS_COMPILE := -Ofast -fPIC -m64 -fmessage-length=0 -D_REENTRANT -D_THREAD_SAFE -Wall -Wno-char-subscripts
+CXXLDFLAGS := -m64
+CXXLIBS :=
+LDFLAGS_PYTHON := -bundle -framework CoreFoundation -framework Python -dynamic
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Darwin-i386-debug|Darwin-amd64-debug)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# OS-X, debug
+#
+CC := $CC
+SHLIB_FLAGS := -dynamiclib -undefined dynamic_lookup
+CFLAGS_COMPILE := -g3 -m64 -fmessage-length=0 -D_REENTRANT -D_THREAD_SAFE -Wall -Wno-char-subscripts
+CLDFLAGS := -m64
+CLIBS :=
+CXX := $CXX
+CXXFLAGS_COMPILE := -g3 -m64 -fmessage-length=0 -D_REENTRANT -D_THREAD_SAFE -Wall -Wno-char-subscripts
+CXXLDFLAGS := -m64
+CXXLIBS :=
+LDFLAGS_PYTHON := -bundle -framework CoreFoundation -framework Python -dynamic
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ FreeBSD-amd64)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# FreeBSD, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -O3 -fPIC -pthread -D_REENTRANT -Wall -Wno-char-subscripts -mtune=native -march=native -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -lthr
+CXX := $CXX
+CXXFLAGS_COMPILE := -O3 -fPIC -pthread -D_REENTRANT -Wall -Wno-char-subscripts -mtune=native -march=native -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -lthr
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ FreeBSD-amd64-debug)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# FreeBSD, debug, warnings
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -g -pthread -D_REENTRANT -fPIC -Wall -Wno-char-subscripts -Wshadow -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings -Wconversion -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wnested-externs
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -lthr
+CXX := $CXX
+CXXFLAGS_COMPILE := -g -pthread -D_REENTRANT -fPIC -Wall -Wno-char-subscripts -Wshadow -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings -Wconversion
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -lthr
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ FreeBSD-amd64-profile)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# FreeBSD, debug, warnings
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -pg -O3 -pthread -D_REENTRANT -fPIC -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions
+CLDFLAGS := -pg -L/usr/local/lib
+CLIBS := -pthread -lthr
+CXX := $CXX
+CXXFLAGS_COMPILE := -pg -O3 -pthread -D_REENTRANT -fPIC -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions
+CXXLDFLAGS := -pg -L/usr/local/lib
+CXXLIBS := -pthread -lthr
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Linux-i686)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Linux, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -ldl
+CXX := $CXX
+CXXFLAGS_COMPILE := -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -ldl
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Linux-amd64)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Linux64, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -ldl
+CXX := $CXX
+CXXFLAGS_COMPILE := -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -ldl
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Linux-amd64-debug)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Linux64, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -m64 -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_REENTRANT -g -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -ldl
+CXX := $CXX
+CXXFLAGS_COMPILE := -m64 -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_REENTRANT -g -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -ldl
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Linux-amd64-profile)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Linux64, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -pg -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -ldl
+CXX := $CXX
+CXXFLAGS_COMPILE := -pg -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -ldl
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+ Linux-ia64)
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Linux64, optimized
+CC := $CC
+SHLIB_FLAGS := -shared
+CFLAGS_COMPILE := -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS := -L/usr/local/lib
+CLIBS := -pthread -ldl
+CXX := $CXX
+CXXFLAGS_COMPILE := -m64 -fPIC -D_REENTRANT -O3 -D_THREAD_SAFE -pthread -fmessage-length=0 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS := -L/usr/local/lib
+CXXLIBS := -pthread -ldl
+ARFLAGS := ruvs
+INSTALL/ := $target/
+EOF
+ ;;
+
+
+
+ # SUNLF needs to be set to allow for large file support on Solaris. It
+ # should be whatever the following getconf's say.
+ #
+ # getconf LFS_CFLAGS = "-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64"
+ # getconf LFS_LDFLAGS = ""
+ # getconf LFS_LFS_LIBS = ""
+ #
+ solaris)
+ echo "Solaris is UNTESTED!"
+ rm -f Make.compilers
+ cat <<EOF > Make.compilers
+# -*- makefile -*-
+# Solaris, gcc optimized
+#
+CC := $CC -m64
+SHLIB_FLAGS := -G #untested
+CFLAGS_COMPILE := -D_REENTRANT -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -O3 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CLDFLAGS :=
+CLIBS := -lpthread -lrt
+CXX := $CXX -m64
+CXXFLAGS_COMPILE := -D_REENTRANT -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -O3 -Wall -Wno-char-subscripts -funroll-loops -fexpensive-optimizations -finline-functions -fomit-frame-pointer
+CXXLDFLAGS :=
+CXXLIBS := -lpthread -lrt
+ARFLAGS := ruv
+INSTALL/ := $target/
+EOF
+ ;;
+ *)
+ echo "usage: $0 <configuration>"
+ echo " osx OS-X, optimized"
+ echo " osx-debug OS-X, debug"
+ echo ""
+ echo " freebsd FreeBSD, optimized"
+ echo " freebsd-profile FreeBSD, optimized, profiled"
+ echo " freebsd-debug FreeBSD, debug and warnings"
+ echo ""
+ echo " aix AIX, optimized"
+ echo " aix-profile AIX, optimized, profiled (NOT TESTED)"
+ echo " aix-debug AIX, debug"
+ echo ""
+ echo " tru64, compaq Tru64, optimized"
+ echo " tru64-debug, compaq-debug Tru64, debug, warnings, trapuv"
+ echo ""
+ echo " linux Linux, i686, optimized"
+ echo " linux64 Linux, Opteron, optimized"
+ echo ""
+ echo " solaris Solaris, gcc, optimized (STALE)"
+ exit
+ ;;
+esac
+
+
+cat <<EOF >> Make.compilers
+PERL := /usr/bin/perl
+.EXE :=
+.SO := .so
+.A := .a
+.O := .o
+CLD := \${CC}
+CXXLD := \${CXX}
+CCDEP := gcc -MM -MG
+CXXDEP := g++ -MM -MG
+CLIBS += -lm -lbz2
+CXXLIBS += -lm -lbz2
+PYTHON := $PYTHON
+PYTHON_H := $CFLAGS_PYTHON/Python.h
+CFLAGS_PYTHON := -I$CFLAGS_PYTHON
+WITHOUT := $WITHOUT_ATAC
+EOF
+
+echo "Configured."
+
+#cat Make.compilers
diff --git a/leaff/Make.include b/leaff/Make.include
new file mode 100644
index 0000000..f238c5a
--- /dev/null
+++ b/leaff/Make.include
@@ -0,0 +1,18 @@
+# -*- makefile -*-
+
+# Resolved paths of the three sibling library directories.
+# NOTE(review): '$/' looks like the path of this directory, supplied by
+# the makefile that includes this fragment -- confirm in Make.rules.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+
+# C++ sources and the executable built in this directory.
+$/.CXX_SRCS :=$/leaff.C $/blocks.C $/dups.C $/gc.C $/partition.C $/simseq.C $/stats.C
+$/.CXX_EXES :=$/leaff
+
+# Object files of this directory; presumably removed by the clean
+# target of the including makefile.
+$/.CLEAN :=$/*.o
+
+# leaff depends on its own objects and on the three static libraries.
+$/leaff : $/leaff.o $/blocks.o $/dups.o $/gc.o $/partition.o $/simseq.o $/stats.o \
+ ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path, but only for the
+# dependency and object files of this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+$(eval $/%.d $/%.o: CFLAGS +=-I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+
+
diff --git a/leaff/blocks.C b/leaff/blocks.C
new file mode 100644
index 0000000..e6e682e
--- /dev/null
+++ b/leaff/blocks.C
@@ -0,0 +1,50 @@
+#include "bio++.H"
+#include "seqCache.H"
+
+// Prints, for every sequence in 'filename', the alternating runs of N
+// and non-N letters.  Each run is one line on stdout,
+//
+//   base seq# beg end len
+//
+// where 'base' is the first letter of the run and 'seq#' is the
+// ordinal of the sequence.  A line starting with '.' is written after
+// the last run of every sequence.  A sequence of length zero has no
+// runs, so only its '.' line is written.
+//
+void
+dumpBlocks(char *filename) {
+
+  //  isN[c] is true when letter c is an N.  The table is always
+  //  indexed through unsigned char: plain char may be signed, and a
+  //  letter with the high bit set would otherwise index before the
+  //  start of the table.
+  //
+  bool  isN[256] = {0};
+
+  isN[(unsigned char)'n'] = true;
+  isN[(unsigned char)'N'] = true;
+
+  seqCache *F = new seqCache(filename);
+
+  for (uint32 s=0; s<F->getNumberOfSequences(); s++) {
+    seqInCore *S   = F->getSequenceInCore(s);
+    char      *seq = S->sequence();
+    uint32     len = S->sequenceLength();
+
+    //  The run currently being extended: its first letter, whether it
+    //  is a run of N, and where it started.
+    //
+    char       begseq = (len > 0) ? seq[0] : 0;
+    bool       nnn    = isN[(unsigned char)begseq];
+    uint32     begpos = 0;
+    uint32     pos    = 0;
+
+    for (pos=0; pos<len; pos++) {
+      char  cur = seq[pos];
+
+      //  Switching between N and non-N closes the current run and
+      //  starts a new one at this position.
+      //
+      if (nnn != isN[(unsigned char)cur]) {
+        fprintf(stdout, "%c "uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                begseq, s, begpos, pos, pos - begpos);
+        nnn    = isN[(unsigned char)cur];
+        begpos = pos;
+        begseq = cur;
+      }
+    }
+
+    //  The last run is still open when the sequence ends.
+    //
+    if (len > 0)
+      fprintf(stdout, "%c "uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT"\n",
+              begseq, s, begpos, pos, pos - begpos);
+
+    fprintf(stdout, ". "uint32FMT" "uint32FMT" "uint32FMT"\n", s, pos, uint32ZERO);
+
+    delete S;
+  }
+
+  delete F;
+}
+
+
diff --git a/leaff/dups.C b/leaff/dups.C
new file mode 100644
index 0000000..51d1869
--- /dev/null
+++ b/leaff/dups.C
@@ -0,0 +1,154 @@
+#include "bio++.H"
+#include "seqCache.H"
+
+
+// Returns a newly allocated array with one md5_s per sequence of F, in
+// sequence order.  Each entry holds the checksum of the sequence and,
+// in its 'i' member, the IID the sequence reports.  The caller owns
+// the array and releases it with delete [].
+//
+md5_s *
+computeMD5ForEachSequence(seqCache *F) {
+  const uint32  count = F->getNumberOfSequences();
+  md5_s        *sums  = new md5_s [count];
+  md5_s        *cur   = sums;
+
+  for (uint32 n=0; n != count; n++, cur++) {
+    seqInCore *seq = F->getSequenceInCore(n);
+
+    md5_string(cur, seq->sequence(), seq->sequenceLength());
+    cur->i = seq->getIID();
+
+    delete seq;
+  }
+
+  return(sums);
+}
+
+
+// Reports one pair of sequences whose checksums matched.  If the
+// sequences really are the same text, "IIDa <-> IIDb" goes to stdout;
+// if they differ, the checksums collided and a complaint goes to
+// stderr instead.
+//
+void
+mapDuplicates_Print(char *filea, seqInCore *sa,
+                    char *fileb, seqInCore *sb) {
+  const bool  sameText = (strcmp(sa->sequence(), sb->sequence()) == 0);
+
+  if (!sameText) {
+    fprintf(stderr, "COLLISION DETECTED BETWEEN %s:"uint32FMT" AND %s:"uint32FMT"!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
+            filea, sa->getIID(), fileb, sb->getIID());
+    return;
+  }
+
+  fprintf(stdout, uint32FMT" <-> "uint32FMT"\n", sa->getIID(), sb->getIID());
+}
+
+
+
+// Reports sequences of 'filename' that occur more than once.  The
+// checksums of all sequences are sorted, and every two neighbours in
+// the sorted order with equal checksums are compared letter by letter.
+// Identical neighbours are written to stdout as two "iid:defline"
+// lines followed by a blank line; neighbours that differ are reported
+// on stderr as a checksum collision.
+//
+void
+findDuplicates(char *filename) {
+  seqCache  *cache = new seqCache(filename);
+  uint32     total = cache->getNumberOfSequences();
+
+  fprintf(stderr, "Computing MD5's for each sequence in '%s'.\n", filename);
+  md5_s     *sums  = computeMD5ForEachSequence(cache);
+
+  fprintf(stderr, "Sorting MD5's.\n");
+  qsort(sums, total, sizeof(md5_s), md5_compare);
+
+  fprintf(stderr, "Verifying identity, and output\n");
+  for (uint32 idx=1; idx<total; idx++) {
+    md5_s *prev = sums + idx - 1;
+    md5_s *curr = sums + idx;
+
+    //  Different checksums, nothing to report for this pair.
+    if (md5_compare(prev, curr) != 0)
+      continue;
+
+    //  Every entry was made from a different sequence, so two entries
+    //  with the same iid mean the checksum array is broken.
+    if (prev->i == curr->i) {
+      fprintf(stderr, "Internal error: found two copies of the same sequence iid ("uint32FMT")!\n", curr->i);
+      exit(1);
+    }
+
+    seqInCore *first  = cache->getSequenceInCore(prev->i);
+    seqInCore *second = cache->getSequenceInCore(curr->i);
+
+    if (strcmp(first->sequence(), second->sequence()) != 0) {
+      fprintf(stderr, "COLLISION DETECTED BETWEEN IID "uint32FMT" AND "uint32FMT"!\nPLEASE REPORT THIS TO bri at walenz.org!\n",
+              prev->i, curr->i);
+    } else {
+      fprintf(stdout, uint32FMT":%s\n"uint32FMT":%s\n\n",
+              prev->i, first->header(),
+              curr->i, second->header());
+    }
+
+    delete first;
+    delete second;
+  }
+
+  delete [] sums;
+  delete cache;
+}
+
+
+
+// Writes to stdout a map "IIDa <-> IIDb" of the sequences of 'filea'
+// and 'fileb' that are identical.  The checksums of both files are
+// sorted and then merged; whenever the two current entries agree, the
+// pair is reported (see mapDuplicates_Print) together with every
+// following entry of either file that carries the same checksum.
+//
+void
+mapDuplicates(char *filea, char *fileb) {
+  fprintf(stderr, "Computing MD5's for each sequence in '%s'.\n", filea);
+  seqCache  *A = new seqCache(filea);
+  md5_s     *resultA = computeMD5ForEachSequence(A);
+
+  fprintf(stderr, "Computing MD5's for each sequence in '%s'.\n", fileb);
+  seqCache  *B = new seqCache(fileb);
+  md5_s     *resultB = computeMD5ForEachSequence(B);
+
+  uint32  numSeqsA = A->getNumberOfSequences();
+  uint32  numSeqsB = B->getNumberOfSequences();
+  uint32  idxA = 0;
+  uint32  idxB = 0;
+
+  fprintf(stderr, "Sorting MD5's.\n");
+  qsort(resultA, numSeqsA, sizeof(md5_s), md5_compare);
+  qsort(resultB, numSeqsB, sizeof(md5_s), md5_compare);
+
+  fprintf(stderr, "Finding duplicates.\n");
+  while ((idxA<numSeqsA) && (idxB<numSeqsB)) {
+    int res = md5_compare(resultA+idxA, resultB+idxB);
+
+    //  No match; advance whichever side is behind.
+    if (res < 0) {
+      idxA++;
+      continue;
+    }
+    if (res > 0) {
+      idxB++;
+      continue;
+    }
+
+    seqInCore *sa = A->getSequenceInCore(resultA[idxA].i);
+    seqInCore *sb = B->getSequenceInCore(resultB[idxB].i);
+
+    mapDuplicates_Print(filea, sa, fileb, sb);
+
+    //  While the following B sequences match the current A sequence,
+    //  output a match.  The index is tested before the entry is
+    //  compared, so the entry past the end of resultB is never read.
+    //
+    for (uint32 idxBb=idxB+1;
+         (idxBb < numSeqsB) && (md5_compare(resultA+idxA, resultB+idxBb) == 0);
+         idxBb++) {
+      seqInCore *sbb = B->getSequenceInCore(resultB[idxBb].i);
+
+      mapDuplicates_Print(filea, sa, fileb, sbb);
+
+      delete sbb;
+    }
+
+    //  And likewise for A.
+    //
+    for (uint32 idxAa=idxA+1;
+         (idxAa < numSeqsA) && (md5_compare(resultA+idxAa, resultB+idxB) == 0);
+         idxAa++) {
+      seqInCore *saa = A->getSequenceInCore(resultA[idxAa].i);
+
+      mapDuplicates_Print(filea, saa, fileb, sb);
+
+      delete saa;
+    }
+
+    delete sa;
+    delete sb;
+
+    idxA++;
+    idxB++;
+  }
+
+  //  computeMD5ForEachSequence() hands over ownership of the arrays.
+  delete [] resultA;
+  delete [] resultB;
+
+  delete A;
+  delete B;
+}
+
diff --git a/leaff/fragmenter.C b/leaff/fragmenter.C
new file mode 100644
index 0000000..689d2c8
--- /dev/null
+++ b/leaff/fragmenter.C
@@ -0,0 +1,191 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "bio++.H"
+#include "seqCache.H"
+
+// Splits a sequence into itty-bitty pieces.
+//
+// By default, splits into non-overlapping pieces of length L.
+// Pieces will not start with nor end with N, but may have embedded N's.
+//
+// All pieces will be at least L long. Most pieces will be exactly L
+// long. All pieces will be less than 2L long.
+//
+// If a piece has more than (currently) 50 N's, it will be broken --
+// the first piece and last piece will be saved, and the middle (with
+// the N's) will be discarded.
+
+
+// Writes the command line synopsis to stderr, labelled with 'name',
+// and terminates the program with exit status 1.
+//
+void
+usage(char *name) {
+  static const char  synopsis[] =
+    "usage: %s [-overlap len] -length len -input X.fasta -output Y.fasta -log T.log\n";
+
+  fprintf(stderr, synopsis, name);
+  exit(1);
+}
+
+
+// Reads every sequence of the -input file and cuts it into regions of
+// at most -length letters.  Each region is written to the -output
+// file, and one line describing it is written to the -log file.
+//
+// -length must be given and be non-zero; with a zero length no region
+// ever advances.  -overlap, if given, moves the start of every region
+// that directly follows its predecessor back by that many letters.
+//
+int
+main(int argc, char **argv) {
+  uint32      desiredLength = 0;
+  uint32      overlapLength = 0;
+  bool        beVerbose     = false;
+  seqCache   *F             = 0L;
+  FILE       *O             = 0L;
+  FILE       *L             = 0L;
+
+  uint32      fragmentIndex = 0;
+
+  int arg=1;
+  while (arg < argc) {
+
+    //  Every option except -verbose consumes the next word.  An option
+    //  given as the very last word has no value, matches nothing
+    //  below, and ends up in usage().
+    //
+    bool  hasValue = (arg + 1 < argc);
+
+    if        (hasValue && (strcmp(argv[arg], "-length") == 0)) {
+      desiredLength = strtouint32(argv[++arg], 0L);
+    } else if (hasValue && (strcmp(argv[arg], "-overlap") == 0)) {
+      overlapLength = strtouint32(argv[++arg], 0L);
+    } else if (hasValue && (strcmp(argv[arg], "-input") == 0)) {
+      F = new seqCache(argv[++arg]);
+    } else if (hasValue && (strcmp(argv[arg], "-output") == 0)) {
+      //  Failure is detected from the return value; errno is only
+      //  meaningful once fopen() has failed.
+      O = fopen(argv[++arg], "w");
+      if (O == 0L)
+        fprintf(stderr, "ERROR: Can't open output file '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else if (hasValue && (strcmp(argv[arg], "-log") == 0)) {
+      L = fopen(argv[++arg], "w");
+      if (L == 0L)
+        fprintf(stderr, "ERROR: Can't open log file '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else if (strcmp(argv[arg], "-verbose") == 0) {
+      beVerbose = true;
+    } else {
+      usage(argv[0]);
+    }
+
+    arg++;
+  }
+
+  if ((F == 0L) || (O == 0L) || (L == 0L) || (desiredLength == 0))
+    usage(argv[0]);
+
+  uint32      Bid = 0;
+  seqInCore  *B   = F->getSequenceInCore(Bid);
+
+  while (B) {
+    if (beVerbose)
+      fprintf(stderr, "working on %s\n", B->header());
+
+    char   *seq    = (char *)B->sequence();
+    uint32  seqLen = B->sequenceLength();
+
+    //  The regions found so far are sta[i] .. end[i] for i < pos; the
+    //  arrays have room for max regions and are grown when full.
+    //
+    uint32  pos = 0;
+    uint32  max = 16384;
+    uint32 *sta = new uint32 [max];
+    uint32 *end = new uint32 [max];
+
+    //  step 1: build a list of regions to output.  Scan the sequence,
+    //  making a new region if we see a significant chunk of N, or if we
+    //  hit the desiredLength.
+    //
+    uint32 s = 0;
+    uint32 e = 0;
+    while (s < seqLen) {
+
+      //  Skip any N at the start.  The bound is tested first so that
+      //  seq[s] is only read while s is inside the sequence.
+      while ((s < seqLen) && ((seq[s] == 'n') || (seq[s] == 'N')))
+        s++;
+
+      //  Construct the preliminary block, without letting s +
+      //  desiredLength wrap around.
+      //
+      e = (desiredLength < seqLen - s) ? s + desiredLength : seqLen;
+
+      fprintf(stderr, "got block1 "uint32FMT" - "uint32FMT"\n", s, e);
+
+      //  Scan from s to e, looking for significant N.  If we find it,
+      //  reset e and stop.
+      //
+      uint32 numN = 0;
+      for (uint32 i=s; i<e; i++) {
+        if ((seq[i] == 'n') || (seq[i] == 'N')) {
+          numN++;
+        } else {
+          numN = 0;
+        }
+        if (numN >= 50) {
+          e = i;
+          break;
+        }
+      }
+
+      fprintf(stderr, "got block2 "uint32FMT" - "uint32FMT"\n", s, e);
+
+      //  Back up e until we hit the first non-N.  seq[s] is not an N
+      //  here (leading N were skipped above), so e stops at s at the
+      //  latest and cannot wrap below zero.
+      if ((s < e) && ((seq[e] == 'n') || (seq[e] == 'N'))) {
+        while ((s <= e) && ((seq[e] == 'n') || (seq[e] == 'N')))
+          e--;
+        e++;
+      }
+
+      fprintf(stderr, "got block3 "uint32FMT" - "uint32FMT"\n", s, e);
+
+      //  Add this region
+      //
+      if (s > e) {
+        fprintf(stderr, "ERROR! s>e! "uint32FMT" "uint32FMT"\n", s, e);
+      }
+      if (s != e) {
+
+        //  Make room before storing, instead of writing past the end
+        //  of the arrays.
+        if (pos >= max) {
+          uint32  newMax = max * 2;
+          uint32 *newSta = new uint32 [newMax];
+          uint32 *newEnd = new uint32 [newMax];
+
+          memcpy(newSta, sta, sizeof(uint32) * pos);
+          memcpy(newEnd, end, sizeof(uint32) * pos);
+
+          delete [] sta;
+          delete [] end;
+
+          sta = newSta;
+          end = newEnd;
+          max = newMax;
+        }
+
+        fprintf(stderr, "ADD ["uint32FMTW(3)"] "uint32FMTW(9)" "uint32FMTW(9)" length "uint32FMTW(9)"\n", pos, s, e, e-s);
+        sta[pos] = s;
+        end[pos] = e;
+        pos++;
+      }
+
+      s = e;
+    }
+
+
+    //  If we're supposed to be overlapping, fiddle with the begin
+    //  position to make it so.  The begin is clamped at zero; an
+    //  overlap larger than the begin would otherwise wrap around.
+    //
+    if (overlapLength > 0) {
+      for (uint32 p=1; p<pos; p++) {
+        if (end[p-1] == sta[p]) {
+          sta[p] = (sta[p] > overlapLength) ? sta[p] - overlapLength : 0;
+          fprintf(stderr, "ADJ ["uint32FMTW(3)"] "uint32FMTW(9)" "uint32FMTW(9)" length "uint32FMTW(9)"\n",
+                  p, sta[p], end[p], end[p] - sta[p]);
+        }
+      }
+    }
+
+
+    if (beVerbose)
+      fprintf(stderr, "created "uint32FMT" regions\n", pos);
+
+
+    for (uint32 p=0; p<pos; p++) {
+
+#if 1
+      fprintf(O, "%s begin "uint32FMT" end "uint32FMT" length "uint32FMT"\n",
+              B->header(), sta[p], end[p], end[p] - sta[p]);
+      fwrite(seq+sta[p], sizeof(char), end[p] - sta[p], O);
+      fprintf(O, "\n");
+#endif
+
+      fprintf(L, uint32FMT" : "uint32FMT"["uint32FMT"-"uint32FMT"]\n",
+              fragmentIndex++,
+              B->getIID(),
+              sta[p],
+              end[p]);
+    }
+
+    delete [] sta;
+    delete [] end;
+
+    delete B;
+    B = F->getSequenceInCore(++Bid);
+  }
+
+  delete F;
+
+  fclose(L);
+  fclose(O);
+
+  return(0);
+}
diff --git a/leaff/gc.C b/leaff/gc.C
new file mode 100644
index 0000000..8387d68
--- /dev/null
+++ b/leaff/gc.C
@@ -0,0 +1,86 @@
+#include "bio++.H"
+#include "seqCache.H"
+
+
+// Number of positions of the window [i-k, i+k] that lie inside a
+// sequence of 'len' letters.  Requires i < len.
+//
+static
+uint32
+gcWindowSize(uint32 i, uint32 k, uint32 len) {
+  uint32  lo = (i > k)           ? i - k : 0;
+  uint32  hi = (len - 1 - i > k) ? i + k : len - 1;
+
+  return(hi - lo + 1);
+}
+
+
+// Reports the G+C fraction of every sequence in 'filename' over
+// sliding windows of 3, 5, 11, 51, 101, 201, 501, 1001 and 2001
+// letters centered on each position.  For every sequence a ">defline"
+// line is written to stdout, followed by one tab separated line per
+// position: the position, the numeric code of the letter, and the nine
+// fractions.  Near the ends of a sequence the window is truncated, and
+// the fraction is taken over the letters actually inside the sequence.
+//
+void
+computeGCcontent(char *filename) {
+
+  //  Half-widths of the windows; a window is 2 * halfWidth + 1 letters.
+  //
+  static const uint32  numWindows            = 9;
+  static const uint32  halfWidth[numWindows] = { 1, 2, 5, 25, 50, 100, 250, 500, 1000 };
+  static const uint32  maxHalfWidth          = 1000;
+
+  //  isGC[c] is 1 when letter c is a G or C.  Always indexed through
+  //  unsigned char so that a letter with the high bit set cannot index
+  //  before the table.
+  //
+  char  isGC[256] = {0};
+
+  isGC[(unsigned char)'c'] = 1;
+  isGC[(unsigned char)'C'] = 1;
+  isGC[(unsigned char)'g'] = 1;
+  isGC[(unsigned char)'G'] = 1;
+
+  seqCache *A = new seqCache(filename);
+
+  for (uint32 idx=0; idx < A->getNumberOfSequences(); idx++) {
+    seqInCore *S            = A->getSequenceInCore(idx);
+    char      *s            = S->sequence();
+    uint32     genomeLength = S->sequenceLength();
+
+    fprintf(stdout, ">%s\n", S->header());
+
+    //  Replace the sequence with "g or c".  We can't do this inline,
+    //  since output reports the sequence too.  The zeroed padding at
+    //  the end is important: the sums below read up to maxHalfWidth
+    //  letters past the end of the sequence without checking.
+    //
+    char *g = new char [genomeLength + maxHalfWidth];
+
+    memset(g, 0, genomeLength + maxHalfWidth);
+
+    for (uint32 i=0; i<genomeLength; i++)
+      g[i] = isGC[(unsigned char)s[i]];
+
+    //  Preload each sum with the letters left of the right edge of the
+    //  window at position zero; the edge itself is added in the loop.
+    //
+    uint32  sum[numWindows];
+
+    for (uint32 w=0; w<numWindows; w++) {
+      sum[w] = 0;
+      for (uint32 j=0; j<halfWidth[w]; j++)
+        sum[w] += g[j];
+    }
+
+    for (uint32 i=0; i<genomeLength; i++) {
+      fprintf(stdout, uint32FMT"\t"uint32FMT, i, (uint32)(unsigned char)s[i]);
+
+      for (uint32 w=0; w<numWindows; w++) {
+        uint32  k = halfWidth[w];
+
+        //  Slide the window: position i+k enters, and position i-k-1
+        //  leaves if it exists.
+        //
+        sum[w] += g[i + k];
+        if (i > k)
+          sum[w] -= g[i - k - 1];
+
+        fprintf(stdout, "\t%.2f", sum[w] / (double)gcWindowSize(i, k, genomeLength));
+      }
+
+      fprintf(stdout, "\n");
+    }
+
+    delete [] g;
+    delete    S;
+  }
+
+  delete A;
+}
+
diff --git a/leaff/leaff.C b/leaff/leaff.C
new file mode 100644
index 0000000..592a58e
--- /dev/null
+++ b/leaff/leaff.C
@@ -0,0 +1,809 @@
+#include "bio++.H"
+#include "seqCache.H"
+#include "seqStore.H"
+
+
+// Analysis functions
+//
+void dumpBlocks(char *filename);
+void stats(char *filename, uint64 refLen);
+void partitionBySize(char *prefix, uint64 partitionSize, char *filename);
+void partitionByBucket(char *prefix, uint64 partitionSize, char *filename);
+void partitionBySegment(char *prefix, uint64 numSegments, char *filename);
+void simseq(char *,char *,int,int,int,int,double);
+void computeGCcontent(char *name);
+void findDuplicates(char *filename);
+void mapDuplicates(char *filea, char *fileb);
+
+void processFile(char *filename);
+void processArray(int argc, char **argv);
+
+// Output options.  They are changed by the command line options
+// handled in processArray() and read by printSequence().
+//
+// doReverse and doComplement are toggled by -R and -C.  withDefLine is
+// toggled by -H; when false no defline is printed.  specialDefLine is
+// set by -h and, when not null, replaces the defline of the sequence.
+// withLineBreaks is the number of letters per output line; zero prints
+// the sequence on a single line.
+//
+bool doReverse = false;
+bool doComplement = false;
+bool withDefLine = true;
+char *specialDefLine = 0L;
+uint32 withLineBreaks = 0;
+
+// printSequence() passes every letter through translate[] before it
+// is printed.  The table starts out all zero here.
+// NOTE(review): it presumably has to be filled with the identity map
+// before the first sequence is printed -- confirm where that is done.
+//
+bool toUppercase = false;
+char translate[256] = {0};
+
+// The current source of sequences; replaced by every -f or -F option.
+//
+seqCache *fasta = 0L;
+
+// The range of letters to print, set by -e.  An endPos of all ones
+// stands for the length of the sequence being printed.
+//
+uint32 begPos = (uint32)0;
+uint32 endPos = ~(uint32)0;
+
+// The number of letters to print from each end of a sequence, set by
+// -ends.  All ones disables the extraction of ends.
+//
+uint32 endExtract = ~(uint32)0;
+
+// State handed to mtRandom32() by the -G and -r options.
+// NOTE(review): null here; presumably created before processArray()
+// runs -- confirm.
+//
+mt_s *mtctx = 0L;
+
+
+// Terminates the program with exit status 1 and a message on stderr
+// unless a source file has been opened.
+//
+static
+void
+failIfNoSource(void) {
+  if (fasta != 0L)
+    return;
+
+  fprintf(stderr, "No source file specified.\n");
+  exit(1);
+}
+
+// Terminates the program with exit status 1 and a message on stderr
+// if the current source file does not support random access.  A
+// source file must already be open; call failIfNoSource() first.
+//
+static
+void
+failIfNotRandomAccess(void) {
+  if (fasta->randomAccessSupported() == false)
+    fprintf(stderr, "Algorithm requires random access; source file not supported.\n"), exit(1);
+}
+
+
+// Writes the standard help text to stderr.  'program' is the name
+// shown in the usage line.  The whole text is a single format string
+// made of adjacent literals, one per output line.
+//
+static
+void
+helpStandard(char *program) {
+  fprintf(stderr,
+          "usage: %s [-f fasta-file] [options]\n"
+          "\n"
+          "SOURCE FILES\n"
+          " -f file: use sequence in 'file' (-F is also allowed for historical reasons)\n"
+          " -A file: read actions from 'file'\n"
+          "\n"
+          "SOURCE FILE EXAMINATION\n"
+          " -d: print the number of sequences in the fasta\n"
+          " -i name: print an index, labelling the source 'name'\n"
+          "\n"
+          "OUTPUT OPTIONS\n"
+          " -6 <#>: insert a newline every 60 letters\n"
+          " (if the next arg is a number, newlines are inserted every\n"
+          " n letters, e.g., -6 80. Disable line breaks with -6 0,\n"
+          " or just don't use -6!)\n"
+          " -e beg end: Print only the bases from position 'beg' to position 'end'\n"
+          " (space based, relative to the FORWARD sequence!) If\n"
+          " beg == end, then the entire sequence is printed. It is an\n"
+          " error to specify beg > end, or beg > len, or end > len.\n"
+          " -ends n Print n bases from each end of the sequence. One input\n"
+          " sequence generates two output sequences, with '_5' or '_3'\n"
+          " appended to the ID. If 2n >= length of the sequence, the\n"
+          " sequence itself is printed, no ends are extracted (they\n"
+          " overlap).\n"
+          " -C: complement the sequences\n"
+          " -H: DON'T print the defline\n"
+          " -h: Use the next word as the defline (\"-H -H\" will reset to the\n"
+          " original defline\n"
+          " -R: reverse the sequences\n"
+          " -u: uppercase all bases\n"
+          "\n"
+          "SEQUENCE SELECTION\n"
+          " -G n s l: print n randomly generated sequences, 0 < s <= length <= l\n"
+          " -L s l: print all sequences such that s <= length < l\n"
+          " -N l h: print all sequences such that l <= %% N composition < h\n"
+          " (NOTE 0.0 <= l < h < 100.0)\n"
+          " (NOTE that you cannot print sequences with 100%% N\n"
+          " This is a useful bug).\n"
+          " -q file: print sequences from the seqid list in 'file'\n"
+          " -r num: print 'num' randomly picked sequences\n"
+          " -s seqid: print the single sequence 'seqid'\n"
+          " -S f l: print all the sequences from ID 'f' to 'l' (inclusive)\n"
+          " -W: print all sequences (do the whole file)\n"
+          "\n"
+          "LONGER HELP\n"
+          " -help analysis\n"
+          " -help examples\n",
+          program);
+}
+
+
+// Writes the help text for the analysis options (those starting with
+// two dashes) to stderr.  'program' is the name shown in the usage
+// line.
+//
+static
+void
+helpAnalysis(char *program) {
+  fprintf(stderr, "usage: %s [-f <fasta-file>] [options]\n", program);
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --findduplicates a.fasta\n");
+  fprintf(stderr, " Reports sequences that are present more than once. Output\n");
+  fprintf(stderr, " is a list of pairs of deflines, separated by a newline.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --mapduplicates a.fasta b.fasta\n");
+  fprintf(stderr, " Builds a map of IIDs from a.fasta and b.fasta that have\n");
+  fprintf(stderr, " identical sequences. Format is \"IIDa <-> IIDb\"\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --md5 a.fasta:\n");
+  fprintf(stderr, " Don't print the sequence, but print the md5 checksum\n");
+  fprintf(stderr, " (of the entire sequence) followed by the entire defline.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --partition prefix [ n[gmk]bp | n ] a.fasta\n");
+  fprintf(stderr, " --partitionmap [ n[gmk]bp | n ] a.fasta\n");
+  fprintf(stderr, " Partition the sequences into roughly equal size pieces of\n");
+  fprintf(stderr, " size nbp, nkbp, nmbp or ngbp; or into n roughly equal sized\n");
+  fprintf(stderr, " partitions. Sequences larger than the partition size are\n");
+  fprintf(stderr, " in a partition by themselves. --partitionmap writes a\n");
+  fprintf(stderr, " description of the partition to stdout; --partition creates\n");
+  fprintf(stderr, " a fasta file 'prefix-###.fasta' for each partition.\n");
+  fprintf(stderr, " Example: -F some.fasta --partition parts 130mbp\n");
+  fprintf(stderr, " -F some.fasta --partition parts 16\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --segment prefix n a.fasta\n");
+  fprintf(stderr, " Splits the sequences into n files, prefix-###.fasta.\n");
+  fprintf(stderr, " Sequences are not reordered.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --gccontent a.fasta\n");
+  fprintf(stderr, " Reports the GC content over a sliding window of\n");
+  fprintf(stderr, " 3, 5, 11, 51, 101, 201, 501, 1001, 2001 bp.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --testindex a.fasta\n");
+  fprintf(stderr, " Test the index of 'file'. If index is up-to-date, leaff\n");
+  fprintf(stderr, " exits successfully, else, leaff exits with code 1. If an\n");
+  fprintf(stderr, " index file is supplied, that one is tested, otherwise, the\n");
+  fprintf(stderr, " default index file name is used.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --dumpblocks a.fasta\n");
+  fprintf(stderr, " Generates a list of the blocks of N and non-N. Output\n");
+  fprintf(stderr, " format is 'base seq# beg end len'. 'N 84 483 485 2' means\n");
+  fprintf(stderr, " that a block of 2 N's starts at space-based position 483\n");
+  fprintf(stderr, " in sequence ordinal 84. A '.' is the end of sequence\n");
+  fprintf(stderr, " marker.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --errors L N C P a.fasta\n");
+  fprintf(stderr, " For every sequence in the input file, generate new\n");
+  fprintf(stderr, " sequences including simulated sequencing errors.\n");
+  fprintf(stderr, " L -- length of the new sequence. If zero, the length\n");
+  fprintf(stderr, " of the original sequence will be used.\n");
+  fprintf(stderr, " N -- number of subsequences to generate. If L=0, all\n");
+  fprintf(stderr, " subsequences will be the same, and you should use\n");
+  fprintf(stderr, " C instead.\n");
+  fprintf(stderr, " C -- number of copies to generate. Each of the N\n");
+  fprintf(stderr, " subsequences will have C copies, each with different\n");
+  fprintf(stderr, " errors.\n");
+  fprintf(stderr, " P -- probability of an error.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " HINT: to simulate ESTs from genes, use L=500, N=10, C=10\n");
+  fprintf(stderr, " -- make C=10 sequencer runs of N=10 EST sequences\n");
+  fprintf(stderr, " of length 500bp each.\n");
+  fprintf(stderr, " to simulate mRNA from genes, use L=0, N=10, C=10\n");
+  fprintf(stderr, " to simulate reads from genomes, use L=800, N=10, C=1\n");
+  fprintf(stderr, " -- of course, N= should be increased to give the\n");
+  fprintf(stderr, " appropriate depth of coverage\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --stats a.fasta [refLen]\n");
+  fprintf(stderr, " Reports size statistics; number, N50, sum, largest.\n");
+  fprintf(stderr, " If 'refLen' is supplied, N50 is based on this size.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " --seqstore out.seqStore\n");
+  fprintf(stderr, " Converts the input file (-f) to a seqStore file.\n");
+}
+
+
+// Writes a few example command lines to stderr.  'program' is the
+// name shown in the usage line.  The whole text is a single format
+// string made of adjacent literals, one per output line.
+//
+static
+void
+helpExamples(char *program) {
+  fprintf(stderr,
+          "usage: %s [-f <fasta-file>] [options]\n"
+          "\n"
+          "Options are ORDER DEPENDENT. Sequences are printed whenever an ACTION occurs\n"
+          "on the command line. SEQUENCE OPTIONS are not reset when a sequence is printed.\n"
+          "\n"
+          "SEQUENCES are numbered starting at ZERO, not one.\n"
+          "\n"
+          " Print the first 10 bases of the fourth sequence in file 'genes':\n"
+          " -f genes -e 0 10 -s 3\n"
+          "\n"
+          " Print the first 10 bases of the fourth and fifth sequences:\n"
+          " -f genes -e 0 10 -s 3 -s 4\n"
+          "\n"
+          " Print the fourth and fifth sequences reverse complemented, and the sixth\n"
+          " sequence forward. The second set of -R -C toggle off reverse-complement:\n"
+          " -f genes -R -C -s 3 -s 4 -R -C -s 5\n"
+          "\n"
+          " Convert file 'genes' to a seqStore 'genes.seqStore'. The seqStore\n"
+          " provides better performance with the kmer tools.\n"
+          " -f genes --seqstore genes.seqStore\n",
+          program);
+}
+
+
+// Prints letters beg (inclusive) to end (exclusive) of 'seq' to
+// stdout, preceded by a ">defline" line unless deflines are disabled.
+// Nothing is printed when beg >= end.
+//
+// Every letter is mapped through translate[], and through
+// complementSymbol[] when doComplement is set; the letters are written
+// in reverse order when doReverse is set.  When withLineBreaks is not
+// zero, a newline is written after every withLineBreaks letters.
+//
+// When endExtract is set and two ends of that size fit into the range
+// without overlapping, the first and the last endExtract letters of
+// 'seq' are printed as two sequences instead, with "_5" and "_3"
+// appended to 'def'.
+//
+static
+void
+printSequence(char *def,
+              char *seq,
+              uint32 beg,
+              uint32 end) {
+
+  if (beg >= end)
+    return;
+
+  if ((endExtract != ~uint32ZERO) &&
+      (endExtract + endExtract < end - beg)) {
+    uint32  l = strlen(seq);
+
+    //  Room for the defline, the two letter suffix and the terminator;
+    //  deflines can be arbitrarily long.
+    char   *d = new char [strlen(def) + 3];
+
+    sprintf(d, "%s_5", def);
+    printSequence(d, seq, 0, endExtract);
+
+    sprintf(d, "%s_3", def);
+    printSequence(d, seq, l-endExtract, l);
+
+    delete [] d;
+
+    return;
+  }
+
+  if (specialDefLine)
+    def = specialDefLine;
+
+  if (withDefLine == false)
+    def = 0L;
+
+  uint32  limit = end - beg;
+  char   *n     = new char [limit + 1];
+
+  seq += beg;
+
+  //  The tables are indexed through unsigned char; plain char may be
+  //  signed, and a letter with the high bit set would index before the
+  //  start of the table.
+  //
+  for (uint32 i=0; i<limit; i++) {
+    unsigned char  base = (unsigned char)translate[(unsigned char)seq[i]];
+
+    if (doComplement)
+      base = (unsigned char)complementSymbol[base];
+
+    n[(doReverse) ? limit - 1 - i : i] = (char)base;
+  }
+
+  n[limit] = 0;
+
+  if (def)
+    fprintf(stdout, ">%s\n", def);
+
+  if (withLineBreaks) {
+
+    //  Output stops at the first NUL in n, as it does when the whole
+    //  string is printed with %s below.
+    //
+    uint32  outLen = strlen(n);
+
+    for (uint32 p=0; p<outLen; p += withLineBreaks) {
+      uint32  chunk = (outLen - p < withLineBreaks) ? outLen - p : withLineBreaks;
+
+      fwrite(n + p, sizeof(char), chunk, stdout);
+      fputc('\n', stdout);
+    }
+  } else {
+    fprintf(stdout, "%s\n", n);
+  }
+
+  delete [] n;
+}
+
+
+// Prints the sequence held by 'sic', using its header as the defline,
+// from begPos up to endPos.  An endPos of all ones is replaced by the
+// length of the sequence.
+//
+static
+void
+printSequence(seqInCore *sic) {
+  uint32  first = begPos;
+  uint32  last  = endPos;
+
+  if (last == ~uint32(0))
+    last = sic->sequenceLength();
+
+  printSequence(sic->header(), sic->sequence(), first, last);
+}
+
+
+// Fetches the sequence with iid 'sid' from the current source file and
+// prints it; a warning is written to stderr if there is no such
+// sequence.
+//
+static
+void
+printSequence(uint32 sid) {
+  seqInCore *found = fasta->getSequenceInCore(sid);
+
+  if (found != 0L) {
+    printSequence(found);
+  } else {
+    fprintf(stderr, "WARNING: Didn't find sequence with iid '"uint32FMT"'\n", sid);
+  }
+
+  delete found;
+}
+
+
+// Fetches the sequence identified by the string 'sid' from the current
+// source file and prints it; a warning is written to stderr if there
+// is no such sequence.
+//
+static
+void
+printSequence(char *sid) {
+  seqInCore *found = fasta->getSequenceInCore(sid);
+
+  if (found != 0L) {
+    printSequence(found);
+  } else {
+    fprintf(stderr, "WARNING: Didn't find sequence with name/iid '%s'\n", sid);
+  }
+
+  delete found;
+}
+
+
+// Prints every sequence named in the file 'name'.  The file is a list
+// of ids separated by whitespace; each id is looked up in the current
+// source file and printed in the order listed.  An id that is not
+// found produces a warning on stderr.
+//
+static
+void
+printIDsFromFile(char *name) {
+  uint32  idLen = 0;
+  uint32  idMax = 63;
+  char   *id    = new char [idMax+1];
+
+  readBuffer  B(name);
+  char        x = B.read();
+
+  //  For optimal performance, we should sort the list of ID's given
+  //  by their IID, but the user might have a good reason for wanting
+  //  them unsorted.
+
+  //  whitespaceSymbol[] is indexed through unsigned char; plain char
+  //  may be signed, and a byte with the high bit set would index
+  //  before the start of the table.
+
+  while (B.eof() == false) {
+    while (whitespaceSymbol[(unsigned char)x] && (B.eof() == false))
+      x = B.read();
+
+    if (B.eof() == false) {
+      idLen = 0;
+
+      while (!whitespaceSymbol[(unsigned char)x] && (B.eof() == false)) {
+        id[idLen++] = x;
+        x = B.read();
+
+        //  Double the buffer when it is full; the extra byte of the
+        //  allocation is for the terminator written below.
+        if (idLen >= idMax) {
+          idMax *= 2;
+          char *newid = new char [idMax+1];
+          memcpy(newid, id, sizeof(char) * idLen);
+          delete [] id;
+          id = newid;
+        }
+      }
+
+      id[idLen] = 0;
+
+      seqInCore *S = fasta->getSequenceInCore(id);
+
+      if (S == 0L)
+        fprintf(stderr, "WARNING: Didn't find sequence with name/iid '%s'\n", id);
+      else
+        printSequence(S);
+
+      //  The sequence belongs to us, as in the other printSequence()
+      //  wrappers.
+      delete S;
+    }
+  }
+
+  delete [] id;
+}
+
+
+void
+processArray(int argc, char **argv) {
+
+ int arg = 1;
+ while (arg < argc) {
+
+ if ((strcmp(argv[arg], "-f") == 0) ||
+ (strcmp(argv[arg], "-F") == 0)) {
+ delete fasta;
+ fasta = new seqCache(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-i") == 0) {
+
+ failIfNoSource();
+
+ ++arg;
+ if ((argv[arg] == 0L) || (argv[arg][0] == '-'))
+ fprintf(stderr, "ERROR: next arg to -i should be 'name', I got '%s'\n",
+ (argv[arg] == 0L) ? "(nullpointer)" : argv[arg]), exit(1);
+
+ for (uint32 s=0; s<fasta->getNumberOfSequences(); s++)
+ fprintf(stdout, "G\tseq\t%s:"uint32FMT"\t"uint32FMT"\t%s\n",
+ argv[arg], s, fasta->getSequenceLength(s), ">unimplemented");
+
+ } else if (strcmp(argv[arg], "-d") == 0) {
+ failIfNoSource();
+ printf(uint32FMT"\n", fasta->getNumberOfSequences());
+
+ } else if (strcmp(argv[arg], "-L") == 0) {
+ uint32 small = strtouint32(argv[++arg], 0L);
+ uint32 large = strtouint32(argv[++arg], 0L);
+
+ failIfNoSource();
+
+ for (uint32 s=0; s<fasta->getNumberOfSequences(); s++)
+ if ((small <= fasta->getSequenceLength(s)) && (fasta->getSequenceLength(s) < large))
+ printSequence(s);
+
+ } else if (strcmp(argv[arg], "-N") == 0) {
+ double small = atof(argv[++arg]);
+ double large = atof(argv[++arg]);
+
+ failIfNoSource();
+
+ for (uint32 s=0; s<fasta->getNumberOfSequences(); s++) {
+ seqInCore *S = fasta->getSequenceInCore(s);
+ uint32 Ns = 0;
+ uint32 len = S->sequenceLength();
+ char *seq = S->sequence();
+
+ for (uint32 i=begPos; i<len && i<endPos; i++)
+ if ((seq[i] == 'n') || (seq[i] == 'N'))
+ Ns++;
+
+ double Np = 100.0 * Ns / len;
+
+ if ((small <= Np) && (Np < large))
+ printSequence(S);
+
+ delete S;
+ }
+
+ } else if (strcmp(argv[arg], "-W") == 0) {
+ failIfNoSource();
+
+ for (uint32 s=0; s<fasta->getNumberOfSequences(); s++)
+ printSequence(s);
+
+ } else if (strcmp(argv[arg], "-G") == 0) {
+ uint32 n = strtouint32(argv[++arg], 0L);
+ uint32 s = strtouint32(argv[++arg], 0L);
+ uint32 l = strtouint32(argv[++arg], 0L);
+
+ char bases[4] = {'A', 'C', 'G', 'T'};
+ char *def = new char [1024];
+ char *seq = new char [l + 1];
+
+ if (s == 0)
+ s = 1;
+ if (s > l)
+ fprintf(stderr, "leaff: usage: -G num-seqs min-length max-length\n"), exit(1);
+
+ for (uint32 i=0; i<n; i++) {
+ uint32 j = s + ((l-s == 0) ? 0 : (mtRandom32(mtctx) % (l-s)));
+ uint32 p = 0;
+
+ while (p < j)
+ seq[p++] = bases[mtRandom32(mtctx) & 0x3];
+ seq[p] = 0;
+
+ sprintf(def, "random"uint32FMTW(06), i);
+
+ printSequence(def, seq, 0, j);
+ }
+
+ delete [] seq;
+ delete [] def;
+
+ } else if (strcmp(argv[arg], "-s") == 0) {
+ failIfNoSource();
+ failIfNotRandomAccess(); // Easy to fix, just read the first N sequences
+ printSequence(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-S") == 0) {
+ failIfNoSource();
+ failIfNotRandomAccess(); // Easy to fix, just read the first N sequences
+
+ uint32 lowID = fasta->getSequenceIID(argv[++arg]);
+ uint32 highID = fasta->getSequenceIID(argv[++arg]);
+
+ if (lowID > highID) {
+ uint32 t = lowID;
+ lowID = highID;
+ highID = t;
+ }
+
+ for (uint32 s=lowID; (s <= highID) && (s <= fasta->getNumberOfSequences()); s++)
+ printSequence(s);
+
+ } else if (strcmp(argv[arg], "-r") == 0) {
+ uint32 num = strtouint32(argv[++arg], 0L);
+
+ failIfNoSource();
+ failIfNotRandomAccess(); // Impossible to fix, or load whole thing into memory
+
+ if (num >= fasta->getNumberOfSequences())
+ num = fasta->getNumberOfSequences();
+
+ uint32 *seqs = new uint32 [fasta->getNumberOfSequences()];
+
+ for (uint32 i=0; i<fasta->getNumberOfSequences(); i++)
+ seqs[i] = i;
+
+ for (uint32 i=0; i<fasta->getNumberOfSequences(); i++) {
+ uint32 j = mtRandom32(mtctx) % (fasta->getNumberOfSequences() - i) + i;
+ uint32 t = seqs[j];
+ seqs[j] = seqs[i];
+ seqs[i] = t;
+ }
+
+ for (uint32 i=0; i<num; i++)
+ printSequence(seqs[i]);
+
+ delete [] seqs;
+
+ } else if (strcmp(argv[arg], "-q") == 0) {
+ failIfNoSource();
+ failIfNotRandomAccess(); // Impossible to fix, or load whole thing into memory
+ printIDsFromFile(argv[++arg]);
+
+ } else if (strcmp(argv[arg], "-6") == 0) {
+ withLineBreaks = 60;
+ if ((argv[arg+1] != 0L) && (argv[arg+1][0] != '-'))
+ withLineBreaks = strtouint32(argv[++arg], 0L);
+
+ } else if (strcmp(argv[arg], "-w") == 0) {
+ toUppercase = !toUppercase;
+ for (int z=0; z<256; z++)
+ translate[z] = (toUppercase) ? (char)toUpper[z] : (char)z;
+
+ } else if (strcmp(argv[arg], "-R") == 0) {
+ doReverse = !doReverse;
+
+ } else if (strcmp(argv[arg], "-C") == 0) {
+ doComplement = !doComplement;
+
+ } else if (strcmp(argv[arg], "-H") == 0) {
+ withDefLine = !withDefLine;
+ specialDefLine = 0L;
+
+ } else if (strcmp(argv[arg], "-h") == 0) {
+ withDefLine = true;
+ specialDefLine = argv[++arg];
+
+ } else if (strcmp(argv[arg], "-e") == 0) {
+ begPos = strtouint32(argv[++arg], 0L);
+ endPos = strtouint32(argv[++arg], 0L);
+
+ } else if (strcmp(argv[arg], "-ends") == 0) {
+ endExtract = strtouint32(argv[++arg], 0L);
+
+ } else if (strcmp(argv[arg], "-A") == 0) {
+ processFile(argv[++arg]);
+
+
+
+ } else if (strcmp(argv[arg], "--findduplicates") == 0) {
+ findDuplicates(argv[++arg]);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--mapduplicates") == 0) {
+ mapDuplicates(argv[arg+1], argv[arg+2]);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--md5") == 0) {
+ md5_s md5;
+ char sum[33];
+
+ fasta = new seqCache(argv[++arg]);
+
+ for (uint32 s=0; s<fasta->getNumberOfSequences(); s++) {
+ seqInCore *S = fasta->getSequenceInCore(s);
+ fprintf(stdout, "%s %s\n",
+ md5_toascii(md5_string(&md5, S->sequence(), S->sequenceLength()), sum),
+ S->header());
+ delete S;
+ }
+ delete fasta;
+ exit(0);
+
+ } else if ((strcmp(argv[arg], "--partition") == 0) ||
+ (strcmp(argv[arg], "--partitionmap") == 0)) {
+
+ char *prefix = 0L;
+ if (strcmp(argv[arg], "--partition") == 0)
+ prefix = argv[++arg];
+
+ // does the next arg end with gbp, mbp, kbp or bp? If so,
+ // partition by length, else partition into buckets.
+ //
+ int al = strlen(argv[arg+1]);
+ uint64 ps = strtouint64(argv[arg+1], 0L);
+
+ char a3 = (al<3) ? '0' : (char)toLower[argv[arg+1][al-3]];
+ char a2 = (al<2) ? '0' : (char)toLower[argv[arg+1][al-2]];
+ char a1 = (al<1) ? '0' : (char)toLower[argv[arg+1][al-1]];
+
+ // partition!
+
+ if (!isdigit(a1) || !isdigit(a2) || !isdigit(a3)) {
+ if ((a3 == 'g') && (a2 == 'b') && (a1 == 'p')) {
+ ps *= 1000000000;
+ } else if ((a3 == 'm') && (a2 == 'b') && (a1 == 'p')) {
+ ps *= 1000000;
+ } else if ((a3 == 'k') && (a2 == 'b') && (a1 == 'p')) {
+ ps *= 1000;
+ } else if (isdigit(a3) && (a2 == 'b') && (a1 == 'p')) {
+ ps *= 1;
+ } else {
+ fprintf(stderr, "Unknown partition size option '%s'\n", argv[arg+1]), exit(1);
+ }
+
+ if (ps == 0)
+ fprintf(stderr, "Unknown or zero partition size '%s'\n", argv[arg+1]), exit(1);
+ partitionBySize(prefix, ps, argv[arg+2]);
+ } else {
+ if (ps == 0)
+ fprintf(stderr, "Unknown or zero partition size '%s'\n", argv[arg+1]), exit(1);
+ partitionByBucket(prefix, ps, argv[arg+2]);
+ }
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--segment") == 0) {
+ partitionBySegment(argv[arg+1], strtouint32(argv[arg+2], 0L), argv[arg+3]);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--gccontent") == 0) {
+ computeGCcontent(argv[++arg]);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--dumpblocks") == 0) {
+ dumpBlocks(argv[++arg]);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--stats") == 0) {
+ stats(argv[arg+1], (argv[arg+2] != 0L) ? strtouint64(argv[arg+2], 0L) : 0);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--errors") == 0) {
+ int L = strtouint32(argv[++arg], 0L); // Desired length
+ int l = 0; // min of desired length, length of sequence
+ int N = strtouint32(argv[++arg], 0L); // number of copies per sequence
+ int C = strtouint32(argv[++arg], 0L); // number of mutations per copy
+ double P = atof(argv[++arg]); // probability of mutation
+ uint32 i = 0;
+
+ fasta = new seqCache(argv[++arg]);
+
+ seqInCore *S = fasta->getSequenceInCore(i++);
+ while (S) {
+ char *seq = S->sequence();
+ char *hdr = S->header();
+ int len = S->sequenceLength();
+
+ l = len;
+ if ((L > 0) && (L < len))
+ l = L;
+
+ simseq(seq, hdr, len, N, l, C, P);
+
+ delete S;
+ S = fasta->getSequenceInCore(i++);
+ }
+ delete fasta;
+ exit(0);
+
+ } else if (strcmp(argv[arg], "--seqstore") == 0) {
+ constructSeqStore(argv[++arg], fasta);
+ exit(0);
+
+ } else if (strcmp(argv[arg], "-help") == 0) {
+ if ((argv[arg+1]) && (strcmp(argv[arg+1], "analysis") == 0))
+ helpAnalysis(argv[0]);
+ else if ((argv[arg+1]) && (strcmp(argv[arg+1], "examples") == 0))
+ helpExamples(argv[0]);
+ else
+ helpStandard(argv[0]);
+ exit(0);
+
+ } else {
+ helpStandard(argv[0]);
+ fprintf(stderr, "Unknown option '%s'\n", argv[arg]);
+ exit(1);
+ }
+
+ arg++;
+ }
+
+ delete fasta;
+ fasta = 0L;
+}
+
+
+//  Read whitespace-separated options from 'filename' ("-" means stdin) and
+//  hand them to processArray() as if they had been given on the command
+//  line.  The name of the file takes the place of the program name in
+//  argv[0].
+//
+//  Exits the process if the file cannot be opened or read.
+//
+void
+processFile(char *filename) {
+  FILE *F = NULL;
+
+  if (strcmp(filename, "-") == 0) {
+    F = stdin;
+  } else {
+    //  Test the returned stream, not errno; errno is only meaningful after
+    //  fopen() has actually reported a failure.
+    errno = 0;
+    F = fopen(filename, "r");
+    if (F == NULL)
+      fprintf(stderr, "Couldn't open '%s': %s\n", filename, strerror(errno)), exit(1);
+  }
+
+  uint64 max = 16 * 1024 * 1024;
+  uint64 pos = 0;
+
+  char *data = new char [max];
+
+  //  Suck the file into 'data', growing the buffer by 16MB whenever it
+  //  fills.  The buffer is always grown when pos reaches max, so on exit
+  //  there is at least one unused byte after the data.
+
+  while (!feof(F)) {
+    errno = 0;
+    size_t got = fread(data+pos, 1, max - pos, F);
+
+    //  Use the stream error flag: a failed read that leaves errno at zero
+    //  would otherwise spin here forever, since feof() never becomes true.
+    if (ferror(F))
+      fprintf(stderr, "Couldn't read "uint64FMT" bytes from '%s': %s\n",
+              (uint64)(max-pos), filename, strerror(errno)), exit(1);
+
+    pos += got;
+
+    if (pos >= max) {
+      max += 16 * 1024 * 1024;
+      char *tmpd = new char [max];
+      memcpy(tmpd, data, pos);
+      delete [] data;
+      data = tmpd;
+    }
+  }
+
+  if (F != stdin)
+    fclose(F);
+
+  size_t len = pos;
+
+  //  Turn every whitespace character into a string terminator, and
+  //  terminate the last word too (the file need not end with whitespace).
+
+  for (size_t i=0; i<len; i++)
+    if (isspace((unsigned char)data[i]))
+      data[i] = 0;
+
+  data[len] = 0;
+
+  //  Count the words.  A word starts at any non-NUL byte that is either
+  //  the first byte or is preceded by a NUL.
+
+  int nwords = 0;
+
+  for (size_t i=0; i<len; i++)
+    if ((data[i] != 0) && ((i == 0) || (data[i-1] == 0)))
+      nwords++;
+
+  //  Build argv[].  One extra slot for the file name standing in for the
+  //  program name, and one for the terminating null pointer; processArray()
+  //  tests argv[arg+1] against null when looking for optional values.
+
+  char **argv = new char * [nwords + 2];
+  int    argc = 0;
+
+  argv[argc++] = filename;
+
+  for (size_t i=0; i<len; i++)
+    if ((data[i] != 0) && ((i == 0) || (data[i-1] == 0)))
+      argv[argc++] = data + i;
+
+  argv[argc] = 0L;
+
+  processArray(argc, argv);
+
+  delete [] argv;
+  delete [] data;
+}
+
+
+//  Entry point for leaff.  Shows the standard help and fails when no
+//  options are given; otherwise sets up the identity translation table and
+//  the random number generator, then processes the command line options.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc <= 1) {
+    helpStandard(argv[0]);
+    exit(1);
+  }
+
+  //  Every symbol initially translates to itself.
+  int sym = 0;
+  while (sym < 256) {
+    translate[sym] = (char)sym;
+    sym++;
+  }
+
+  //  Seed from the process id and the current time.
+  mtctx = mtInit(getpid() * time(NULL));
+
+  processArray(argc, argv);
+
+  delete fasta;
+
+  return(0);
+}
+
+
+
+
diff --git a/leaff/partition.C b/leaff/partition.C
new file mode 100644
index 0000000..cc44dbf
--- /dev/null
+++ b/leaff/partition.C
@@ -0,0 +1,208 @@
+#include "bio++.H"
+#include "seqCache.H"
+
+#include <math.h>
+
+// One input sequence and the partition it has been assigned to.
+struct partition_s {
+ uint32 length; // length of the sequence, as reported by getSequenceLength()
+ uint32 index; // index of the sequence in the source seqCache
+ uint32 partition; // 1-based partition number; 0 means not assigned to any partition
+};
+
+
+//  qsort() comparison for partition_s: orders by length, longest first.
+//  Entries of equal length compare equal.
+//
+static
+int
+partition_s_compare(const void *A, const void *B) {
+  uint32 lenA = ((const partition_s *)A)->length;
+  uint32 lenB = ((const partition_s *)B)->length;
+
+  if (lenA == lenB)
+    return(0);
+
+  return((lenA < lenB) ? 1 : -1);
+}
+
+
+//  Build one partition_s per sequence in F, with no partition assigned,
+//  and return the array sorted from longest to shortest sequence.  The
+//  caller owns the array and must delete [] it.
+//
+static
+partition_s *
+loadPartition(seqCache *F) {
+  uint32       numSeqs = F->getNumberOfSequences();
+  partition_s *parts   = new partition_s [numSeqs];
+  partition_s *cur     = parts;
+
+  for (uint32 idx=0; idx<numSeqs; idx++, cur++) {
+    cur->length    = F->getSequenceLength(idx);
+    cur->index     = idx;
+    cur->partition = 0;
+  }
+
+  qsort(parts, numSeqs, sizeof(partition_s), partition_s_compare);
+
+  return(parts);
+}
+
+
+//  Report a finished partitioning.
+//
+//    F       source of the sequences
+//    prefix  if not null, write partition 'o' to the fasta file
+//            'prefix-NNN.fasta'; if null, describe the partitioning on
+//            stdout instead
+//    p       the n partition assignments
+//    openP   number of partitions; partitions are numbered 1..openP
+//    n       number of entries in p
+//
+//  Entries still assigned to partition 0 are reported on stderr.  Exits the
+//  process if an output file cannot be opened.
+//
+static
+void
+outputPartition(seqCache *F,
+                char *prefix,
+                partition_s *p, uint32 openP, uint32 n) {
+  char filename[1024];
+
+  //  Check that everything has been partitioned
+  //
+  for (uint32 i=0; i<n; i++)
+    if (p[i].partition == 0)
+      fprintf(stderr, "ERROR: Failed to partition "uint32FMT"\n", i);
+
+  if (prefix) {
+
+    //  This rewrites the source fasta file into partitioned fasta files
+    //
+    for (uint32 o=1; o<=openP; o++) {
+      //  Bounded, so a long prefix cannot overrun filename[].
+      snprintf(filename, sizeof(filename), "%s-"uint32FMTW(03)".fasta", prefix, o);
+
+      //  Writing through a null stream would crash, so a failed open is
+      //  fatal rather than just reported.
+      errno = 0;
+      FILE *file = fopen(filename, "w");
+      if (file == NULL)
+        fprintf(stderr, "Couldn't open '%s' for write: %s\n", filename, strerror(errno)), exit(1);
+
+      for (uint32 i=0; i<n; i++)
+        if (p[i].partition == o) {
+          seqInCore *S = F->getSequenceInCore(p[i].index);
+          fprintf(file, ">%s\n", S->header());
+          fwrite(S->sequence(), sizeof(char), S->sequenceLength(), file);
+          fprintf(file, "\n");
+
+          if (S->sequenceLength() != p[i].length) {
+            fprintf(stderr, "Huh? '%s' "uint32FMT" != "uint32FMT"\n", S->header(), S->sequenceLength(), p[i].length);
+          }
+
+          delete S;
+        }
+
+      fclose(file);
+    }
+
+  } else {
+
+    //  This dumps the partition information to stdout: the number of
+    //  partitions, then one line per partition listing its total size and
+    //  the index and length of each member.
+    //
+    fprintf(stdout, uint32FMT"\n", openP);
+    for (uint32 o=1; o<=openP; o++) {
+      uint64 sizeP = 0;   //  64-bit; a partition can hold more than 4Gbp
+      for (uint32 i=0; i<n; i++)
+        if (p[i].partition == o)
+          sizeP += p[i].length;
+      fprintf(stdout, uint32FMT"]("uint64FMT")", o, sizeP);
+      for (uint32 i=0; i<n; i++)
+        if (p[i].partition == o)
+          fprintf(stdout, " "uint32FMT"("uint32FMT")", p[i].index, p[i].length);
+      fprintf(stdout, "\n");
+    }
+
+  }
+}
+
+
+//  Partition the sequences in 'filename' so that no partition holds more
+//  than partitionSize bases, except that a sequence longer than
+//  partitionSize gets a partition of its own.  Results are written by
+//  outputPartition(), to files if prefix is not null.
+//
+void
+partitionBySize(char *prefix, uint64 partitionSize, char *filename) {
+  seqCache     *F = new seqCache(filename);
+  uint32        n = F->getNumberOfSequences();
+  partition_s  *p = loadPartition(F);
+
+  uint32  openP = 1;  //  Currently open partition
+  uint64  sizeP = 0;  //  Size of open partition; 64-bit so it cannot wrap
+  uint32  seqsP = n;  //  Number of sequences to partition
+
+  //  For any sequences larger than partitionSize, create
+  //  partitions containing just one sequence
+  //
+  for (uint32 i=0; i<n; i++) {
+    if (p[i].length > partitionSize) {
+      p[i].partition = openP++;
+      seqsP--;
+    }
+  }
+
+  //  For the remaining, iterate through the list,
+  //  greedily placing the longest sequence that fits
+  //  into the open partition.
+  //
+  //  The test must be '<=': a sequence exactly partitionSize long is not
+  //  handled by the loop above, and with '<' it would never fit anywhere,
+  //  leaving this loop running forever.
+  //
+  while (seqsP > 0) {
+    for (uint32 i=0; i<n; i++) {
+      if ((p[i].partition == 0) &&
+          (p[i].length + sizeP <= partitionSize)) {
+        p[i].partition = openP;
+        sizeP += p[i].length;
+        seqsP--;
+      }
+    }
+
+    openP++;
+    sizeP = 0;
+  }
+
+  //  openP is one past the last partition used.
+  outputPartition(F, prefix, p, openP-1, n);
+
+  delete [] p;
+  delete    F;
+}
+
+
+//  Partition the sequences in 'filename' into partitionSize buckets (or one
+//  bucket per sequence, if there are fewer sequences than that), keeping
+//  the buckets about the same size: sequences are taken longest first and
+//  each is added to the bucket that currently holds the fewest bases.
+//
+void
+partitionByBucket(char *prefix, uint64 partitionSize, char *filename) {
+  seqCache     *F = new seqCache(filename);
+  uint32        n = F->getNumberOfSequences();
+  partition_s  *p = loadPartition(F);
+
+  if (partitionSize > n)
+    partitionSize = n;
+
+  //  The size, in bases, of each partition.  64-bit, so that a bucket
+  //  holding more than 4Gbp does not wrap and look empty.
+  //
+  uint64 *s = new uint64 [partitionSize];
+  for (uint32 i=0; i<partitionSize; i++)
+    s[i] = 0;
+
+  //  For each sequence
+  //
+  for (uint32 nextS=0; nextS<n; nextS++) {
+
+    //  find the smallest partition
+    //
+    uint32 openP = 0;
+    for (uint32 i=0; i<partitionSize; i++)
+      if (s[i] < s[openP])
+        openP = i;
+
+    //  add the next largest sequence to the open partition
+    //
+    s[openP] += p[nextS].length;
+    p[nextS].partition = openP+1;
+  }
+
+  outputPartition(F, prefix, p, (uint32)partitionSize, n);
+
+  delete [] s;
+  delete [] p;
+  delete    F;
+}
+
+
+//  Partition the sequences in 'filename' into numSegments partitions of
+//  consecutive sequences, keeping the original order.  Each partition gets
+//  ceil(n / numSegments) sequences, except that the last ones may get fewer
+//  or none.
+//
+//  Exits the process if numSegments is zero.
+//
+void
+partitionBySegment(char *prefix, uint64 numSegments, char *filename) {
+
+  //  Zero segments would divide by zero below.
+  if (numSegments == 0)
+    fprintf(stderr, "Unknown or zero number of segments\n"), exit(1);
+
+  seqCache     *F = new seqCache(filename);
+  uint32        n = F->getNumberOfSequences();
+  partition_s  *p = new partition_s [n];
+  uint32        numSeqPerPart = (uint32)ceil(n / (double)numSegments);
+
+  //  numSeqPerPart is zero only when n is zero, and then this loop does
+  //  not run.
+  for (uint32 i=0; i<n; i++) {
+    p[i].length    = F->getSequenceLength(i);
+    p[i].index     = i;
+    p[i].partition = i / numSeqPerPart + 1;
+  }
+
+  outputPartition(F, prefix, p, (uint32)numSegments, n);
+
+  delete [] p;
+  delete    F;
+}
diff --git a/leaff/simseq.C b/leaff/simseq.C
new file mode 100644
index 0000000..ced649b
--- /dev/null
+++ b/leaff/simseq.C
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "bio.h"
+
+// This is Liliana Florea's sequencing error simulator. Bri hacked
+// it to use a real RNG, and to make it work from leaff.
+
+// One operation in an edit script; operations form a singly linked list.
+typedef struct edit_script {
+ int optype; // one of MOV, SUB, INS, DEL
+ int num; // number of consecutive bases the operation applies to
+ struct edit_script *next; // next operation, or NULL at the end of the script
+} EditScript_t;
+
+// A piece of a source sequence together with the edit script to apply to it.
+typedef struct align {
+ int offset, len; // 1-based start of the piece in the sequence, and its length
+ EditScript_t *script; // first operation of the script
+} Align_t;
+
+
+
+// This guy is provided by leaff
+extern mt_s *mtctx;
+
+// RAND(x,y) returns a random integer in the range y .. y+x-1, that is, one
+// of x consecutive values starting at y. x must not be zero, since it is
+// used as a modulus.
+//
+#define RAND(x,y) (int)((y) + (mtRandom32(mtctx) % (x)))
+
+// NOTE(review): max and min evaluate their arguments twice; neither appears
+// to be used elsewhere in this file.
+#define max(x,y) ((x)>=(y) ? (x):(y))
+#define min(x,y) ((x)<=(y) ? (x):(y))
+
+// Edit script operations. When a script is printed, MOV copies bases
+// unchanged, SUB replaces bases with different ones, INS emits random
+// bases without consuming any, and DEL skips bases.
+#define MOV 3
+#define SUB 2
+#define INS 1
+#define DEL 0
+
+
+
+//  Allocate an edit script operation of the given type and length, linked
+//  in front of 'next'.  The caller releases it with free().  Exits the
+//  process if memory cannot be allocated.
+//
+EditScript_t *
+new_script(int optype, int num, EditScript_t *next) {
+  EditScript_t *newtp = (EditScript_t *)malloc(sizeof(EditScript_t));
+
+  if (newtp == NULL)
+    fprintf(stderr, "new_script: out of memory.\n"), exit(1);
+
+  newtp->optype = optype;
+  newtp->num    = num;
+  newtp->next   = next;
+
+  return newtp;
+}
+
+
+
+//  Add one operation of type in_optype (DEL, SUB or INS) to the script of
+//  'aln' at source position in_pos (1-based).  DEL and SUB modify position
+//  in_pos; INS adds an insertion next to it.
+//
+//  The script is walked while counting, in 'i', the source positions
+//  covered so far: DEL, SUB and MOV operations cover 'num' positions each,
+//  INS operations cover none.
+//
+//    - At an existing INS that sits just before position in_pos, the new
+//      operation is linked in front of it.
+//    - At a SUB or MOV that covers in_pos, that operation is split around
+//      the new one: 'l' positions stay in front of it and 'r' behind it.
+//
+//  If no operation matches, the script is left unchanged.
+//
+void
+insert(Align_t *aln, int in_pos, int in_optype) {
+  int            i, num, optype;
+  EditScript_t  *t, *tp;
+
+  for (t=aln->script, i=0, tp=NULL; t; tp=t, t=t->next) {
+    num    = t->num;
+    optype = t->optype;
+
+    switch (optype) {
+      case INS:
+        if (in_pos==i+1) {
+          if (tp)
+            tp->next = new_script(in_optype, 1, tp->next);
+          else
+            aln->script = new_script(in_optype, 1, aln->script);
+          return;
+        }
+        break;
+
+      case DEL:
+        i += num;
+        break;
+
+      case SUB:
+        //  SUB is split exactly like MOV; fall through.
+
+      case MOV:
+        if (i<in_pos && in_pos<=i+num) {
+          int l = (in_optype==INS) ? (in_pos-i) : (in_pos-i-1);
+          int r = (in_optype==INS) ? (num-l)    : (num-l-1);
+          if (l && l!=num) {
+            //  Strictly inside: keep l positions here, then the new
+            //  operation, then the remaining r positions.
+            t->num = l; tp = t;
+            tp->next = new_script(in_optype, 1, tp->next);
+            tp = tp->next;
+            tp->next = new_script(optype, r, tp->next);
+          } else if (!l) {
+            //  At the first position: the new operation goes in front.
+            if (tp)
+              tp->next = new_script(in_optype, 1, t);
+            else
+              aln->script = new_script(in_optype, 1, aln->script);
+            if (in_optype!=INS) t->num -= 1;
+          } else {
+            //  l == num: the new operation goes right behind.
+            tp = t;
+            tp->next = new_script(in_optype, 1, tp->next);
+            if (in_optype!=INS) t->num -= 1;
+          }
+          return;
+        }
+        i += num;
+        break;
+
+      default:
+        //  Report the type found in the script, which is what the switch
+        //  failed to recognize.
+        fprintf(stderr, "Unrecognized optype (%d).\n", optype);
+        break;
+    }
+  }
+}
+
+
+//  Write one simulated read to stdout as a fasta record.
+//
+//    seq   the source sequence, NUL terminated
+//    hdr   the source header; only the part before the first whitespace is
+//          used
+//    aln   the piece of seq to use, and the edit script to apply to it
+//    P     error rate, reported in the header
+//    CUT   0-based number of the piece, reported as 'seq='
+//    COPY  0-based number of the copy of that piece, reported as 'copy='
+//
+//  The script is applied in order: MOV copies bases, SUB replaces each base
+//  with a random different one ('A' if the base is not A, C, G or T), INS
+//  emits random bases, DEL skips bases.  Output stops when the end of seq
+//  is reached.  DEL operations have their 'num' consumed.
+//
+void
+print_simseq(char *seq, char *hdr, Align_t *aln, double P, int CUT, int COPY) {
+  int            k, e;
+  char          *s;
+  static const char let_4[4]  = {'A','C','G','T'};
+  static const char let_3A[3] = {'C','G','T'};
+  static const char let_3C[3] = {'A','G','T'};
+  static const char let_3G[3] = {'A','C','T'};
+  static const char let_3T[3] = {'A','C','G'};
+  EditScript_t  *t;
+
+  fprintf(stdout, ">");
+
+  while ((*hdr) && !isspace((unsigned char)*hdr))
+    fprintf(stdout, "%c", *hdr++);
+
+  fprintf(stdout, ":seq=%d:copy=%d:loc=%d-%d:err=%1.2f\n", CUT+1, COPY+1, aln->offset, aln->offset+aln->len-1, P);
+
+  s = seq + aln->offset-1;
+
+  for (t=aln->script; t; t=t->next) {
+    if (*s == 0)
+      break;
+
+    switch (t->optype) {
+      case INS:
+        for (k=0; k<t->num; k++) {
+          e = RAND(4,0);
+          fprintf(stdout, "%c", let_4[e]);
+        }
+        break;
+
+      case DEL:
+        while (*s && t->num) {
+          s++;
+          t->num--;
+        }
+        break;
+
+      case SUB:
+        //  Like MOV and DEL, stop at the end of the sequence; stepping
+        //  over the terminator would read memory that isn't ours.
+        for (k=0; (k<t->num) && (*s != 0); k++) {
+          e = RAND(3,0);
+          if      (*s=='A')  fprintf(stdout, "%c", let_3A[e]);
+          else if (*s=='C')  fprintf(stdout, "%c", let_3C[e]);
+          else if (*s=='G')  fprintf(stdout, "%c", let_3G[e]);
+          else if (*s=='T')  fprintf(stdout, "%c", let_3T[e]);
+          else               fprintf(stdout, "%c", 'A');
+          s++;
+        }
+        break;
+
+      case MOV:
+        for (k=0; (k<t->num) && (*s != 0); k++) {
+          fprintf(stdout, "%c", *s);
+          s++;
+        }
+        break;
+
+      default:
+        fprintf(stderr, "Unrecognized optype (%d).\n", t->optype);
+        break;
+    }
+  }
+  fprintf(stdout, "\n");
+}
+
+
+
+
+//  Simulate reads with errors from one sequence, writing them to stdout.
+//
+//    seq, hdr  the source sequence and its header
+//    len       length of seq
+//    N         number of pieces to pick from seq
+//    L         length of each piece; clamped to the range 0 .. len
+//    C         number of copies to make of each piece, each with its own
+//              random errors
+//    P         error rate; each copy gets about L*P random edits
+//
+//  Each edit is a deletion, insertion or substitution at a random position
+//  of the piece.
+//
+void
+simseq(char *seq, char *hdr, int len, int N, int L, int C, double P) {
+  Align_t        align;
+  int            i, j, k;
+  int            start;
+  EditScript_t  *s;
+
+  if ((seq == NULL) || (len < 0))
+    return;
+
+  //  A piece cannot be longer than the sequence; without this, the number
+  //  of possible start positions below is zero or negative.
+  if (L > len)
+    L = len;
+  if (L < 0)
+    L = 0;
+
+  for (i=0; i<N; i++) {
+    //  pick where a piece of length L starts (1-based)
+    start = RAND((len-L+1),1);
+
+    //  now create C copies of the piece, each with its own errors
+    for (j=0; j<C; j++) {
+
+      //  generate a 'trivial' script for the piece: copy all of it
+      align.offset = start;
+      align.len    = L;
+      align.script = new_script(MOV,L,NULL);
+
+      //  The loop body does not run when L is zero, so RAND(L,1) never
+      //  sees a zero modulus.
+      for (k=0; k<L*P; k++) {
+        int optype = RAND(3,0);
+        int pos    = RAND(L,1);
+
+        insert(&align, pos, optype);
+      }
+
+      print_simseq(seq, hdr, &align, P, i, j);
+
+      while (align.script) {
+        s = align.script;
+        align.script = s->next;
+        free(s);
+      }
+    }
+  }
+}
diff --git a/leaff/stats.C b/leaff/stats.C
new file mode 100644
index 0000000..7ff9418
--- /dev/null
+++ b/leaff/stats.C
@@ -0,0 +1,126 @@
+#include "bio++.H"
+#include "seqCache.H"
+
+#include <algorithm>
+
+using namespace std;
+
+
+//  Print length statistics for the sequences in 'filename' to stdout.
+//
+//  Two sets of numbers are reported: SPAN, using the full length of each
+//  sequence, and BASES, using the length without 'n' and 'N'.  For each,
+//  with the sequences sorted longest first, 'nX' is the length of the
+//  sequence at which the running total of lengths reaches X percent of the
+//  reference length, and the index is the 0-based position of that sequence
+//  in the sorted order.
+//
+//    refLen  reference length to use for both sets; if zero, the total
+//            length of the sequences is used
+//
+void
+stats(char *filename, uint64 refLen) {
+  seqCache  *F = new seqCache(filename);
+
+  bool    V[256];
+  for (uint32 i=0; i<256; i++)
+    V[i] = false;
+  V['n'] = true;
+  V['N'] = true;
+
+  uint32  numSeq = F->getNumberOfSequences();
+
+  uint64    Ss = 0;  //  actual length of span
+  uint64    Rs = 0;  //  reference length of span
+  uint32   *Ls = new uint32 [numSeq];
+
+  uint64    Sb = 0;  //  actual length of bases
+  uint64    Rb = 0;  //  reference length of bases
+  uint32   *Lb = new uint32 [numSeq];
+
+  for (uint32 i=0; i<numSeq; i++)
+    Ls[i] = Lb[i] = 0;
+
+  for (uint32 s=0; s<numSeq; s++) {
+    seqInCore  *S    = F->getSequenceInCore(s);
+    char       *seq  = S->sequence();
+    uint32      len  = S->sequenceLength();
+    uint32      span = len;
+    uint32      base = len;
+
+    //  Start at the first base, and index V[] with an unsigned value so a
+    //  byte above 127 cannot become a negative index.
+    for (uint32 pos=0; pos<len; pos++) {
+      if (V[(unsigned char)seq[pos]])
+        base--;
+    }
+
+    Ss += span;
+    Sb += base;
+
+    Ls[S->getIID()] = span;
+    Lb[S->getIID()] = base;
+
+    delete S;
+  }
+
+  if (refLen > 0) {
+    Rs = refLen;
+    Rb = refLen;
+  } else {
+    Rs = Ss;
+    Rb = Sb;
+  }
+
+  //  Longest first.
+
+  sort(Ls, Ls + numSeq);
+  sort(Lb, Lb + numSeq);
+
+  reverse(Ls, Ls + numSeq);
+  reverse(Lb, Lb + numSeq);
+
+  uint32  n50s[11] = {0};
+  uint32  l50s[11] = {0};
+
+  uint32  n50b[11] = {0};
+  uint32  l50b[11] = {0};
+
+  //  Thresholds and running totals are 64-bit, like the totals they are
+  //  derived from; they do not fit in 32 bits for inputs over 4Gbp.
+
+  uint64  sizes[11] = {0};
+  uint64  sizeb[11] = {0};
+
+  for (uint32 i=0; i<11; i++) {
+    sizes[i] = i * Rs / 10;
+    sizeb[i] = i * Rb / 10;
+  }
+
+  //  A single sequence can take the running total past several thresholds
+  //  at once, so record it for every threshold it reaches.
+
+  {
+    uint64  sum = 0;
+    uint32  n   = 1;
+
+    for (uint32 i=0; (i < numSeq) && (n < 11); i++) {
+      while ((n < 11) && (sizes[n] <= sum + Ls[i])) {
+        n50s[n] = Ls[i];
+        l50s[n] = i;
+        n++;
+      }
+
+      sum += Ls[i];
+    }
+  }
+
+  {
+    uint64  sum = 0;
+    uint32  n   = 1;
+
+    for (uint32 i=0; (i < numSeq) && (n < 11); i++) {
+      while ((n < 11) && (sizeb[n] <= sum + Lb[i])) {
+        n50b[n] = Lb[i];
+        l50b[n] = i;
+        n++;
+      }
+
+      sum += Lb[i];
+    }
+  }
+
+  //  With no sequences there is no smallest or largest; report zero.
+
+  uint32  minS = (numSeq > 0) ? Ls[numSeq-1] : 0;
+  uint32  maxS = (numSeq > 0) ? Ls[0]        : 0;
+  uint32  minB = (numSeq > 0) ? Lb[numSeq-1] : 0;
+  uint32  maxB = (numSeq > 0) ? Lb[0]        : 0;
+
+  fprintf(stdout, "%s\n", F->getSourceName());
+  fprintf(stdout, "\n");
+  fprintf(stdout, "numSeqs "uint32FMT"\n", numSeq);
+  fprintf(stdout, "\n");
+  fprintf(stdout, "SPAN (smallest "uint32FMT" largest "uint32FMT")\n", minS, maxS);
+  for (uint32 i=1; i<10; i++)
+    fprintf(stdout, "n"uint32FMT" "uint32FMT" at index "uint32FMT"\n", 10 * i, n50s[i], l50s[i]);
+  fprintf(stdout, "totLen "uint64FMTW(10)"\n", Ss);
+  fprintf(stdout, "refLen "uint64FMTW(10)"\n", Rs);
+  fprintf(stdout, "\n");
+  fprintf(stdout, "BASES (smallest "uint32FMT" largest "uint32FMT")\n", minB, maxB);
+  for (uint32 i=1; i<10; i++)
+    fprintf(stdout, "n"uint32FMT" "uint32FMT" at index "uint32FMT"\n", 10 * i, n50b[i], l50b[i]);
+  fprintf(stdout, "totLen "uint64FMTW(10)"\n", Sb);
+  fprintf(stdout, "refLen "uint64FMTW(10)"\n", Rb);
+
+  delete [] Ls;
+  delete [] Lb;
+  delete    F;
+}
diff --git a/libbio/Make.include b/libbio/Make.include
new file mode 100644
index 0000000..581274c
--- /dev/null
+++ b/libbio/Make.include
@@ -0,0 +1,65 @@
+# -*- makefile -*-
+
+# Build description for libbio.a.
+# NOTE(review): '$/' looks like the directory of this fragment, set by
+# whatever includes it -- confirm against the top-level Makefile.
+
+# Sibling library directories. Only LIBUTL/ is used below.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+
+# Sources and headers that make up the library.
+src := $/alphabet.c \
+ $/alphabet.h \
+ $/alphabet-acgtspace.c \
+ $/alphabet-colorspace.c \
+ $/bio++.H \
+ $/bio.h \
+ $/halign.c \
+ $/kmer.C \
+ $/kmer.H \
+ $/kmerhuge.H \
+ $/kmeriface.H \
+ $/kmertiny.H \
+ $/merCovering.H \
+ $/merList.H \
+ $/mers.h \
+ $/reversecomplement.c
+
+# Not referenced by anything else in this fragment.
+old := $/fasta-accessor.H \
+ $/fasta-c.C \
+ $/fasta-c.h \
+ $/fasta-cache.C \
+ $/fasta-cache.H \
+ $/fasta-simple.c \
+ $/fasta-simple.h \
+ $/fasta.C \
+ $/fasta.H \
+ $/merstream.C \
+ $/merstream.H \
+ $/seq.C \
+ $/seq.H \
+ $/seqFactory.H \
+ $/seqFile.H \
+ $/seqInCore.H \
+ $/seqOnDisk.H \
+ $/seqStore.H \
+ $/seqStream.H \
+ $/sff.H \
+ $/sff.C
+
+
+# Split ${src} by file extension: .c/.h are C, .C/.H are C++.
+$/.C_SRCS :=$(filter %.c,${src})
+$/.C_INCS :=$(filter %.h,${src})
+$/.CXX_SRCS :=$(filter %.C,${src})
+$/.CXX_INCS :=$(filter %.H,${src})
+$/.CXX_LIBS :=$/libbio.a
+
+$/.CLEAN := $/*.o
+
+# Everything compiled in this directory can include headers from libutil.
+$(eval $/%.d $/%.o: CFLAGS += -I${LIBUTL/})
+$(eval $/%.d $/%.o: CXXFLAGS += -I${LIBUTL/})
+
+# These need the generated alphabet.h before dependencies can be computed.
+$/reversecomplement.c.d: $/alphabet.h
+$/merstream.C.d: $/alphabet.h
+
+# alphabet.o is listed explicitly, in addition to the objects derived from
+# the source lists above.
+$/libbio.a: ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o} $/alphabet.o
+
+# alphabet.c and alphabet.h are generated: alphabet-generate.c (the first
+# prerequisite) is compiled to a.out in this directory, run from there, and
+# then removed.
+$/alphabet.c: $/alphabet.h
+$/alphabet.h: $/alphabet-generate.c $/alphabet-acgtspace.c $/alphabet-colorspace.c
+ $(CC) $(CFLAGS) $(CFLAGS_COMPILE) -o `dirname $@`/a.out $<
+ cd `dirname $@` ; ./a.out && rm -f ./a.out
diff --git a/libbio/alphabet-acgtspace.c b/libbio/alphabet-acgtspace.c
new file mode 100644
index 0000000..b8f73d2
--- /dev/null
+++ b/libbio/alphabet-acgtspace.c
@@ -0,0 +1,149 @@
+#include <stdio.h>
+#include <ctype.h>
+#include "alphabet.h"
+
+//  Fill in the symbol tables for sequences of bases:
+//
+//    whitespaceSymbol  1 for whitespace characters
+//    toLower, toUpper  case conversion
+//    letterToBits      2-bit code of a, c, g, t (either case) and of the
+//                      colors 0..3; 0xff for everything else
+//    bitsToLetter      the base for each 2-bit code; '?' otherwise
+//    bitsToColor       the color for each 2-bit code; '?' otherwise
+//    complementSymbol  the complement of each IUPAC nucleotide code, in the
+//                      same case; colors are their own complement; '?'
+//                      otherwise
+//    IUPACidentity     1 if two symbols match: a code matches itself and
+//                      every base it stands for, regardless of case and of
+//                      the order of the two symbols
+//
+void
+initCompressionTablesForACGTSpace(void) {
+
+  //  compTo[k] is the complement of compFrom[k].  S (g or c) and W (a or t)
+  //  are each their own complement.  U is only for RNA.
+  static const char compFrom[] = "ATUGCYRSWKMBDHVN";
+  static const char compTo[]   = "TAACGRYSWMKVHDBN";
+
+  //  The first letter of each entry is an IUPAC code, and it matches every
+  //  letter of the entry, itself included.
+  static const char *const iupac[] = {
+    "A", "C", "G", "T",
+    "MAC", "RAG", "WAT", "SCG", "YCT", "KGT",
+    "VACG", "HACT", "DAGT", "BCGT",
+    "NACGT"
+  };
+
+  int i, j;
+
+  for (i=0; i<256; i++) {
+    whitespaceSymbol[i] = isspace(i) ? 1 : 0;
+    toLower[i]          = tolower(i);
+    toUpper[i]          = toupper(i);
+    letterToBits[i]     = (unsigned char)0xff;
+    bitsToLetter[i]     = (unsigned char)'?';
+    bitsToColor[i]      = (unsigned char)'?';
+    complementSymbol[i] = (unsigned char)'?';
+  }
+
+  for (i=0; i<128; i++)
+    for (j=0; j<128; j++)
+      IUPACidentity[i][j] = 0;
+
+  letterToBits['a'] = letterToBits['A'] = (unsigned char)0x00;
+  letterToBits['c'] = letterToBits['C'] = (unsigned char)0x01;
+  letterToBits['g'] = letterToBits['G'] = (unsigned char)0x02;
+  letterToBits['t'] = letterToBits['T'] = (unsigned char)0x03;
+
+  letterToBits['0'] = (unsigned char)0x00;
+  letterToBits['1'] = (unsigned char)0x01;
+  letterToBits['2'] = (unsigned char)0x02;
+  letterToBits['3'] = (unsigned char)0x03;
+
+  bitsToLetter[0x00] = 'A';
+  bitsToLetter[0x01] = 'C';
+  bitsToLetter[0x02] = 'G';
+  bitsToLetter[0x03] = 'T';
+
+  bitsToColor[0x00] = '0';
+  bitsToColor[0x01] = '1';
+  bitsToColor[0x02] = '2';
+  bitsToColor[0x03] = '3';
+
+  for (i=0; compFrom[i] != 0; i++) {
+    complementSymbol[(unsigned char)compFrom[i]] = (unsigned char)compTo[i];
+    complementSymbol[tolower(compFrom[i])]       = (unsigned char)tolower(compTo[i]);
+  }
+
+  complementSymbol['0'] = '0';  //  ColorSpace is self-complementing
+  complementSymbol['1'] = '1';
+  complementSymbol['2'] = '2';
+  complementSymbol['3'] = '3';
+
+  for (i=0; i<(int)(sizeof(iupac) / sizeof(iupac[0])); i++)
+    for (j=0; iupac[i][j] != 0; j++)
+      IUPACidentity[(int)iupac[i][0]][(int)iupac[i][j]] = 1;
+
+  //  Order isn't important
+  //
+  for (i='A'; i<='Z'; i++)
+    for (j='A'; j<='Z'; j++) {
+      if (IUPACidentity[j][i])
+        IUPACidentity[i][j] = 1;
+    }
+
+  //  Case isn't important
+  //
+  for (i='A'; i<='Z'; i++)
+    for (j='A'; j<='Z'; j++) {
+      if (IUPACidentity[j][i]) {
+        IUPACidentity[tolower(i)][tolower(j)] = 1;
+        IUPACidentity[tolower(i)][j]          = 1;
+        IUPACidentity[i][tolower(j)]          = 1;
+      }
+    }
+}
+
+
+
+
+
+
diff --git a/libbio/alphabet-colorspace.c b/libbio/alphabet-colorspace.c
new file mode 100644
index 0000000..98d28f6
--- /dev/null
+++ b/libbio/alphabet-colorspace.c
@@ -0,0 +1,121 @@
+#include <stdio.h>
+#include <ctype.h>
+#include "alphabet.h"
+
+//  Fill in baseToColor[][], which serves three purposes depending on what
+//  the two indices are:
+//
+//    [base][base]    the color of two adjacent bases
+//    [color][color]  the composition of two colors
+//    [base][color]   the base that follows 'base' when the color is 'color'
+//
+//  Colors are '0' through '3', and '4' for anything involving an unknown
+//  base.  Entries that are never set hold '.'.
+//
+//  With the bases numbered a=0, c=1, g=2, t=3, the color of two bases is
+//  the exclusive-or of their numbers; composing colors and decoding a color
+//  follow the same rule.
+//
+void
+initCompressionTablesForColorSpace(void) {
+  static const char bases[] = "acgt";
+  int row, col, x, y;
+
+  for (row=0; row<128; row++)
+    for (col=0; col<128; col++)
+      baseToColor[row][col] = '.';  //  Invalid
+
+  //  Supports transforming a base sequence to a color sequence.
+
+  //  Not sure how valid this is; treat every letter like it's a gap.
+  //  We then override ACGT to be the correct encoding.
+
+  for (col='a'; col<='z'; col++) {
+    for (x=0; x<4; x++)
+      baseToColor[(int)bases[x]][col] = '4';
+    baseToColor['n'][col] = '4';
+  }
+
+  for (row='a'; row<='z'; row++) {
+    for (y=0; y<4; y++)
+      baseToColor[row][(int)bases[y]] = '0' + y;
+    baseToColor[row]['n'] = '4';
+  }
+
+  for (x=0; x<4; x++) {
+    for (y=0; y<4; y++)
+      baseToColor[(int)bases[x]][(int)bases[y]] = '0' + (x ^ y);
+    baseToColor[(int)bases[x]]['n'] = '4';
+  }
+
+  //  Copy the lowercase entries to every combination of cases.
+
+  for (row='a'; row<='z'; row++)
+    for (col='a'; col<='z'; col++) {
+      unsigned char color = baseToColor[row][col];
+
+      baseToColor[toupper(row)][toupper(col)] = color;
+      baseToColor[row][toupper(col)]          = color;
+      baseToColor[toupper(row)][col]          = color;
+    }
+
+  //  Supports composing colors
+
+  for (x=0; x<4; x++) {
+    for (y=0; y<4; y++)
+      baseToColor['0' + x]['0' + y] = '0' + (x ^ y);
+    baseToColor['0' + x]['4'] = '4';
+  }
+
+  //  Supports transforming color sequence to base sequence.
+
+  for (x=0; x<4; x++) {
+    int lower = bases[x];
+    int upper = toupper(lower);
+
+    for (y=0; y<4; y++)
+      baseToColor[lower]['0' + y] = baseToColor[upper]['0' + y] = bases[x ^ y];
+
+    baseToColor[lower]['4'] = baseToColor[upper]['4'] = 'n';
+  }
+
+  //  An unknown base is decoded as if it were 'a'.
+
+  for (y=0; y<4; y++)
+    baseToColor['n']['0' + y] = baseToColor['N']['0' + y] = bases[y];
+
+  baseToColor['n']['4'] = baseToColor['N']['4'] = 'n';
+}
diff --git a/libbio/alphabet-generate.c b/libbio/alphabet-generate.c
new file mode 100644
index 0000000..20b2dc2
--- /dev/null
+++ b/libbio/alphabet-generate.c
@@ -0,0 +1,134 @@
+#include <stdio.h>
+#include <ctype.h>
+
+// Instead of forcing client applications to explicitly call
+// initCompressionTables(), static tables are now generated.
+
+// Per-character tables, indexed by character code.
+unsigned char whitespaceSymbol[256];
+unsigned char toLower[256];
+unsigned char toUpper[256];
+
+// Conversion between symbols and 2-bit codes.
+unsigned char letterToBits[256];
+unsigned char bitsToLetter[256];
+unsigned char bitsToColor[256];
+
+unsigned char complementSymbol[256];
+// NOTE(review): validCompressedSymbol is defined here but main() below
+// never writes it to the generated files.
+unsigned char validCompressedSymbol[256];
+
+// Tables indexed by a pair of 7-bit characters.
+unsigned char IUPACidentity[128][128];
+unsigned char baseToColor[128][128];
+
+// Defined in the two files included below; they fill in the tables above.
+void initCompressionTablesForACGTSpace(void);
+void initCompressionTablesForColorSpace(void);
+
+#include "alphabet-acgtspace.c"
+#include "alphabet-colorspace.c"
+
+//  Emit one 256-entry table: its extern declaration to H, and its
+//  definition, with all values in decimal, to C.
+//
+static
+void
+emitTable256(FILE *H, FILE *C, const char *name, const unsigned char *table) {
+  int i;
+
+  fprintf(H, "extern unsigned char %s[256];\n", name);
+  fprintf(C, "unsigned char %s[256] = { %d", name, table[0]);
+  for (i=1; i<256; i++)
+    fprintf(C, ",%d", table[i]);
+  fprintf(C, " };\n");
+}
+
+
+//  Build the symbol tables, then write them as static data: definitions go
+//  to alphabet.c and declarations to alphabet.h, both in the current
+//  directory.  Returns 0 on success, 1 if either file cannot be written.
+//
+int
+main(int argc, char **argv) {
+  int i, j;
+
+  FILE *C = fopen("alphabet.c", "w");
+  if (C == NULL) {
+    perror("alphabet.c");
+    return(1);
+  }
+
+  FILE *H = fopen("alphabet.h", "w");
+  if (H == NULL) {
+    perror("alphabet.h");
+    fclose(C);
+    return(1);
+  }
+
+  initCompressionTablesForACGTSpace();
+  initCompressionTablesForColorSpace();
+
+  fprintf(H, "//\n");
+  fprintf(H, "// Automagically generated -- DO NOT EDIT!\n");
+  fprintf(H, "// See libbri/alphabet-generate.c for details.\n");
+  fprintf(H, "//\n");
+  fprintf(H, "\n");
+  fprintf(H, "#ifdef __cplusplus\n");
+  fprintf(H, "extern \"C\" {\n");
+  fprintf(H, "#endif\n");
+  fprintf(H, "\n");
+
+  fprintf(C, "//\n");
+  fprintf(C, "// Automagically generated -- DO NOT EDIT!\n");
+  fprintf(C, "// See %s for details.\n", __FILE__);
+  fprintf(C, "//\n");
+
+  emitTable256(H, C, "whitespaceSymbol", whitespaceSymbol);
+  emitTable256(H, C, "toLower",          toLower);
+  emitTable256(H, C, "toUpper",          toUpper);
+  emitTable256(H, C, "letterToBits",     letterToBits);
+  emitTable256(H, C, "bitsToLetter",     bitsToLetter);
+  emitTable256(H, C, "bitsToColor",      bitsToColor);
+  emitTable256(H, C, "complementSymbol", complementSymbol);
+
+  //  IUPACidentity is written as flags: any nonzero entry becomes 1.
+
+  fprintf(H, "extern unsigned char IUPACidentity[128][128];\n");
+  fprintf(C, "unsigned char IUPACidentity[128][128] = {\n");
+  for (i=0; i<128; i++) {
+    fprintf(C, " {");
+    for (j=0; j<128; j++)
+      fprintf(C, "%s%d", (j == 0) ? "" : ",", IUPACidentity[i][j] ? 1 : 0);
+    fprintf(C, "},\n");
+  }
+  fprintf(C, "};\n");
+
+
+  fprintf(H, "extern unsigned char baseToColor[128][128];\n");
+  fprintf(C, "unsigned char baseToColor[128][128] = {\n");
+  for (i=0; i<128; i++) {
+    fprintf(C, " {%d", baseToColor[i][0]);
+    for (j=1; j<128; j++)
+      fprintf(C, ",%d", baseToColor[i][j]);
+    fprintf(C, "},\n");
+  }
+  fprintf(C, "};\n");
+
+
+  fprintf(H, "\n");
+  fprintf(H, "void initCompressionTablesForACGTSpace(void);\n");
+  fprintf(H, "void initCompressionTablesForColorSpace(void);\n");
+
+  fprintf(H, "\n");
+  fprintf(H, "#ifdef __cplusplus\n");
+  fprintf(H, "}\n");
+  fprintf(H, "#endif\n");
+
+  //  Closing flushes the output; a failure here means a file is incomplete.
+
+  int failed = 0;
+
+  if (fclose(C) != 0) {
+    perror("alphabet.c");
+    failed = 1;
+  }
+  if (fclose(H) != 0) {
+    perror("alphabet.h");
+    failed = 1;
+  }
+
+  return(failed);
+}
diff --git a/libbio/alphabet.c b/libbio/alphabet.c
new file mode 100644
index 0000000..2fbe1e0
--- /dev/null
+++ b/libbio/alphabet.c
@@ -0,0 +1,271 @@
+//
+// Automagically generated -- DO NOT EDIT!
+// See ../kmer/libbio/alphabet-generate.c for details.
+//
+unsigned char whitespaceSymbol[256] = { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, [...]
+unsigned char toLower[256] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,13 [...]
+unsigned char toUpper[256] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147, [...]
+unsigned char letterToBits[256] = { 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,1,2,3,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,1,255,255,255,2,255,255,255,255,255,255,255,255,255,255,255,255,3,255,255,255,255,255,255,255,255,255,255,255,255,0,255,1,255,255,255,2,255,255,255,255,255,255,255,255,255,255,255,255,3,255,255,25 [...]
+unsigned char bitsToLetter[256] = { 65,67,71,84,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,6 [...]
+unsigned char bitsToColor[256] = { 48,49,50,51,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63 [...]
+unsigned char complementSymbol[256] = { 63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,48,49,50,51,63,63,63,63,63,63,63,63,63,63,63,63,63,84,86,71,72,63,63,67,68,63,63,77,63,75,78,63,63,63,89,87,65,65,66,83,63,82,63,63,63,63,63,63,63,116,118,103,104,63,63,99,100,63,63,109,63,107,110,63,63,63,121,119,97,97,98,115,63,114,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63, [...]
+unsigned char IUPACidentity[128][128] = {
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+};
+unsigned char baseToColor[128][128] = {
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,49,50,51,52,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,49,48,51,50,52,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,50,51,48,49,52,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,51,50,49,48,52,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,97,99,103,116,110,46,46,46,46,46,46,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,99,97,116,103,110,46,46,46,46,46,46,46,46,46,46,46,46,49,52,48,52,52,52,51,52,52,52,52,52,52,52,52,52,52,52,52,50,52,52,52,52,52,52,46,46,46,46,46,46,49,52,48,52,52,52,51,52,52,52,52,52,52,52,52,52,52,52,52,50,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,103,116,97,99,110,46,46,46,46,46,46,46,46,46,46,46,46,50,52,51,52,52,52,48,52,52,52,52,52,52,52,52,52,52,52,52,49,52,52,52,52,52,52,46,46,46,46,46,46,50,52,51,52,52,52,48,52,52,52,52,52,52,52,52,52,52,52,52,49,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,97,99,103,116,110,46,46,46,46,46,46,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,116,103,99,97,110,46,46,46,46,46,46,46,46,46,46,46,46,51,52,50,52,52,52,49,52,52,52,52,52,52,52,52,52,52,52,52,48,52,52,52,52,52,52,46,46,46,46,46,46,51,52,50,52,52,52,49,52,52,52,52,52,52,52,52,52,52,52,52,48,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,97,99,103,116,110,46,46,46,46,46,46,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,99,97,116,103,110,46,46,46,46,46,46,46,46,46,46,46,46,49,52,48,52,52,52,51,52,52,52,52,52,52,52,52,52,52,52,52,50,52,52,52,52,52,52,46,46,46,46,46,46,49,52,48,52,52,52,51,52,52,52,52,52,52,52,52,52,52,52,52,50,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,103,116,97,99,110,46,46,46,46,46,46,46,46,46,46,46,46,50,52,51,52,52,52,48,52,52,52,52,52,52,52,52,52,52,52,52,49,52,52,52,52,52,52,46,46,46,46,46,46,50,52,51,52,52,52,48,52,52,52,52,52,52,52,52,52,52,52,52,49,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,97,99,103,116,110,46,46,46,46,46,46,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46,46,48,52,49,52,52,52,50,52,52,52,52,52,52,52,52,52,52,52,52,51,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,116,103,99,97,110,46,46,46,46,46,46,46,46,46,46,46,46,51,52,50,52,52,52,49,52,52,52,52,52,52,52,52,52,52,52,52,48,52,52,52,52,52,52,46,46,46,46,46,46,51,52,50,52,52,52,49,52,52,52,52,52,52,52,52,52,52,52,52,48,52,52,52,52,52,52,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46,46,48,46,49,46,46,46,50,46,46,46,46,46,46,52,46,46,46,46,46,51,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+ {46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46},
+};
diff --git a/libbio/alphabet.h b/libbio/alphabet.h
new file mode 100644
index 0000000..4243a9a
--- /dev/null
+++ b/libbio/alphabet.h
@@ -0,0 +1,25 @@
+//
+// Automagically generated -- DO NOT EDIT!
+// See libbri/alphabet-generate.c for details.
+//
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned char whitespaceSymbol[256];
+extern unsigned char toLower[256];
+extern unsigned char toUpper[256];
+extern unsigned char letterToBits[256];
+extern unsigned char bitsToLetter[256];
+extern unsigned char bitsToColor[256];
+extern unsigned char complementSymbol[256];
+extern unsigned char IUPACidentity[128][128];
+extern unsigned char baseToColor[128][128];
+
+void initCompressionTablesForACGTSpace(void);
+void initCompressionTablesForColorSpace(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/libbio/bio++.H b/libbio/bio++.H
new file mode 100644
index 0000000..ceeca6c
--- /dev/null
+++ b/libbio/bio++.H
@@ -0,0 +1,23 @@
+#ifndef BIO_PLUS_PLUS_H
+#define BIO_PLUS_PLUS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "bio.h"
+#include "util++.H"
+
+#include "mers.h"
+#include "kmer.H"
+#include "merCovering.H"
+#include "merList.H"
+
+#endif // BIO_PLUS_PLUS_H
+
diff --git a/libbio/bio.h b/libbio/bio.h
new file mode 100644
index 0000000..799efec
--- /dev/null
+++ b/libbio/bio.h
@@ -0,0 +1,47 @@
+#ifndef BIO_H
+#define BIO_H
+
+#include "util.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+////////////////////////////////////////
+//
+// alphabet
+//
+#include "alphabet.h"
+
+
+////////////////////////////////////////
+//
+// reversecomplement.c
+//
+char *reverseComplementSequence(char *seq, uint32 seqlen);
+char *reverseString(char *seq, uint32 seqlen);
+
+
+// halign
+//
+// N.B. align() (aka halign) was switched over to palloc() -- this
+// fixed any memory leaks, and gives a 30%-ish speed increase. This
+// is thread safe (unless someone breaks palloc2()).
+//
+void
+halign(const char *string1,
+ const char *string2,
+ const int len1,
+ const int len2,
+ char *alnline1,
+ char *alnline2);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // BIO_H
diff --git a/libbio/halign.c b/libbio/halign.c
new file mode 100644
index 0000000..86aeb89
--- /dev/null
+++ b/libbio/halign.c
@@ -0,0 +1,467 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <limits.h>
+
+#include "bio.h"
+
+// Liliana Florea's halign (a sim4 derivative).
+
+#define DEL 0 /* consume one character of seq1; halign() prints '-' on the seq2 line */
+#define INS 1 /* consume one character of seq2; halign() prints '-' on the seq1 line */
+#define SUB 2 /* consume one character of each sequence (match or mismatch) */
+
+#ifdef min
+#undef min
+#endif
+#define min(x,y) ((x)<=(y) ? (x):(y)) /* function-like macro: the selected argument is evaluated twice */
+
+#ifdef max
+#undef max
+#endif
+#define max(x,y) ((x)>=(y) ? (x):(y)) /* function-like macro: the selected argument is evaluated twice */
+
+
+typedef struct edit_script { /* one run of identical operations in a singly linked edit script */
+ int op_type; /* SUB, INS or DEL */
+ int num; /* Number of operations */
+ struct edit_script *next; /* following run, or NULL at the end of the script */
+} edit_script;
+
+typedef struct edit_script_list { /* declared here but not used by any function in halign.c */
+ int offset1, offset2;
+ int len1, len2;
+ int score;
+ int first;
+ edit_script *script;
+} edit_script_list;
+
+
+
+// Extend a run of matching characters forward along diagonal k, where
+// the position in seq2 is y = x + k.
+//
+// Starting at seq1[x] / seq2[x+k], advance while x < endx, y < endy and
+// the two characters are equal after mapping through toUpper[] (so the
+// comparison ignores case).  Returns the first x that was not matched.
+// A negative x is returned unchanged without reading either sequence;
+// the callers in this file use negative values to mark diagonals that
+// have not been reached.
+//
+// The characters are converted to unsigned char before they index
+// toUpper[].  Plain char may be signed, and a byte >= 0x80 would
+// otherwise give a negative subscript into the 256-entry table.
+//
+static
+int
+snake(const char *seq1, const char *seq2, int k, int x, int endx, int endy) {
+  int y;
+
+  if (x < 0)
+    return x;
+
+  y = x + k;
+
+  while ((x < endx) &&
+         (y < endy) &&
+         (toUpper[(unsigned char)seq1[x]] == toUpper[(unsigned char)seq2[y]])) {
+    ++x;
+    ++y;
+  }
+
+  return x;
+}
+
+
+// Extend a run of matching characters backward along diagonal k, where
+// the position in seq2 is y = x + k.
+//
+// Starting just before (x, y), step back while x > startx, y > starty
+// and seq1[x-1] equals seq2[y-1] after mapping through toUpper[].
+// Returns the smallest x reached.  An x greater than M is returned
+// unchanged without reading either sequence.
+//
+// M and N are only used for sanity checks (halign() passes the two
+// sequence lengths).  A negative start coordinate, or a y beyond N, is
+// reported on stderr but the scan still proceeds, exactly as before.
+//
+// The characters are converted to unsigned char before they index
+// toUpper[].  Plain char may be signed, and a byte >= 0x80 would
+// otherwise give a negative subscript into the 256-entry table.
+//
+static
+int
+rsnake(const char *seq1, const char *seq2, int k, int x, int startx, int starty, int M, int N) {
+  int y;
+
+  if (x > M)
+    return x;
+
+  if ((startx<0) || (starty<0))
+    fprintf(stderr, "halign::rsnake()-- TROUBLE!!! startx: %5d, starty: %5d\n",startx, starty);
+
+  // x <= M is guaranteed by the early return above, so only y needs checking.
+  if (x+k > N)
+    fprintf(stderr, "halign::rsnake()-- TROUBLE!!! x: %5d, y: %5d\n",x,x+k);
+
+  y = x + k;
+
+  while ((x > startx) &&
+         (y > starty) &&
+         (toUpper[(unsigned char)seq1[x-1]] == toUpper[(unsigned char)seq2[y-1]])) {
+    --x;
+    --y;
+  }
+
+  return x;
+}
+
+
+// Count the edit steps needed to get from (i1,j1) to (i2,j2), where i
+// indexes seq1 and j indexes seq2, trying at most 'limit' steps.
+//
+// For every diagonal d = j - i inside the band, reach[d] holds the
+// furthest seq1 position reached so far on that diagonal; INT_MIN marks
+// a diagonal that has not been reached.  Each round adds one step
+// (substitute, delete or insert) and then slides along matching
+// characters with snake().
+//
+// Returns 0 if the initial slide already reaches i2 on the goal
+// diagonal, the round number at which it is first reached otherwise,
+// or -1 if 'limit' rounds were not enough.  If the goal diagonal lies
+// outside the band, two messages are printed to stderr and the process
+// exits with status 1.
+//
+// Both work vectors come from palloc2() on handle 'ph' and are not
+// released here.
+//
+static
+int
+align_get_dist(const char *seq1,
+               const char *seq2,
+               int i1, int j1,
+               int i2, int j2,
+               int limit,
+               void *ph) {
+  int  *reach, *next;
+  int   cost, d;
+  int   first, last;
+
+  // Band of diagonals that can be visited at all.
+  int   diagBegin = j1 - i1;
+  int   diagLo    = max(j1-i2, diagBegin-limit);
+  int   diagHi    = min(j2-i1, diagBegin+limit);
+  int   diagGoal  = j2 - i2;
+
+  if ((diagGoal < diagLo) || (diagGoal > diagHi)) {
+    fprintf(stderr, "The two sequences are not really similar.\n");
+    fprintf(stderr, "Please try an exact aligning method.\n");
+    exit(1);
+  }
+
+  // Both vectors are offset so they can be indexed directly by diagonal.
+  reach = (int *)palloc2((diagHi-diagLo+1)*sizeof(int), ph) - diagLo;
+  next  = (int *)palloc2((diagHi-diagLo+1)*sizeof(int), ph) - diagLo;
+
+  for (d = diagLo; d <= diagHi; ++d)
+    reach[d] = INT_MIN;
+
+  reach[diagBegin] = snake(seq1, seq2, diagBegin, i1, i2, j2);
+
+  if (reach[diagGoal] >= i2)
+    return 0;
+
+  for (cost = 1; cost <= limit; ++cost) {
+    first = max(diagLo, diagBegin-cost);
+    last  = min(diagHi, diagBegin+cost);
+
+    for (d = first; d <= last; ++d) {
+      int  from;
+
+      if (d == first) {
+        // Lowest diagonal of this round: only a DELETE from d+1 gets here.
+        from = reach[d+1] + 1;
+      } else if (d == last) {
+        // Highest diagonal of this round: only an INSERT from d-1 gets here.
+        from = reach[d-1];
+      } else {
+        int  viaSub = reach[d]   + 1;
+        int  viaDel = reach[d+1] + 1;
+        int  viaIns = reach[d-1];
+
+        // Furthest row wins; ties go to SUBSTITUTE, then DELETE, then INSERT.
+        if ((viaSub >= viaDel) && (viaSub >= viaIns))
+          from = viaSub;
+        else if ((viaDel >= viaIns) && (viaDel >= viaSub))
+          from = viaDel;
+        else
+          from = viaIns;
+      }
+
+      next[d] = snake(seq1, seq2, d, from, i2, j2);
+    }
+
+    // Only the diagonals touched in this round are carried over.
+    for (d = first; d <= last; ++d)
+      reach[d] = next[d];
+
+    if (reach[diagGoal] >= i2)
+      return cost;
+  }
+
+  // Ran out of distance limit.
+  return -1;
+}
+
+
+// Allocate one edit_script node from the palloc2() pool 'ph'.  The node
+// describes 'num' operations of kind 'op_type' and has no successor.
+//
+static
+edit_script *
+new_edit_op(int op_type, int num, void *ph) {
+  edit_script *op = (edit_script *)palloc2(sizeof(edit_script), ph);
+
+  op->op_type = op_type;
+  op->num     = num;
+  op->next    = NULL;
+
+  return op;
+}
+
+
+// Append 'op' to the script described by *head / *tail.  An empty
+// script (*head == NULL) becomes the single node 'op'.
+//
+static
+void
+append_edit_op(edit_script **head, edit_script **tail, edit_script *op) {
+  if (*head)
+    (*tail)->next = op;
+  else
+    *head = op;
+
+  *tail = op;
+}
+
+
+// Build the edit script that takes seq1[i1..i2) to seq2[j1..j2).
+//
+//   dist        number of edit steps between the two end points, as
+//               computed by align_get_dist() (halign() passes its result)
+//   head, tail  outputs: first and last node of the script; both are
+//               NULL when the script is empty
+//   M, N        handed through to rsnake() for its sanity checks
+//               (halign() passes the two sequence lengths)
+//   ph          palloc2() handle used for every allocation made here
+//
+// Empty ranges and dist <= 1 are solved directly.  Otherwise the
+// distance is split in two: a forward pass of midc steps from (i1,j1)
+// and a backward pass of rmidc steps from (i2,j2) are run, a diagonal
+// on which the two passes meet gives the midpoint (mi,mj), and the two
+// halves are solved recursively and joined.
+//
+// If no midpoint is found, a warning is printed and an empty script
+// (*head == *tail == NULL) is returned.  The sub-script pointers are
+// initialized to NULL so that this path no longer reads indeterminate
+// values when setting *tail.
+//
+static
+void
+align_path(const char *seq1,
+           const char *seq2,
+           int i1, int j1,
+           int i2, int j2,
+           int dist,
+           edit_script **head,
+           edit_script **tail,
+           int M,
+           int N,
+           void *ph) {
+
+  int *last_d, *temp_d;     /* forward vectors */
+  int *rlast_d, *rtemp_d;   /* backward vectors */
+
+  edit_script *head1 = NULL, *tail1 = NULL;
+  edit_script *head2 = NULL, *tail2 = NULL;
+  int midc, rmidc;
+  int start;
+  int lower, upper;
+  int rstart, rlower, rupper;
+  int c, k, row;
+  int mi, mj, tmp, ll, uu;
+  int found;
+
+  *head = *tail = NULL;
+
+  /* Boundary cases */
+  if (i1 == i2) {
+    /* Nothing left in seq1: the rest of seq2, if any, is one insertion. */
+    if (j1 != j2)
+      append_edit_op(head, tail, new_edit_op(INS, j2-j1, ph));
+    return;
+  }
+
+  if (j1 == j2) {
+    /* Nothing left in seq2: the rest of seq1 is one deletion. */
+    append_edit_op(head, tail, new_edit_op(DEL, i2-i1, ph));
+    return;
+  }
+
+  if (dist <= 1) {
+    start = j1-i1;
+
+    if (j2-i2 == j1-i1) {
+      /* Same diagonal: substitutions only. */
+      append_edit_op(head, tail, new_edit_op(SUB, i2-i1, ph));
+
+    } else if (j2-j1 == i2-i1+1) {
+      /* seq2 is one longer: matches, a single insertion, matches. */
+      tmp = snake(seq1,seq2,start,i1,i2,j2);
+
+      if (tmp > i1)
+        append_edit_op(head, tail, new_edit_op(SUB, tmp-i1, ph));
+
+      append_edit_op(head, tail, new_edit_op(INS, 1, ph));
+
+      if (i2 != tmp)
+        append_edit_op(head, tail, new_edit_op(SUB, i2-tmp, ph));
+
+    } else if (j2-j1+1 == i2-i1) {
+      /* seq1 is one longer: matches, a single deletion, matches. */
+      tmp = snake(seq1,seq2,start,i1,i2,j2);
+
+      if (tmp > i1)
+        append_edit_op(head, tail, new_edit_op(SUB, tmp-i1, ph));
+
+      append_edit_op(head, tail, new_edit_op(DEL, 1, ph));
+
+      if (i2 > tmp+1)
+        append_edit_op(head, tail, new_edit_op(SUB, i2-tmp-1, ph));
+
+    } else {
+      fprintf(stderr, "halign::align_path()-- warning: something wrong when aligning.\n");
+    }
+    return;
+  }
+
+  /* Divide the problem at the middle cost */
+  midc  = dist/2;
+  rmidc = dist - midc;
+
+  /* Compute the boundary diagonals */
+  start  = j1 - i1;
+  lower  = max(j1-i2, start-midc);
+  upper  = min(j2-i1, start+midc);
+  rstart = j2-i2;
+  rlower = max(j1-i2, rstart-rmidc);
+  rupper = min(j2-i1, rstart+rmidc);
+
+  /* Allocate space for forward vectors, indexed directly by diagonal */
+  last_d = (int *)palloc2((upper-lower+1)*sizeof(int), ph) - lower;
+  temp_d = (int *)palloc2((upper-lower+1)*sizeof(int), ph) - lower;
+
+  for (k=lower; k<=upper; k++)
+    last_d[k] = -1;
+  last_d[start] = snake(seq1,seq2,start,i1,i2,j2);
+
+  /* Forward computation */
+  for (c=1; c<=midc; ++c) {
+    ll = max(lower,start-c);
+    uu = min(upper,start+c);
+
+    for (k=ll; k<=uu; ++k) {
+      if (k == ll) {
+        /* DELETE : down from (k+1,c-1) */
+        row = last_d[k+1]+1;
+      } else if (k == uu) {
+        /* INSERT : right from (k-1,c-1) */
+        row = last_d[k-1];
+      } else if ((last_d[k]>=last_d[k+1]) &&
+                 (last_d[k]+1>=last_d[k-1])) {
+        /* SUBSTITUTE */
+        row = last_d[k]+1;
+      } else if ((last_d[k+1]+1>=last_d[k-1]) &&
+                 (last_d[k+1]>=last_d[k])) {
+        /* DELETE */
+        row = last_d[k+1]+1;
+      } else {
+        /* INSERT */
+        row = last_d[k-1];
+      }
+
+      temp_d[k] = snake(seq1,seq2,k,row,i2,j2);
+    }
+
+    for (k=ll; k<=uu; ++k)
+      last_d[k] = temp_d[k];
+  }
+
+  /* Allocate space for backward vectors, indexed directly by diagonal */
+  rlast_d = (int *)palloc2((rupper-rlower+1)*sizeof(int), ph) - rlower;
+  rtemp_d = (int *)palloc2((rupper-rlower+1)*sizeof(int), ph) - rlower;
+
+  for (k=rlower; k<=rupper; k++)
+    rlast_d[k] = i2+1;
+  rlast_d[rstart] = rsnake(seq1,seq2,rstart,i2,i1,j1,M,N);
+
+  /* Backward computation */
+  for (c=1; c<=rmidc; ++c) {
+    ll = max(rlower,rstart-c);
+    uu = min(rupper,rstart+c);
+
+    for (k=ll; k<=uu; ++k) {
+      if (k == ll) {
+        /* INSERT : left from (k+1,c-1) */
+        row = rlast_d[k+1];
+      } else if (k == uu) {
+        /* DELETE : up from (k-1,c-1) */
+        row = rlast_d[k-1]-1;
+      } else if ((rlast_d[k]-1<=rlast_d[k+1]) &&
+                 (rlast_d[k]-1<=rlast_d[k-1]-1)) {
+        /* SUBSTITUTE */
+        row = rlast_d[k]-1;
+      } else if ((rlast_d[k-1]-1<=rlast_d[k+1]) &&
+                 (rlast_d[k-1]-1<=rlast_d[k]-1)) {
+        /* DELETE */
+        row = rlast_d[k-1]-1;
+      } else {
+        /* INSERT */
+        row = rlast_d[k+1];
+      }
+
+      rtemp_d[k] = rsnake(seq1,seq2,k,row,i1,j1,M,N);
+    }
+
+    for (k=ll; k<=uu; ++k)
+      rlast_d[k] = rtemp_d[k];
+  }
+
+  /* Find (mi, mj) such that the distance from (i1, j1) to (mi, mj) is
+     midc and the distance from (mi, mj) to (i2, j2) is rmidc.  The first
+     diagonal on which the forward pass has reached the backward pass is
+     used.
+  */
+
+  found = 0;
+  mi = i1;
+  mj = j1;
+  ll = max(lower,rlower);
+  uu = min(upper,rupper);
+
+  for (k=ll; k<=uu; ++k) {
+    if (last_d[k]>=rlast_d[k]) {
+      if (last_d[k]-i1>=i2-rlast_d[k])
+        mi = last_d[k];
+      else
+        mi = rlast_d[k];
+      mj = k+mi;
+
+      found = 1;
+      break;
+    }
+  }
+
+  if (found) {
+    /* Find a path from (i1,j1) to (mi,mj) */
+    align_path(seq1,seq2,i1,j1,mi,mj,midc,&head1,&tail1,M,N,ph);
+
+    /* Find a path from (mi,mj) to (i2,j2) */
+    align_path(seq1,seq2,mi,mj,i2,j2,rmidc,&head2,&tail2,M,N,ph);
+
+    /* Join these two paths together */
+    if (head1)
+      tail1->next = head2;
+    else
+      head1 = head2;
+  } else {
+    /* head1, tail1, head2 and tail2 are all still NULL here. */
+    fprintf(stderr, "halign::align_path()-- warning: something wrong when dividing\n");
+  }
+
+  *head = head1;
+  *tail = (head2) ? tail2 : tail1;
+}
+
+
+
+
+
+// Align seq1 (len1 characters) against seq2 (len2 characters) and write
+// the alignment as two NUL-terminated lines of equal length.
+//
+// In the output, a position where the two sequences agree (ignoring
+// case, via toUpper[]) is written in lower case on both lines, a
+// mismatch is written in upper case on both lines, and a character
+// present in only one sequence is written in upper case opposite a '-'
+// on the other line.
+//
+// alnline1 and alnline2 carry no size; the caller must supply buffers
+// large enough for the whole alignment plus the terminating NUL.
+//
+// All working memory is taken from a private palloc handle that is
+// released before returning.  An edit script containing an unknown
+// operation is reported on stderr and ends the process with a non-zero
+// status (the status used to be 0, which reported success).
+//
+// Characters are converted to unsigned char before they index
+// toUpper[] / toLower[].  Plain char may be signed, and a byte >= 0x80
+// would otherwise give a negative subscript into the 256-entry tables.
+//
+void
+halign(const char *seq1,
+       const char *seq2,
+       const int len1,
+       const int len2,
+       char *alnline1,
+       char *alnline2) {
+  edit_script *head, *tail, *tp;
+  int i, dist;
+  void *ph;
+
+  ph = pallochandle(0);
+
+  /* The limit len1+len2 is what the original call passed. */
+  dist = align_get_dist(seq1, seq2, 0, 0, len1, len2, len1+len2, ph);
+
+  align_path(seq1, seq2,
+             0, 0,
+             len1, len2,
+             dist,
+             &head, &tail,
+             len1, len2,
+             ph);
+
+  /* generate the alignment(s) */
+
+  *alnline1 = 0;
+  *alnline2 = 0;
+
+  for (tp=head; tp; tp=tp->next) {
+    switch (tp->op_type) {
+      case SUB:
+        for (i=0; i<tp->num; i++) {
+          unsigned char a = (unsigned char)*seq1++;
+          unsigned char b = (unsigned char)*seq2++;
+
+          if (toUpper[a] == toUpper[b]) {
+            *alnline1++ = toLower[a];
+            *alnline2++ = toLower[b];
+          } else {
+            *alnline1++ = toUpper[a];
+            *alnline2++ = toUpper[b];
+          }
+        }
+        break;
+
+      case INS:
+        for (i=0; i<tp->num; i++) {
+          *alnline1++ = '-';
+          *alnline2++ = toUpper[(unsigned char)*seq2++];
+        }
+        break;
+
+      case DEL:
+        for (i=0; i<tp->num; i++) {
+          *alnline2++ = '-';
+          *alnline1++ = toUpper[(unsigned char)*seq1++];
+        }
+        break;
+
+      default:
+        fprintf(stderr, "halign::halign()-- unrecognized op_type in script. %d\n", tp->op_type);
+        exit(1);
+    }
+  }
+
+  *alnline1 = 0;
+  *alnline2 = 0;
+
+  pfree2(ph);
+  pfreehandle(ph);
+}
diff --git a/libbio/kmer.C b/libbio/kmer.C
new file mode 100644
index 0000000..104f001
--- /dev/null
+++ b/libbio/kmer.C
@@ -0,0 +1,497 @@
+#include "kmer.H"
+
+//  Construct a builder for one of three mer styles:
+//
+//    ms only   -- contiguous mers of ms bases             (style 0)
+//    ms and cm -- homopolymer-compressed mers; cm must be
+//                 smaller than ms                          (style 1)
+//    tm        -- spaced mers described by a template of
+//                 '1' (use base) and '0' (skip base)       (style 2)
+//
+//  Supplying both cm and tm is not supported and asserts.  A template
+//  without any '1', or a mer larger than KMER_WORDS * 32 bases, prints a
+//  message and exits.
+//
+kMerBuilder::kMerBuilder(uint32 ms, uint32 cm, char *tm) {
+  _style                    = 0;
+
+  _merSize                  = 0;
+  _merSizeValid             = 0L;
+  _merSizeValidIs           = 0;
+  _merSizeValidZero         = 0;
+
+  _merStorage               = 0L;
+  _fMer                     = 0L;
+  _rMer                     = 0L;
+
+  _compression              = 0;
+  _compressionIndex         = 0;
+  _compressionFirstIndex    = 0;
+  _compressionLength        = 0L;
+  _compressionCurrentLength = 0;
+
+  _templateSpan             = 0;
+  _templateLength           = 0;
+  _template                 = 0L;
+  _templatePos              = 0;
+  _templateMer              = 0;
+  _templateFirst            = 0;
+
+  if (ms) {
+    _style            = 0;
+    _merSize          = ms;
+    _merSizeValidIs   = _merSize + _merSize;
+    _merSizeValidZero = _merSize;
+  }
+
+  if (cm) {
+    _style = 1;
+
+    _merSize          = ms;
+    _merSizeValidIs   = _merSize + _merSize;
+    _merSizeValidZero = _merSize;
+
+    _compression              = cm;
+    _compressionIndex         = 0;
+    _compressionFirstIndex    = 0;
+    _compressionLength        = 0L;
+    _compressionCurrentLength = 0;
+
+    assert(_compression < _merSize);
+  }
+
+  if (tm) {
+    _style          = 2;
+    _merSize        = 0;
+    _templateSpan   = strlen(tm);
+    _templateLength = 0;
+    _template       = new char [_templateSpan + 1];
+    _templatePos    = 0;
+    _templateMer    = 0;
+    _templateFirst  = 1;
+
+    //  Templates cannot begin or end in zero -- they MUST begin/end
+    //  with a letter.  We silently fix these problems.  Unless there
+    //  are no 1's in the string, then we bail.
+    //
+    //  Skip everything before the first '1', not just '0' characters.
+    //  Any other character is stored as a zero below, so a template with
+    //  no '1' at all (e.g. "0x0") must be caught here; otherwise the
+    //  trailing-zero scan runs off the front of _template.
+
+    uint32 i=0, t=0;
+    while ((i < _templateSpan) && (tm[i] != '1'))
+      i++;
+
+    if (i == _templateSpan) {
+      fprintf(stderr, "invalid kMerBuilder template '%s' -- its empty!\n", tm);
+      exit(1);
+    }
+
+    //  Copy the rest; each '1' contributes one base to the mer size.
+    while (i < _templateSpan) {
+      _template[t] = 0;
+
+      if (tm[i] == '1') {
+        _template[t] = 1;
+        _merSize++;
+      }
+
+      i++;
+      t++;
+    }
+
+    //  Trim trailing zeros.  _template[0] is 1, so this terminates.
+    while (_template[--t] == 0)
+      ;
+
+    _templateSpan = t + 1;
+
+#ifdef DEBUGSPACE
+    for (i=0; i<_templateSpan; i++)
+      fprintf(stderr, "%d", _template[i]);
+    fprintf(stderr, " -- %d\n", _templateSpan);
+#endif
+
+    //  Look for patterns in the template, set templateLength to be the
+    //  size of the pattern.  (No pattern detection is done here; the
+    //  whole span is used.)
+
+    _templateLength = _templateSpan;
+
+    //  Finally, we can set what valid and zero mersizes are.
+    _merSizeValidIs   = _templateLength + _merSize;
+    _merSizeValidZero = _templateLength;
+  }
+
+  if (cm && tm) {
+    _style = 3;
+    assert(0);
+  }
+
+  if (_merSize > KMER_WORDS * 32) {
+    fprintf(stderr, "kMer size too large; increase KMER_WORDS in libbio/kmer.H\n");
+    exit(1);
+  }
+
+  //  One run length per base in the mer: zero when compressing (nothing
+  //  added yet), one otherwise.
+  _compressionLength = new uint32 [_merSize];
+
+  for (uint32 z=0; z<_merSize; z++)
+    _compressionLength[z] = (cm) ? 0 : 1;
+
+  if (tm) {
+    //  A forward/reverse pair of mers for every template position.
+    _merStorage   = new kMer   [_templateLength * 2];
+    _merSizeValid = new uint32 [_templateLength];
+
+    for (uint32 i=0; i<2*_templateLength; i++) {
+      _merStorage[i].setMerSize(_merSize);
+      _merStorage[i].setMerSpan(_templateSpan);
+    }
+
+    //  VERY IMPORTANT!  Offset the valid length to adjust for the
+    //  template that every mer except the first is starting in the
+    //  middle of.
+    //
+    for (uint32 i=0; i<_templateLength; i++)
+      _merSizeValid[i] = _merSize - i;
+  } else {
+    _merStorage   = new kMer   [2];
+    _merSizeValid = new uint32 [1];
+
+    _merStorage[0].setMerSize(_merSize);
+    _merStorage[1].setMerSize(_merSize);
+
+    _merSizeValid[0] = _merSizeValidZero;
+
+    if (cm) {
+      _merStorage[0].setMerSpan(0);
+      _merStorage[1].setMerSpan(0);
+    }
+  }
+
+  _fMer = _merStorage + 0;
+  _rMer = _merStorage + 1;
+}
+
+
+
+//  Release every array the constructor allocated.  _template stays null
+//  unless a template was supplied, and deleting a null pointer does nothing.
+kMerBuilder::~kMerBuilder() {
+  delete [] _template;
+  delete [] _compressionLength;
+  delete [] _merStorage;
+  delete [] _merSizeValid;
+}
+
+
+
+//  Forget everything accumulated so far so that the next base starts a new
+//  mer.  With clearMer false the bits of the active forward/reverse mers
+//  are left alone (spaced mers are always cleared).
+void
+kMerBuilder::clear(bool clearMer) {
+
+  //  Contiguous mers: back to 'zero' valid bases.
+  _merSizeValid[0] = _merSizeValidZero;
+
+  //  Compressed mers: no run in progress, no run lengths, zero span.
+  if (_compression != 0) {
+    _compressionCurrentLength = 0;
+    _compressionFirstIndex    = 0;
+    _compressionIndex         = 0;
+
+    for (uint32 base=0; base<_merSize; base++)
+      _compressionLength[base] = 0;
+
+    _merStorage[0].setMerSpan(0);
+    _merStorage[1].setMerSpan(0);
+  }
+
+  //  Spaced mers: wipe the forward/reverse pair kept for each template
+  //  position and restore the per-position starting offsets.
+  if (_template != 0L) {
+    for (uint32 t=0; t<_templateLength; t++) {
+      _merStorage[2*t + 0].clear();
+      _merStorage[2*t + 1].clear();
+
+      _merSizeValid[t] = _merSize - t;
+    }
+
+    _templateFirst = 1;
+    _templateMer   = 0;
+    _templatePos   = 0;
+  }
+
+  if (!clearMer)
+    return;
+
+  _fMer->clear();
+  _rMer->clear();
+}
+
+
+
+
+//
+// The addBase methods add a single base (cf - forward, cr - complemented) to
+// the mer. They return true if another base is needed to finish the mer, and
+// false if the mer is complete.
+//
+
+
+
+
+
+//  Append one base (cf forward encoding, cr complement encoding) to a
+//  contiguous mer.  Returns true while more bases are needed, false once
+//  the mer is complete.
+bool
+kMerBuilder::addBaseContiguous(uint64 cf, uint64 cr) {
+  const bool isBase = ((cf & (unsigned char)0xfc) == 0);
+
+  //  Anything with bits set above the low two is not a base: start over
+  //  and ask for more.  clear(false) leaves the mer bits in place.
+  if (!isBase) {
+    clear(false);
+    return(true);
+  }
+
+  //  The forward mer grows at one end, the reverse mer at the other.
+  *_fMer += cf;
+  *_rMer -= cr;
+
+  //  The count stops increasing once the mer is full.
+  const bool isFull = (_merSizeValid[0] + 1 >= _merSizeValidIs);
+
+  if (!isFull)
+    _merSizeValid[0]++;
+
+  return(!isFull);
+}
+
+
+
+
+
+
+// Append one base (cf forward encoding, cr complement encoding) to a
+// homopolymer-compressed mer. A base that extends a run already at least
+// _compression long is not stored; only the run length and the mer span
+// grow. Returns true while more bases are needed, false when the mer is
+// complete.
+//
+// _compressionLength[] is used as a circular buffer with one entry per
+// base of the mer; _compressionIndex is the entry of the last base added.
+//
+bool
+kMerBuilder::addBaseCompressed(uint64 cf, uint64 cr) {
+
+ // Not a valid base, reset the mer to empty, and request more bases.
+ //
+ if (cf & (unsigned char)0xfc) {
+ clear();
+ return(true);
+ }
+
+ uint64 lb = theFMer().endOfMer(2); // Last base in the mer
+ uint32 ms = theFMer().getMerSpan(); // Span BEFORE adding the mer
+
+ // Nothing has been added since the last reset, so the low bits of the
+ // mer are not a real base; 9 can never equal a two-bit base code.
+ if (_merSizeValid[0] <= _merSizeValidZero)
+ lb = 9; // No valid last base (should probably be ~uint64ZERO, but that screws up diagnostic output)
+
+#ifdef DEBUGCOMP
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- lb="uint64FMT" cf="uint64FMT" ms="uint32FMT" ccl="uint32FMT" lvl="uint32FMT"\n",
+ lb, cf, ms, _compressionCurrentLength, _compression);
+#endif
+
+ // Always add one to the current length. When we started, it
+ // was 0. This represents the length AFTER adding the base.
+ //
+ _compressionCurrentLength++;
+
+ // If the lastbase is the same as the one we want to add (and
+ // there IS a last base), and we've seen too many of these,
+ // remember we've seen another letter in the run, and don't add
+ // it. Request another letter.
+ //
+ if ((lb == cf) && // last is the same as this
+ (_compressionCurrentLength > _compression)) { // run is already too big
+ _compressionLength[_compressionIndex]++;
+
+ // The mer bits are unchanged, but it now covers one more sequence position.
+ _fMer->setMerSpan(ms + 1);
+ _rMer->setMerSpan(ms + 1);
+
+#ifdef DEBUGCOMP
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- COMPRESSED currentIdx=%u first=%u",
+ _compressionIndex, _compressionFirstIndex);
+ for (uint32 x=0, y=_compressionFirstIndex; x<_merSize; x++) {
+ fprintf(stderr, " %u(%d)", _compressionLength[y], y);
+ y = (y + 1) % _merSize;
+ }
+ fprintf(stderr, "\n");
+#endif
+ return(true);
+ }
+
+ // Else, it's a new run (a different letter) or our run isn't
+ // big enough to compress and we need to add the duplicate
+ // letter.
+
+ *_fMer += cf;
+ *_rMer -= cr;
+
+ // If this is a new letter, propagate the current length to the first letter in this run. That
+ // way, when that letter is popped off the mer, we automagically update our span to include only
+ // as many letters as are here.
+ //
+ // 01234567890
+ //
+ // E.g. For sequence TATTTTTTAGT (that's 6 T's) with a mersize of 3 and compression 2, we'd have
+ // mers with position:
+ //
+ // TATTTTTTAGT
+ // #1 TAT position 0 (with lengths 1, 1, 1) uncompressed mer TAT
+ // #2 ATT position 1 (with lengths 1, 1, 1) ATT
+ // #3 TTA position 6 (with lengths 5, 1, 1) TTTTTTA
+ // #4 TAG position 7 TAG
+ // #5 AGT position 8 AGT
+ //
+ // In #2, because the length so far (1) is not >= the compression (2) we add a new base and
+ // return.
+ //
+ // In #3, the current length is >= the compression, so we keep stuffing on T's and incrementing
+ // the last length, stopping when we get the A. We now propagate the current length to the first
+ // letter in the run. Special case, if the first letter in the run is the first letter in the
+ // mer, we need to immediately update the span.
+
+#ifdef DEBUGCOMP
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- ADDNEWBASE currentIdx=%u first=%u",
+ _compressionIndex, _compressionFirstIndex);
+ for (uint32 x=0, y=_compressionFirstIndex; x<_merSize; x++) {
+ fprintf(stderr, " %u(%d)", _compressionLength[y], y);
+ y = (y + 1) % _merSize;
+ }
+ fprintf(stderr, "\n");
+#endif
+
+ // If we added a new letter, transfer the run-length count to the first letter in the previous
+ // run. In the above example, when we built the run, the lengths are (1, 1, 5). That is, all
+ // compression occurred on the last letter. When we shift off that first letter, we want to
+ // remove as much of the run as possible.
+
+ if (lb != cf) {
+ if (_compressionFirstIndex != _compressionIndex) {
+ _compressionLength[_compressionFirstIndex] += _compressionLength[_compressionIndex] - 1;
+ _compressionLength[_compressionIndex] = 1;
+ }
+ // The base being added starts the next run; it lands in the next slot.
+ _compressionFirstIndex = (_compressionIndex + 1) % _merSize;
+ _compressionCurrentLength = 1;
+ }
+
+ // Advance to the next slot of the circular buffer. It still holds the
+ // count of the base that the += above pushed out of the mer.
+ _compressionIndex = (_compressionIndex + 1) % _merSize;
+ ms -= _compressionLength[_compressionIndex]; // subtract the count for the letter we just shifted out
+
+#ifdef DEBUGCOMP
+ fprintf(stderr, "kMerBuilder::addBaseCompressed()-- ADDNEWBASE shifted out at idx="uint32FMT" with "uint32FMT" positions; final span "uint32FMT"\n",
+ _compressionIndex,
+ _compressionLength[_compressionIndex],
+ ms + 1);
+#endif
+
+ _compressionLength[_compressionIndex] = 1; // one letter at this position
+
+ _fMer->setMerSpan(ms + 1);
+ _rMer->setMerSpan(ms + 1);
+
+ // If there aren't enough bases, request another one.
+ if (_merSizeValid[0] + 1 < _merSizeValidIs) {
+ _merSizeValid[0]++;
+ return(true);
+ }
+
+ return(false); // Good! Don't need another letter.
+}
+
+
+
+
+
+
+//  Append one base (cf forward encoding, cr complement encoding) to the
+//  spaced mers.  One forward/reverse pair of mers is kept per template
+//  position; the base is pushed onto every pair whose template has a 1 at
+//  its current offset.  Returns true while more bases are needed, false
+//  when the mer selected by _templateMer is complete; in that case _fMer
+//  and _rMer point at it.
+//
+bool
+kMerBuilder::addBaseSpaced(uint64 cf, uint64 cr) {
+#ifdef DEBUGSPACE
+  //  Only the encoded base is available here, so print that.
+  fprintf(stderr, "add "uint64FMT" templatePos=%u templateMer=%u\n", cf, _templatePos, _templateMer);
+#endif
+
+  //  We always advance the templatePos, unfortunately, we need to
+  //  use the current value throughout this function.  If there
+  //  was a single return point, we could advance immediately
+  //  before returning.
+  //
+  uint32 tp = _templatePos;
+  _templatePos = (_templatePos + 1) % _templateLength;
+
+  //  If we get an invalid letter, set all mers that would have
+  //  had a letter added to be broken.
+  //
+  if (cf & (unsigned char)0xfc) {
+
+    for (uint32 m=0; m<_templateLength; m++) {
+      //  Offset into the template at which mer m sees this base.
+      uint32 tppos = (tp + _templateLength - m) % _templateLength;
+
+      if (_template[tppos] == 1) {
+
+        //  Reset to 'zero', but make it skip over any remaining
+        //  positions in the current template.
+        //
+        _merSizeValid[m] = _merSizeValidZero + tppos - _templateLength + 1;
+
+#ifdef DEBUGSPACE
+        fprintf(stderr, "-- invalid letter, reset mer %u to valid %u (mersizevalidzero=%u ttpos=%u templatelength=%u)\n",
+                m, _merSizeValid[m], _merSizeValidZero, tppos, _templateLength);
+#endif
+      }
+    }
+
+    if (_templateFirst == 0)
+      _templateMer = (_templateMer + 1) % _templateLength;
+
+    return(true);
+  }
+
+  //  We have a valid letter, and add it to all the mers that the
+  //  template allows.
+  //
+  for (uint32 m=0; m<_templateLength; m++) {
+    uint32 tppos = (tp + _templateLength - m) % _templateLength;
+
+    if (_template[tppos] == 1) {
+      _merStorage[2*m+0] += cf;
+      _merStorage[2*m+1] -= cr;
+
+      if (_merSizeValid[m] < _merSizeValidIs)
+        _merSizeValid[m]++;
+
+#ifdef DEBUGSPACE
+      fprintf(stderr, "push "uint64FMT" onto %d (at template %u) length = %u %s\n",
+              cf, m, tppos,
+              _merSizeValid[m],
+              (_merSizeValid[m] >= _merSizeValidIs) ? "complete" : "");
+#endif
+    } else if (_merSizeValid[m] <= _merSizeValidZero) {
+
+      //  The template doesn't want us to add a letter to the mer,
+      //  but we're adjusting for an aborted template, and we're
+      //  counting template positions (not just non-zero template
+      //  positions) when adjusting.
+      //
+      _merSizeValid[m]++;
+    }
+  }
+
+  //  If the current mer isn't long enough, we move to the next mer,
+  //  and request another letter.
+  //
+  if (_merSizeValid[_templateMer] < _merSizeValidIs) {
+    if (_templateFirst == 0)
+      _templateMer = (_templateMer + 1) % _templateLength;
+#ifdef DEBUGSPACE
+    fprintf(stderr, "-- too short -- need more templateMer=%u templateFirst=%u\n", _templateMer, _templateFirst);
+#endif
+    return(true);
+  }
+
+  //  On startup, _templateMer is always 0 (the first mer) until
+  //  it is long enough to be a valid mer.  Then, we clear
+  //  _templateFirst so that we can start advancing through mers.
+
+  //  Update the f and r pointers to the correct mers, advance our
+  //  template to the next, and terminate.
+  //
+  _fMer = _merStorage + 2 * _templateMer + 0;
+  _rMer = _merStorage + 2 * _templateMer + 1;
+
+#ifdef DEBUGSPACE
+  fprintf(stderr, "-- valid! (templateMer = %u)\n", _templateMer);
+#endif
+
+  _templateFirst = 0;
+  _templateMer   = (_templateMer + 1) % _templateLength;
+
+  return(false);  //  Good!  Don't need another letter.
+}
+
+
+
+
+
+
+//  Mers that are both compressed and spaced are not implemented; asking
+//  for one reports the problem and terminates the program.
+bool
+kMerBuilder::addBaseCompressedSpaced(uint64 cf, uint64 cr) {
+  (void)cf;
+  (void)cr;
+
+  fprintf(stderr, "kMerBuilder::addBaseCompressedSpaced()-- Compressed and spaced mers not supported.\n");
+  exit(1);
+
+  return(false);  //  Not reached; gives the non-void function a return on every path.
+}
+
diff --git a/libbio/kmer.H b/libbio/kmer.H
new file mode 100644
index 0000000..fac11dd
--- /dev/null
+++ b/libbio/kmer.H
@@ -0,0 +1,160 @@
+// Copyright (c) 2005 J. Craig Venter Institute
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+// A 'simple' kMer datastructure.
+
+#ifndef BIO_KMER_H
+#define BIO_KMER_H
+
+// The maximum size of a mer. You get 32 bases per word, so
+// KMER_WORDS=4 will get you up to a 128-mer.
+//
+#define KMER_WORDS 1
+
+#include "util.h"
+#include "util++.H"
+#include "bio.h"
+#include "bio++.H"
+#include "kmeriface.H"
+
+#if KMER_WORDS == 1
+#include "kmertiny.H"
+typedef kMerTiny kMer;
+#else
+#include "kmerhuge.H"
+typedef kMerHuge kMer;
+#endif
+
+
+#undef DEBUGADDBASE
+#undef DEBUGCOMP
+#undef DEBUGSPACE
+
+
+//  Builds mers one base at a time, maintaining the forward and the reverse
+//  mer together.  The style (contiguous, compressed, spaced) is chosen by
+//  the constructor arguments: ms is the mer size, cm the compression level,
+//  tm a template string for spaced mers.
+//
+class kMerBuilder {
+public:
+  kMerBuilder(uint32 ms=0, uint32 cm=0, char *tm=0L);
+  ~kMerBuilder();
+
+  //  Clear all mer data, reset state to as just after construction.
+  void    clear(bool clearMer=true);
+
+private:
+  //  One implementation per style; each returns true if another base is
+  //  needed to finish the mer, and false if the mer is complete.
+  bool addBaseContiguous(uint64 cf, uint64 cr);
+  bool addBaseCompressed(uint64 cf, uint64 cr);
+  bool addBaseSpaced(uint64 cf, uint64 cr);
+  bool addBaseCompressedSpaced(uint64 cf, uint64 cr);
+
+public:
+  //  Add the letter ch to the mer.  Returns true if we need another base
+  //  to finish the mer, false if the mer is complete.
+  //
+  bool    addBase(char ch) {
+    //  Index the lookup tables with an unsigned value; a plain char can
+    //  be negative for bytes >= 0x80.
+    unsigned char uc = (unsigned char)ch;
+
+    uint64  cf = letterToBits[uc];
+    uint64  cr = letterToBits[(unsigned char)complementSymbol[uc]];
+
+#ifdef DEBUGADDBASE
+    fprintf(stderr, "addBase() %c\n", ch);
+#endif
+
+    if (_style == 0)
+      return(addBaseContiguous(cf, cr));
+
+    if (_style == 1)
+      return(addBaseCompressed(cf, cr));
+
+    if (_style == 2)
+      return(addBaseSpaced(cf, cr));
+
+    if (_style == 3)
+      return(addBaseCompressedSpaced(cf, cr));
+
+    fprintf(stderr, "kMerBuilder::addBase()-- Invalid mer type %d.\n", _style);
+    exit(1);
+
+    return(false);
+  }
+
+  //  Mask both active mers; the forward mer is masked 'full'.
+  void    mask(void) {
+    _fMer->mask(true);
+    _rMer->mask(false);
+  };
+
+  //  The forward mer, the reverse mer, and whichever of the two compares
+  //  smaller.
+  kMer const   &theFMer(void) { return(*_fMer); };
+  kMer const   &theRMer(void) { return(*_rMer); };
+  kMer const   &theCMer(void) { return((theFMer() < theRMer()) ? theFMer() : theRMer()); };
+
+  uint32        merSize(void)      { return(_merSize); };
+  uint32        templateSpan(void) { return(_templateSpan); };
+
+  //  Run length recorded for base b of the mer; slots are counted from the
+  //  one after _compressionIndex, wrapping around.
+  uint32        baseSpan(uint32 b) {
+    return(_compressionLength[(_compressionIndex + 1 + b) % _merSize]);
+  };
+
+private:
+
+  //  Style of builder we are
+  uint32   _style;
+
+  //  Amount of the mer that has valid sequence.  Sigh.  I really needed a signed value here --
+  //  where negative values mean that we first have to get to the end of the template that was
+  //  invalid, then we need to build a new mer.
+  //
+  //  And, yes, just simply making it signed leads to all sorts of compiler warnings about
+  //  comparing signed and unsigned.  And I've been here before, and those warnings just propagate
+  //  endlessly.  Just go away, Mr. Smartypants.
+  //
+  //  Details: when building spaced seeds, if we hit an N in the middle of the template, we need to
+  //  invalidate the mer, but not start building a new mer until we exhaust the current template.
+  //  The example is template=1101.  Suppose we hit an N at the second 1.  We set the merSizeValid
+  //  to 0, and proceed.  When we push on the base for the last 1 in the template, we'd increment
+  //  the merSizeValid.  The first two 1's in the template would now create a mer big enough to be
+  //  valid, and we'd return it -- but now the template we're using is 0111.
+  //
+  //  _merSizeValid is offset by _merSize (e.g., the true valid size is _merSizeValid - _merSize).
+  //  _merSizeValidIs is the size _merSizeValid needs to be in order for it to be valid.
+  //  Similarly, _merSizeValidZero is the value of zero (currently this is equal to _merSize).
+  //
+  uint32   _merSize;              //  Desired number of bases in the mer
+  uint32  *_merSizeValid;         //  Actual number of bases in the mer
+  uint32   _merSizeValidZero;     //  Definition of 'zero' bases in the mer
+  uint32   _merSizeValidIs;       //  Definition of 'full' bases in the mer
+
+  //  An array of mers, we allocate all mers in one block
+  kMer    *_merStorage;
+
+  //  Pointer to the currently active mer
+  kMer    *_fMer;
+  kMer    *_rMer;
+
+  //  For compression
+  uint32   _compression;
+  uint32   _compressionIndex;         //  index into cL[] that is the last base in the mer
+  uint32   _compressionFirstIndex;    //  index into cL[] that is the first base in a run
+  uint32  *_compressionLength;        //  one per base
+  uint32   _compressionCurrentLength;
+
+  //  For templates
+  uint32   _templateSpan;     //  # of 0's and 1's in the template
+  uint32   _templateLength;   //  length of the pattern in the template
+  char    *_template;         //  character string template
+  uint32   _templatePos;      //  position we are building in the template
+  uint32   _templateMer;      //  the mer we should output next
+  uint32   _templateFirst;    //  if true, we're still building the initial mer
+};
+
+#endif // BIO_KMER_H
diff --git a/libbio/kmerhuge.H b/libbio/kmerhuge.H
new file mode 100644
index 0000000..13503bb
--- /dev/null
+++ b/libbio/kmerhuge.H
@@ -0,0 +1,396 @@
+
+#define MERWORD(N) _md[N]
+
+
+//  A mer stored in KMER_WORDS 64-bit words, two bits per base.  Word 0 holds
+//  the right end of the mer (the base most recently added with +=); higher
+//  words hold earlier bases.
+//
+class kMerHuge {
+public:
+  kMerHuge(uint32 ms=uint32ZERO) {
+    setMerSize(ms);
+    clear();
+  };
+  ~kMerHuge() {
+  };
+
+  void    setMerSize(uint32 ms);
+  uint32  getMerSize(void) const { return(_merSize); };
+
+  void    setMerSpan(uint32 ms)  { _merSpan = ms; };
+  uint32  getMerSpan(void) const { return(_merSpan); };
+
+  //  Reverse the order of the words, reverse the two-bit bases inside each
+  //  word and complement them, then shift right so the mer is again
+  //  aligned to bit zero.
+  //
+  kMerHuge  &reverseComplement(void) {
+    for (uint32 i=0, j=KMER_WORDS-1; i<KMER_WORDS/2; i++, j--) {
+      uint64 t = MERWORD(i);
+      MERWORD(i) = MERWORD(j);
+      MERWORD(j) = t;
+    }
+
+    for (uint32 i=0; i<KMER_WORDS; i++) {
+      MERWORD(i) = ((MERWORD(i) >>  2) & 0x3333333333333333llu) | ((MERWORD(i) <<  2) & 0xccccccccccccccccllu);
+      MERWORD(i) = ((MERWORD(i) >>  4) & 0x0f0f0f0f0f0f0f0fllu) | ((MERWORD(i) <<  4) & 0xf0f0f0f0f0f0f0f0llu);
+      MERWORD(i) = ((MERWORD(i) >>  8) & 0x00ff00ff00ff00ffllu) | ((MERWORD(i) <<  8) & 0xff00ff00ff00ff00llu);
+      MERWORD(i) = ((MERWORD(i) >> 16) & 0x0000ffff0000ffffllu) | ((MERWORD(i) << 16) & 0xffff0000ffff0000llu);
+      MERWORD(i) = ((MERWORD(i) >> 32) & 0x00000000ffffffffllu) | ((MERWORD(i) << 32) & 0xffffffff00000000llu);
+      MERWORD(i) ^= 0xffffffffffffffffllu;
+    }
+
+    *this >>= KMER_WORDS * 64 - 2 * _merSize;
+
+    return(*this);
+  };
+
+
+  void    clear(void) {
+    for (uint32 i=0; i<KMER_WORDS; i++)
+      MERWORD(i) = uint64ZERO;
+  };
+  void    smallest(void) {
+    clear();
+  };
+  void    largest(void) {
+    clear();
+    reverseComplement();
+  };
+
+private:
+  //  Shift the whole mer right by x bits, filling with zeros.
+  void     operator>>=(uint32 x) {
+
+    //  thisWord, the word we shift bits into
+    //  thatWord, the word we shift bits out of
+    //  shift, the number of bits we shift
+    //
+    uint32  thisWord = 0;
+    uint32  thatWord = x >> 6;
+    uint32  shift    = x & uint32MASK(6);
+
+    //  Do an initial word-size shift, to reduce the shift amount to
+    //  be less than wordsize.  Fill any shifted-out words with zero.
+    //
+    if (thatWord) {
+      while (thatWord < KMER_WORDS)
+        MERWORD(thisWord++) = MERWORD(thatWord++);
+      while (thisWord < KMER_WORDS)
+        MERWORD(thisWord++) = 0;
+    }
+
+    //  A shift by a whole number of words is finished.  Continuing
+    //  would shift a 64-bit word by (64 - 0) bits, which is undefined.
+    //
+    if (shift == 0)
+      return;
+
+    //  Do bit-size shift, of adjacent words
+    //
+    thisWord = 0;
+    thatWord = 1;
+    MERWORD(thisWord) >>= shift;
+    while (thatWord < KMER_WORDS) {
+      MERWORD(thisWord++) |= MERWORD(thatWord) << (64 - shift);
+      MERWORD(thatWord++) >>= shift;
+    }
+  };
+
+  //  Shift the whole mer left by x bits, filling with zeros.
+  void      operator<<=(uint32 x) {
+    uint32  thisWord = KMER_WORDS;
+    uint32  thatWord = KMER_WORDS - (x >> 6);
+    uint32  shift    = x & uint32MASK(6);
+
+    //  Word-size part of the shift first, highest word downwards.
+    if (thatWord != KMER_WORDS) {
+      while (thatWord > 0)
+        MERWORD(--thisWord) = MERWORD(--thatWord);
+      while (thisWord > 0)
+        MERWORD(--thisWord) = 0;
+    }
+
+    //  As in operator>>=(): nothing left to do, and (64 - 0) would be an
+    //  undefined shift.
+    //
+    if (shift == 0)
+      return;
+
+    thisWord = KMER_WORDS;
+    thatWord = KMER_WORDS - 1;
+    MERWORD(thisWord-1) <<= shift;
+    while (thatWord > 0) {
+      --thisWord;
+      --thatWord;
+      MERWORD(thisWord) |= MERWORD(thatWord) >> (64 - shift);
+      MERWORD(thatWord) <<= shift;
+    }
+  };
+
+
+public:
+  //  Shift a base (two bits) onto the right end of the mer.
+  void   operator+=(uint64 x) {
+    *this <<= 2;
+    assert((x & 0xfc) == 0);
+    MERWORD(0) |= x & uint64NUMBER(0x3);
+  };
+  //  Shift a base (two bits) onto the left end of the mer.
+  void   operator-=(uint64 x) {
+    *this >>= 2;
+    assert((x & 0xfc) == 0);
+    MERWORD(_lastWord) |= (x & uint64NUMBER(0x3)) << _lastShift;
+  };
+
+  //  Drop bits beyond _merSize bases in the mask word; if 'full', also
+  //  zero every word above it.
+  void     mask(bool full) {
+    MERWORD(_maskWord) &= _mask;
+    if (full)
+      for (uint32 x=_maskWord+1; x<KMER_WORDS; x++)
+        MERWORD(x) = uint64ZERO;
+  };
+
+public:
+  //  Comparisons use all KMER_WORDS words, most significant word first.
+  bool    operator!=(kMerHuge const &r) const {
+    uint64 res = uint64ZERO;
+    for (uint32 i=KMER_WORDS; i--; )
+      res |= MERWORD(i) ^ r.MERWORD(i);
+    return(res != uint64ZERO);
+  };
+  bool    operator==(kMerHuge const &r) const {
+    uint64 res = uint64ZERO;
+    for (uint32 i=KMER_WORDS; i--; )
+      res |= MERWORD(i) ^ r.MERWORD(i);
+    return(res == uint64ZERO);
+  };
+
+  bool    operator<(kMerHuge const &r) const {
+    for (uint32 i=KMER_WORDS; i--; ) {
+      if (MERWORD(i) < r.MERWORD(i))  return(true);
+      if (MERWORD(i) > r.MERWORD(i))  return(false);
+    }
+    return(false);
+  };
+  bool    operator>(kMerHuge const &r) const {
+    for (uint32 i=KMER_WORDS; i--; ) {
+      if (MERWORD(i) > r.MERWORD(i))  return(true);
+      if (MERWORD(i) < r.MERWORD(i))  return(false);
+    }
+    return(false);
+  };
+  bool    operator<=(kMerHuge const &r) const {
+    for (uint32 i=KMER_WORDS; i--; ) {
+      if (MERWORD(i) < r.MERWORD(i))  return(true);
+      if (MERWORD(i) > r.MERWORD(i))  return(false);
+    }
+    return(true);
+  };
+  bool    operator>=(kMerHuge const &r) const {
+    for (uint32 i=KMER_WORDS; i--; ) {
+      if (MERWORD(i) > r.MERWORD(i))  return(true);
+      if (MERWORD(i) < r.MERWORD(i))  return(false);
+    }
+    return(true);
+  };
+  //  Three-way comparison: -1, 0 or 1.
+  int     qsort_less(kMerHuge const &r) const {
+    for (uint32 i=KMER_WORDS; i--; ) {
+      if (MERWORD(i) < r.MERWORD(i))  return(-1);
+      if (MERWORD(i) > r.MERWORD(i))  return(1);
+    }
+    return(0);
+  };
+
+
+public:
+  //  The lowest word of the mer.
+  operator uint64 () const {return(MERWORD(0));};
+
+
+public:
+  //  these should work generically for both big and small
+
+  //  Write the low numBits bits of the mer (all of it when numBits is 0),
+  //  any partial high word first, then whole words from high to low.
+  //
+  //  NOTE(review): when numBits is a multiple of 64, lastWord is
+  //  incremented, so the loop also emits MERWORD(numBits/64) -- one word
+  //  more than numBits calls for, and past the end of _md[] when the mer
+  //  fills every word.  This looks like an off-by-one, but it defines the
+  //  stored layout and readFromBitPackedFile() mirrors it, so it is left
+  //  unchanged here -- confirm before fixing both together.
+  //
+  void   writeToBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) const {
+    if (numBits == 0)
+      numBits = _merSize << 1;
+
+    uint32  lastWord = numBits >> 6;
+
+    if ((numBits & uint32MASK(6)) == 0)
+      lastWord++;
+
+    if (numBits & uint32MASK(6))
+      BPF->putBits(MERWORD(lastWord), numBits & uint32MASK(6));
+    while (lastWord > 0) {
+      lastWord--;
+      BPF->putBits(MERWORD(lastWord), 64);
+    }
+  };
+  //  Inverse of writeToBitPackedFile(); see the NOTE(review) there.
+  void   readFromBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) {
+    if (numBits == 0)
+      numBits = _merSize << 1;
+
+    uint32  lastWord = numBits >> 6;
+
+    if ((numBits & uint32MASK(6)) == 0)
+      lastWord++;
+
+    if (numBits & uint32MASK(6))
+      MERWORD(lastWord) = BPF->getBits(numBits & uint32MASK(6));
+    while (lastWord > 0) {
+      lastWord--;
+      MERWORD(lastWord) = BPF->getBits(64);
+    }
+  };
+
+
+public:
+  //  these should work generically for both big and small
+
+  //  Replace numbits bits of the mer, starting at bit position pos, with
+  //  the low numbits bits of val.  Exits if the bits fall outside the
+  //  storage.
+  //
+  void    setBits(uint32 pos, uint32 numbits, uint64 val) {
+    uint32  wrd = pos >> 6;
+    uint32  bit = pos  & 0x3f;
+
+    val &= uint64MASK(numbits);
+
+    if (wrd >= KMER_WORDS) {
+      fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos="uint32FMT" numbits="uint32FMT" larger than KMER_WORDS=%d\n",
+              pos, numbits, KMER_WORDS), exit(1);
+    }
+
+    //  If we have enough space in the word for the bits, replace
+    //  those bits in the word.  Otherwise we need to split the value
+    //  into two pieces, and add to the end of the first word and the
+    //  start of the second.
+
+    if (64 - bit >= numbits) {
+      MERWORD(wrd) &= ~(uint64MASK(numbits) << bit);
+      MERWORD(wrd) |=  val << bit;
+    } else {
+      if (wrd+1 >= KMER_WORDS) {
+        fprintf(stderr, "kMer::setBits()-- ERROR: tried to set pos="uint32FMT" numbits="uint32FMT" larger than KMER_WORDS=%d\n",
+                pos, numbits, KMER_WORDS), exit(1);
+      }
+
+      uint32 b1 = 64 - bit;      //  bits in the first word
+      uint32 b2 = numbits - b1;  //  bits in the second word
+
+      MERWORD(wrd) &= ~(uint64MASK(b1) << bit);
+      MERWORD(wrd) |= (val & uint64MASK(b1)) << bit;
+
+      MERWORD(wrd+1) &= ~(uint64MASK(b2));
+      MERWORD(wrd+1) |= (val >> b1) & uint64MASK(b2);
+    }
+  };
+
+  //  Return numbits bits of the mer starting at bit position pos.  Exits
+  //  if the bits fall outside the storage.
+  //
+  uint64  getBits(uint32 pos, uint32 numbits) const {
+    uint64  val = uint64ZERO;
+    uint32  wrd = pos >> 6;
+    uint32  bit = pos  & 0x3f;
+
+    if (wrd >= KMER_WORDS) {
+      fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos="uint32FMT" numbits="uint32FMT" larger than KMER_WORDS=%d\n",
+              pos, numbits, KMER_WORDS), exit(1);
+    }
+
+    if (64 - bit >= numbits) {
+      val = MERWORD(wrd) >> bit;
+    } else {
+      if (wrd+1 >= KMER_WORDS) {
+        fprintf(stderr, "kMer::getBits()-- ERROR: tried to get pos="uint32FMT" numbits="uint32FMT" larger than KMER_WORDS=%d\n",
+                pos, numbits, KMER_WORDS), exit(1);
+      }
+
+      uint32 b1 = 64 - bit;      //  bits in the first word
+      uint32 b2 = numbits - b1;  //  bits in the second word
+
+      val  = MERWORD(wrd) >> (64-b1);
+      val |= (MERWORD(wrd+1) & uint64MASK(b2)) << b1;
+    }
+
+    val &= uint64MASK(numbits);
+    return(val);
+  };
+
+
+public:
+  //  these should work generically for both big and small
+
+  //  The first (left-most) 'bits' bits of the mer.
+  uint64  startOfMer(uint32 bits) const {
+    return(getBits((_merSize << 1) - bits, bits));
+  };
+  //  The last (right-most) 'bits' bits of the mer; always in word 0.
+  uint64  endOfMer(uint32 bits) const {
+    return(MERWORD(0) & uint64MASK(bits));
+  };
+
+public:
+  //  these should work generically for both big and small
+  uint64  getWord(uint32 wrd) const        { return(MERWORD(wrd)); };
+  void    setWord(uint32 wrd, uint64 val)  { MERWORD(wrd) = val;   };
+
+public:
+  char    *merToString(char *instr) const;
+
+private:
+  uint64   _md[KMER_WORDS];
+
+  //  The _merSize is always the number of letters in the mer -- if we
+  //  are a spaced seed, it is the weight.
+  //
+  uint32   _merSize;
+  uint32   _merSpan;
+
+  //  The mask is used to make sure the mer has only _merSize bases
+  //  set -- we can get more than that if we shift to the left.  The
+  //  _maskWord is the word that we want to mask:
+  //
+  uint64   _mask;
+  uint32   _maskWord;
+
+  //  For operator-=() (add a base to the left end) we need to know
+  //  what the last word is, and how far to shift the bits.
+  //
+  //  _lastWord  -- the last word that contains bases
+  //  _lastShift -- the amount we need to shift left to put bits 0 and 1
+  //                into the last base
+  uint32   _lastWord;
+  uint32   _lastShift;
+};
+
+
+
+//  Set the number of bases in the mer (the span is set to the same value)
+//  and derive the mask and the position of the left-most base.  Exits with
+//  a message if ms bases do not fit in KMER_WORDS words.
+//
+inline
+void
+kMerHuge::setMerSize(uint32 ms) {
+
+  //  Check the size before anything else.  Checking only the mask word
+  //  afterwards misses oversize mers that are a multiple of 32, because
+  //  the special case below clamps the mask word to the last word.
+  //
+  if (ms > (uint32)(KMER_WORDS * 32)) {
+    fprintf(stderr, "kMer::setMerSize()-- ERROR!  Desired merSize of "uint32FMT" larger than\n", ms);
+    fprintf(stderr, "                     available storage space (KMER_WORDS=%d, max merSize %d).\n", KMER_WORDS, KMER_WORDS*32);
+    exit(1);
+  }
+
+  _merSize   = ms;
+  _merSpan   = ms;
+
+  //  Position of the left-most base.  A zero-size mer (the default
+  //  constructor) has none; use word 0 so the unsigned 2*ms-2 cannot wrap
+  //  into an out-of-range word index.
+  //
+  _lastWord  = (ms == 0) ? 0 : (2 * ms - 2) / 64;
+  _lastShift = (ms == 0) ? 0 : (2 * ms - 2) % 64;
+
+  _mask     = uint64ZERO;
+  _maskWord = _merSize / 32;
+
+  //  Filled whole words with the mer, the mask is special-cased
+  //  to clear the whole next word, unless there is no whole next
+  //  word, then it does nothing on the last word.
+  //
+  //  Otherwise, we can construct the mask as usual.
+  //
+  if ((_merSize % 32) == 0) {
+    if (_maskWord >= KMER_WORDS) {
+      //  Only reached when the mer fills every word exactly.
+      _maskWord = KMER_WORDS - 1;
+      _mask     = ~uint64ZERO;
+    } else {
+      _mask     = uint64ZERO;
+    }
+  } else {
+    _mask = uint64MASK((_merSize % 32) << 1);
+  }
+}
+
+
+
+
+//  Write the mer as letters into instr, left-most base first, and return
+//  instr.  Any partial high word is converted first, then whole words from
+//  high to low, each with uint64ToMerString().
+//
+inline
+char *
+kMerHuge::merToString(char *instr) const {
+  uint32  lastWord = _merSize >> 5;              //  number of completely filled words
+  uint32  partial  = _merSize & uint32MASK(5);   //  bases in the partially filled word
+  char   *str      = instr;
+
+  //  The loop below covers words lastWord-1 down to 0, which is exactly
+  //  the filled words.  lastWord must not be advanced for word-aligned
+  //  sizes; doing so converts one word beyond the mer, reading past _md[]
+  //  for a full-size mer and writing 32 letters too many.
+
+  if (partial) {
+    uint64ToMerString(partial, MERWORD(lastWord), str);
+    str += partial;
+  }
+
+  while (lastWord > 0) {
+    lastWord--;
+    uint64ToMerString(32, MERWORD(lastWord), str);
+    str += 32;
+  }
+
+  return(instr);
+}
diff --git a/libbio/kmeriface.H b/libbio/kmeriface.H
new file mode 100644
index 0000000..fc0f6b5
--- /dev/null
+++ b/libbio/kmeriface.H
@@ -0,0 +1,83 @@
+
+#if 0
+
+// Documentation, really.
+
+// Incomplete too.
+
+class kMerInterface {
+ kMerInterface() {};
+ virtual ~kMerInterface() {};
+
+ // Reverse all the words, reverse and complement the bases in
+ // each word, then shift right to align the edge.
+ //
+ virtual kMerInterface &reverseComplement(void) = 0;
+ virtual void clear(void);
+
+ // Construct a mer by shifting bases onto the end:
+ // += shifts onto the right end
+ // -= shifts onto the left end
+ //
+ virtual void operator+=(uint64 x) = 0;
+ virtual void operator-=(uint64 x) = 0;
+
+ // used by merStream at least
+ //
+ virtual void mask(bool) = 0;
+
+ // Return the mer, as a 64-bit integer. If the mer is more than
+ // 32-bases long, then the left-most (the earliest, the start, etc)
+ // bases are used.
+ //
+ virtual operator uint64 () const = 0;
+
+ // These are written/read in 5'endian, which isn't the most natural
+ // implementation. It's done this way to keep the sequence in
+ // order (e.g., the merStreamFile). Don't change the order.
+ //
+ // On the other hand, the implementation (of write anyway) is
+ // basically the same as merToString().
+ //
+ // Takes an optional number of BITS to write, pulled from the
+ // END of the mer.
+ //
+ virtual void writeToBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) const = 0;
+ virtual void readFromBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) = 0;
+
+ // Returns a sub-mer from either the start (left end) or the end
+ // (right end) of the mer. The sub-mer must be at most 64 bits
+ // long. Yes, BITS.
+ //
+ // The start is difficult, because it can span multiple words. The
+ // end is always in the first word.
+ //
+ virtual uint64 startOfMer(uint32 bits) const = 0;
+ virtual uint64 endOfMer(uint32 bits) const = 0;
+
+ // Set 'numbits' bits from (the end of) 'val' at bit position 'pos'
+ // in the mer. This is wildly low-level, but merylStreamReader
+ // needs it.
+ //
+ // The position is measured from the right end.
+ // (0, 8, X) would copy the bits 7 to 0 of X to bits 7 to 0 of the mer.
+ //
+ // Argh! Can't use set/getDecodedValue because that is doing things in the wrong order.
+ //
+ // Meryl
+ //
+ virtual uint64 getWord(uint32 wrd) const = 0; // { return(MERWORD(wrd)); };
+ virtual void setWord(uint32 wrd, uint64 val) = 0; // { MERWORD(wrd) = val; };
+
+ // Show the mer as ascii
+ //
+ // Doesn't print the last full word, if it's on the word boundary
+ //
+ // We build the string right to left, print any partial word first,
+ // then print whole words until we run out of words to print.
+ //
+ virtual char *merToString(char *instr) const = 0;
+};
+
+
+#endif
diff --git a/libbio/kmertiny.H b/libbio/kmertiny.H
new file mode 100644
index 0000000..55f436c
--- /dev/null
+++ b/libbio/kmertiny.H
@@ -0,0 +1,147 @@
+
+// kMerTiny -- a mer held in a single 64-bit word, two bits per base.
+//
+// The last (rightmost) base of the mer lives in the two least
+// significant bits of _md; see merToString().  Because everything is in
+// one word, the 'wrd' argument of getWord()/setWord() is ignored.
+//
+class kMerTiny {
+public:
+  kMerTiny(uint32 ms=uint32ZERO) {
+    setMerSize(ms);
+    clear();
+  };
+  ~kMerTiny() {
+  };
+
+  // Setting the mer size also resets the span, the mask and the shift
+  // used by operator-=().
+  //
+  void    setMerSize(uint32 ms);
+  uint32  getMerSize(void) const { return(_merSize); };
+
+  void    setMerSpan(uint32 ms)  { _merSpan = ms; };
+  uint32  getMerSpan(void) const { return(_merSpan); };
+
+  // Replaces the mer with its reverse complement; returns *this.
+  //
+  kMerTiny  &reverseComplement(void) {
+    _md = reverseComplementMer(_merSize, _md);
+    return(*this);
+  };
+
+  void    clear(void) {
+    _md = uint64ZERO;
+  };
+
+  // The smallest mer is all zero bits; the largest is its reverse
+  // complement.
+  //
+  void    smallest(void) {
+    clear();
+  };
+  void    largest(void) {
+    clear();
+    reverseComplement();
+  };
+
+private:
+  void    operator>>=(uint32 x) {
+    _md >>= x;
+  };
+  void    operator<<=(uint32 x) {
+    _md <<= x;
+  };
+
+public:
+  // Add base 'x' (a two-bit code) to the right end.  The base shifted
+  // off the left end is NOT removed here; call mask() for that.
+  //
+  void    operator+=(uint64 x) {
+    *this <<= 2;
+    assert((x & 0xfc) == 0);
+    _md |= x & uint64NUMBER(0x3);
+  };
+
+  // Add base 'x' (a two-bit code) to the left end, dropping the
+  // rightmost base.
+  //
+  void    operator-=(uint64 x) {
+    *this >>= 2;
+    assert((x & 0xfc) == 0);
+    _md |= (x & uint64NUMBER(0x3)) << _lastShift;
+  };
+
+public:
+  // Discards any bits above the mer.  The bool argument is unused.
+  //
+  void    mask(bool) {
+    _md &= _mask;
+  };
+
+public:
+  bool    operator!=(kMerTiny const &r) const { return(_md != r._md); };
+  bool    operator==(kMerTiny const &r) const { return(_md == r._md); };
+  bool    operator< (kMerTiny const &r) const { return(_md <  r._md); };
+  bool    operator> (kMerTiny const &r) const { return(_md >  r._md); };
+  bool    operator<=(kMerTiny const &r) const { return(_md <= r._md); };
+  bool    operator>=(kMerTiny const &r) const { return(_md >= r._md); };
+
+  // Three-way compare: -1, 0 or 1.
+  //
+  int     qsort_less(kMerTiny const &r) const {
+    if (_md < r._md)  return(-1);
+    if (_md > r._md)  return( 1);
+    return(0);
+  };
+
+public:
+  operator uint64 () const {return(_md);};
+
+public:
+  // NOTE: numBits is accepted but ignored by both functions; exactly
+  // 2 * _merSize bits are always written / read.
+  //
+  void    writeToBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) const {
+    BPF->putBits(_md, _merSize << 1);
+  };
+  void    readFromBitPackedFile(bitPackedFile *BPF, uint32 numBits=0) {
+    _md = BPF->getBits(_merSize << 1);
+  };
+
+public:
+  // Store the low 'numbits' bits of 'val' at bit position 'pos'
+  // (measured from the right end).  'val' is masked to 'numbits' so
+  // that stray high bits cannot leak into the bits above the field.
+  //
+  void    setBits(uint32 pos, uint32 numbits, uint64 val) {
+    uint64  field = uint64MASK(numbits);
+
+    _md &= ~(field << pos);
+    _md |= (val & field) << pos;
+  };
+
+  uint64  getBits(uint32 pos, uint32 numbits) const {
+    return((_md >> pos) & uint64MASK(numbits));
+  };
+
+public:
+  // The leftmost / rightmost 'bits' BITS of the mer.
+  //
+  uint64  startOfMer(uint32 bits) const {
+    return(getBits((_merSize << 1) - bits, bits));
+  };
+  uint64  endOfMer(uint32 bits) const {
+    return(_md & uint64MASK(bits));
+  };
+
+public:
+  uint64  getWord(uint32 wrd) const        { return(_md); };
+  void    setWord(uint32 wrd, uint64 val)  { _md = val;   };
+
+public:
+  // Writes _merSize letters and a terminating NUL into 'instr';
+  // returns 'instr'.
+  //
+  char    *merToString(char *instr) const;
+
+private:
+  uint64   _md;
+
+  //  The _merSize is always the number of letters in the mer -- if we
+  //  are a spaced seed, it is the weight.
+  //
+  uint32   _merSize;
+  uint32   _merSpan;
+
+  //  The mask is used to make sure the mer has only _merSize bases
+  //  set -- we can get more than that if we shift to the left.
+  //
+  uint64   _mask;
+
+  //  For operator-=() (add a base to the left end) we need to know
+  //  how far to shift the new base's bits.
+  //
+  uint32   _lastShift;
+};
+
+
+
+
+
+// Sets the number of bases in the mer.  The span is reset to the same
+// value, and the mask and the operator-=() shift are recomputed from
+// the number of bits (two per base).
+//
+inline
+void
+kMerTiny::setMerSize(uint32 ms) {
+  uint32  nbits = ms << 1;
+
+  _merSize   = ms;
+  _merSpan   = ms;
+  _mask      = uint64MASK(nbits);
+  _lastShift = (nbits - 2) % 64;
+}
+
+
+// Decodes the mer into 'str', filling from the last letter back to the
+// first (the last letter is in the low two bits of _md), then returns
+// 'str'.  'str' must have room for _merSize letters plus the NUL.
+//
+inline
+char *
+kMerTiny::merToString(char *str) const {
+  uint64   bits = _md;
+  char    *out  = str + _merSize;
+
+  *out = 0;
+
+  while (out != str) {
+    *--out = bitsToLetter[bits & 0x03];
+    bits >>= 2;
+  }
+
+  return(str);
+}
diff --git a/libbio/merCovering.H b/libbio/merCovering.H
new file mode 100644
index 0000000..52af4d8
--- /dev/null
+++ b/libbio/merCovering.H
@@ -0,0 +1,353 @@
+#ifndef MER_COVERING_H
+#define MER_COVERING_H
+
+// This is an interval list, where the intervals are built using
+// fixed size pieces.
+//
+// It's designed to accept pieces in roughly sorted order.
+//
+// Intervals are stored c-style.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// merCovering -- a sorted, singly linked list of disjoint [lo, hi)
+// intervals.  Intervals are grown by adding fixed width pieces with
+// addMer(), or by merge()ing another merCovering into this one.
+//
+// The object owns its list nodes (the destructor frees them) and
+// declares no copy constructor or assignment, so copying one by value
+// would leave two objects freeing the same nodes.
+//
+class merCovering {
+private:
+  class interval {
+  public:
+    uint32    _lo;
+    uint32    _hi;
+    interval *_next;
+
+    interval(uint32 lo, uint32 hi, interval *n) {
+      _lo   = lo;
+      _hi   = hi;
+      _next = n;
+    }
+  };
+
+  interval    *_intervals;  //  Head of the list, lowest interval first
+  uint32       _width;      //  Length of every piece given to addMer()
+  uint32       _pieces;     //  Number of pieces added (including merged-in ones)
+#ifdef TEST_MERCOVERING
+  uint32       _test[TEST_SIZE];
+#endif
+
+public:
+  merCovering(uint32 w) {
+    _intervals = 0L;
+    _width     = w;
+    _pieces    = 0;
+
+#ifdef TEST_MERCOVERING
+    for (uint32 i=0; i<TEST_SIZE; i++)
+      _test[i] = 0;
+#endif
+  };
+
+  ~merCovering() {
+    clear();
+  };
+
+  // Frees every interval and resets the piece count.  The width is kept.
+  //
+  void          clear(void) {
+    interval *i = _intervals;
+
+    while (i) {
+      _intervals = i->_next;
+      delete i;
+      i = _intervals;
+    }
+
+    _intervals = 0L;
+    _pieces    = 0;
+  };
+
+  // Total number of positions covered by the intervals.
+  //
+  uint32        sumOfLengths(void) {
+    uint32 s=0;
+
+    for (interval *i=_intervals; i; i = i->_next)
+      s += i->_hi - i->_lo;
+
+    return(s);
+  };
+
+  uint32        numberOfPieces(void) {
+    return(_pieces);
+  };
+
+  // Adds the piece [lo, lo + _width) to the covering, extending or
+  // joining existing intervals where it touches or overlaps them.
+  //
+  // The cases below rely on every existing interval being at least
+  // _width long, which holds when the list was built from pieces of
+  // this same width.
+  //
+  void          addMer(uint32 lo) {
+    _pieces++;
+
+    uint32 hi = lo + _width;
+
+    interval *c;
+
+#ifdef TEST_MERCOVERING
+    for (uint32 i=lo; i<hi; i++)
+      _test[i] = 1;
+#endif
+
+    //  Case:  No existing intervals, or the new interval lies entirely
+    //  below the first interval.  Make it the new head.
+    //
+    if ((_intervals == 0L) ||
+        (hi < _intervals->_lo)) {
+      _intervals = new interval(lo, hi, _intervals);
+      return;
+    }
+
+    c = _intervals;
+
+    while (c) {
+
+      //  Case:  New interval is completely contained in the current interval.
+      //
+      if ((c->_lo <= lo) && (hi <= c->_hi))
+        return;
+
+      //  Case:  New interval overlaps the low end of the current interval.
+      //
+      if ((lo <= c->_lo) && (hi <= c->_hi)) {
+        c->_lo = lo;
+        return;
+      }
+
+      if (c->_next) {
+
+        //  Case:  New interval overlaps the high end of the current interval...
+        //
+        if (lo <= c->_hi) {
+
+          if (hi < c->_next->_lo) {
+            //  but does not intersect the next interval.
+            //
+            c->_hi = hi;
+            return;
+          } else {
+            //  and does intersect the next interval.  Join the two.
+            //
+            interval *p = c->_next;
+
+            c->_hi   = c->_next->_hi;
+            c->_next = c->_next->_next;
+
+            delete p;
+            return;
+          }
+        } else {
+          //  Case:  New interval is between two existing intervals
+          //
+          //  (lo > c->_hi) is given
+          //
+          if (hi < c->_next->_lo) {
+            c->_next = new interval(lo, hi, c->_next);
+            return;
+          }
+        }
+      } else {
+        //  Case:  New interval overlaps the high end of the current interval
+        //
+        if (lo <= c->_hi) {
+          c->_hi = hi;
+          return;
+        } else {
+          //  Otherwise, we just fell off the end of all intervals.
+          //  Add one at the end.
+          //
+          c->_next = new interval(lo, hi, 0L);
+          return;
+        }
+      }
+
+      c = c->_next;
+    }
+
+#ifdef TEST_MERCOVERING
+    fprintf(stderr, "ERROR IN addInterval!\n");
+#endif
+  };
+
+#ifdef TEST_MERCOVERING
+  // Checks that the intervals cover exactly the positions recorded in
+  // _test by addMer(); exits on any difference.
+  //
+  void          test(void) {
+    for (uint32 i=0; i<TEST_SIZE; i++) {
+      if (_test[i])
+        _test[i] = 2;
+    }
+    for (interval *z=_intervals; z; z = z->_next) {
+      for (uint32 i=z->_lo; i<z->_hi; i++) {
+        if (_test[i] == 0) {
+          fprintf(stderr, "INTERVAL CONTAINS SOMETHING NOT IN ARRAY! (%d)\n", i);
+          exit(1);
+        }
+        if (_test[i] == 1) {
+          fprintf(stderr, "INTERVAL HIT SOMETHING TWICE! (%d)\n", i);
+          exit(1);
+        }
+        _test[i] = 1;
+      }
+    }
+    for (uint32 i=0; i<TEST_SIZE; i++) {
+      if (_test[i] == 2) {
+        fprintf(stderr, "ARRAY CONTAINED SOMETHING NOT IN INTERVAL! (%d)\n", i);
+        exit(1);
+      }
+    }
+  };
+#endif
+
+
+
+  // Incorporates the intervals in I into our list.  I is not modified.
+  // Our piece count becomes the sum of both piece counts.
+  //
+  // Either list may be empty.  (The merged list is built with an
+  // explicit head/tail pair; the tail is only dereferenced once a
+  // first node exists.)
+  //
+  void          merge(merCovering *I = 0L) {
+    if (I == 0L)
+      return;
+
+    interval  *A = _intervals;
+    interval  *B = I->_intervals;
+    interval  *N = 0L;   //  Head of the merged list
+    interval  *L = 0L;   //  Tail of the merged list
+
+    while (A || B) {
+      uint32 lo = 0;
+      uint32 hi = 0;
+
+      //  Start a new interval with whichever list begins first.  If
+      //  only one list has anything left, it is the only choice.
+      //
+      if ((B == 0L) || ((A != 0L) && (A->_lo <= B->_lo))) {
+        lo = A->_lo;
+        hi = A->_hi;
+        A  = A->_next;
+      } else {
+        lo = B->_lo;
+        hi = B->_hi;
+        B  = B->_next;
+      }
+
+      //  Absorb intervals from either list for as long as they touch
+      //  or overlap what we have so far.
+      //
+      bool  modified = true;
+
+      while (modified) {
+        modified = false;
+
+        if ((A) && (hi >= A->_lo)) {
+          if (hi < A->_hi)
+            hi = A->_hi;
+          A = A->_next;
+          modified = true;
+        }
+
+        if ((B) && (hi >= B->_lo)) {
+          if (hi < B->_hi)
+            hi = B->_hi;
+          B = B->_next;
+          modified = true;
+        }
+      }
+
+      //  OK, got the new interval.  Append it to the merged list.
+      //
+      interval *n = new interval(lo, hi, 0L);
+
+      if (L)
+        L->_next = n;
+      else
+        N = n;
+
+      L = n;
+    }
+
+    //  Save the number of mers in both coverings; clear() resets ours.
+    //
+    uint32 p = _pieces + I->_pieces;
+
+    clear();
+
+    _intervals = N;
+    _pieces    = p;
+  }
+
+
+
+
+#ifdef TEST_MERCOVERING
+  void          dump(void) {
+    for (interval *i=_intervals; i; i = i->_next)
+      fprintf(stderr, "%5d-%5d ", i->_lo, i->_hi);
+    fprintf(stderr, "\n");
+  };
+
+  // Exits unless B has the same piece count and the same intervals.
+  //
+  void          compare(merCovering *B) {
+    interval *i = _intervals;
+    interval *j = B->_intervals;
+
+    if (_pieces != B->_pieces) {
+      fprintf(stderr, "Pieces differ (this=%d that=%d).\n", _pieces, B->_pieces);
+      exit(1);
+    }
+
+    while (i && j) {
+      if ((i->_lo != j->_lo) || (i->_hi != j->_hi)) {
+        fprintf(stderr, "ERROR!\n");
+        exit(1);
+      }
+
+      i = i->_next;
+      j = j->_next;
+    }
+
+    if (i) {
+      fprintf(stderr, "ERROR (i still exists)!\n");
+      exit(1);
+    }
+
+    if (j) {
+      fprintf(stderr, "ERROR (j still exists)!\n");
+      exit(1);
+    }
+  };
+#endif
+
+};
+
+#endif // MER_COVERING_H
+
+
+
diff --git a/libbio/merList.H b/libbio/merList.H
new file mode 100644
index 0000000..948a2bf
--- /dev/null
+++ b/libbio/merList.H
@@ -0,0 +1,94 @@
+#ifndef MER_LIST_H
+#define MER_LIST_H
+
+// A simple list of mers. Implemented as a list of lists.
+
+// merList -- an append-only list of (qPos, gPos) pairs, stored as a
+// growable array of fixed size blocks.
+//
+// Blocks _ptrs[0] through _ptrs[_ptrsLen] are allocated at all times;
+// _ptrs[_ptrsLen] is the block currently being filled and _mersLen is
+// the number of entries used in it.
+//
+class merList {
+public:
+  merList() {
+    _ptrsMax  = 8;
+    _ptrsLen  = 0;
+    _ptrs     = new coord * [_ptrsMax];
+
+    _mersWid  = 12;
+    _mersMax  = uint32MASK(_mersWid);
+    _mersLen  = 0;
+    _ptrs[0]  = new coord [_mersMax+1];
+  };
+  ~merList() {
+    for (uint32 x=0; x<_ptrsLen+1; x++)
+      delete [] _ptrs[x];
+    delete [] _ptrs;
+  };
+
+  // Appends the pair (x, y), starting a new block (and growing the
+  // block pointer array) when the current block is full.
+  //
+  void    addMer(uint32 x, uint32 y) {
+    if (_mersLen > _mersMax) {
+      _ptrsLen++;
+
+      if (_ptrsLen >= _ptrsMax) {
+        _ptrsMax *= 2;
+        coord **p = new coord * [_ptrsMax];
+        memcpy(p, _ptrs, sizeof(coord*) * _ptrsLen);
+        delete [] _ptrs;
+        _ptrs = p;
+      }
+
+      _ptrs[_ptrsLen] = new coord [_mersMax+1];
+      _mersLen = 0;
+    }
+
+    _ptrs[_ptrsLen][_mersLen]._qPos = x;
+    _ptrs[_ptrsLen][_mersLen]._gPos = y;
+
+    _mersLen++;
+  };
+
+  // Fetches the i'th pair into (x, y).  Returns false, leaving x and y
+  // untouched, if there is no i'th pair.
+  //
+  bool    getMer(uint32 i, uint32 &x, uint32 &y) {
+    uint32 p = i >> _mersWid;   //  Which block
+    uint32 a = i &  _mersMax;   //  Which entry in the block
+
+    if ((p > _ptrsLen) || ((p == _ptrsLen) && (a >= _mersLen)))
+      return(false);
+
+    x = _ptrs[p][a]._qPos;
+    y = _ptrs[p][a]._gPos;
+
+    return(true);
+  };
+
+  // Empties the list.  Every block except the first is released --
+  // including the one currently being filled, _ptrs[_ptrsLen].
+  //
+  void    clear(void) {
+    //  Don't delete the first guy!  We write into it blindly!
+    for (uint32 x=1; x<=_ptrsLen; x++)
+      delete [] _ptrs[x];
+    _ptrsLen = 0;
+    _mersLen = 0;
+  };
+
+  // Appends every pair in ML to this list.  ML must not be this list;
+  // the loop would never reach the end of a list it is extending.
+  //
+  void    merge(merList *ML) {
+    uint32  i, x, y;
+    for (i=0; ML->getMer(i, x, y); i++)
+      addMer(x, y);
+  };
+
+private:
+  struct coord {
+    uint32   _qPos;
+    uint32   _gPos;
+  };
+
+  //  The number of mer blocks we have space for, and the current mer
+  //  block.
+  //
+  uint32   _ptrsMax;
+  uint32   _ptrsLen;
+  coord  **_ptrs;
+
+  //  The number of mers available in each block, and the current mer
+  //  we are at in the current block (for adding new mers).
+  //
+  uint32   _mersWid;
+  uint32   _mersMax;
+  uint32   _mersLen;
+};
+
+#endif // MER_LIST_H
diff --git a/libbio/mers.h b/libbio/mers.h
new file mode 100644
index 0000000..022144f
--- /dev/null
+++ b/libbio/mers.h
@@ -0,0 +1,63 @@
+#ifndef BIO_MERS_H
+#define BIO_MERS_H
+
+
+
+// Returns the reverse complement of the 'ms'-base mer stored in the low
+// 2*ms bits of 'fmer' (two bits per base).
+//
+// An empty mer (ms == 0) yields zero; without the guard the final
+// shift would be by 64 bits, which is undefined for a 64-bit value.
+//
+inline
+uint64
+reverseComplementMer(uint32 ms, uint64 fmer) {
+
+  if (ms == 0)
+    return(0);
+
+  //  The interested reader shall consult bri-bits.h
+
+  //  Reverse the mer: swap adjacent two-bit bases, then nibbles, bytes,
+  //  and so on, doubling the group size each step.
+  //
+  uint64 rmer = fmer;
+  rmer = ((rmer >>  2) & 0x3333333333333333llu) | ((rmer <<  2) & 0xccccccccccccccccllu);
+  rmer = ((rmer >>  4) & 0x0f0f0f0f0f0f0f0fllu) | ((rmer <<  4) & 0xf0f0f0f0f0f0f0f0llu);
+  rmer = ((rmer >>  8) & 0x00ff00ff00ff00ffllu) | ((rmer <<  8) & 0xff00ff00ff00ff00llu);
+  rmer = ((rmer >> 16) & 0x0000ffff0000ffffllu) | ((rmer << 16) & 0xffff0000ffff0000llu);
+  rmer = ((rmer >> 32) & 0x00000000ffffffffllu) | ((rmer << 32) & 0xffffffff00000000llu);
+
+  //  Complement the bases
+  //
+  rmer ^= 0xffffffffffffffffllu;
+
+  //  The reversed mer now sits at the top of the word.  Shift it down
+  //  and mask out the bases not in the mer.
+  //
+  rmer >>= 64 - ms * 2;
+  rmer  &= uint64MASK(ms * 2);
+  return(rmer);
+}
+
+
+// Used in seagen/encodedQuery.C (diagnostics) and libbio/kmerhuge.H
+// (in its merToString method).
+//
+// Decodes the 'ms'-base mer in 'mer' into 'str', last letter first
+// (it is in the low two bits), and returns 'str'.  'str' needs room
+// for ms letters plus the NUL.
+//
+inline
+char *
+uint64ToMerString(uint32 ms, uint64 mer, char *str) {
+  char  *out = str + ms;
+
+  *out = 0;
+
+  while (out != str) {
+    *--out = bitsToLetter[mer & 0x03];
+    mer >>= 2;
+  }
+
+  return(str);
+}
+
+
+#if 0
+#error this is not used anywhere
+// Packs the first 'ms' letters of 'str' into a mer, first letter in
+// the highest bits.  Compiled out.
+//
+inline
+uint64
+stringToMer(uint32 ms, char *str) {
+  uint64  mer = 0L;
+
+  for (char *p = str, *end = str + ms; p != end; p++)
+    mer = (mer << 2) | compressSymbol[*p];
+
+  return(mer);
+}
+#endif
+
+
+
+#endif // BIO_MERS_H
diff --git a/libbio/reversecomplement.c b/libbio/reversecomplement.c
new file mode 100644
index 0000000..737f9b5
--- /dev/null
+++ b/libbio/reversecomplement.c
@@ -0,0 +1,44 @@
+#include "bio.h"
+
+#include <string.h>
+
+// Inplace reverse-complement an ACGT sequence.  A pointer to the
+// string is returned.
+//
+// An empty sequence is returned untouched; this also avoids forming a
+// pointer to before the start of the buffer.
+//
+// NOTE(review): complementSymbol is indexed with a plain char.  If char
+// is signed and the sequence can hold bytes >= 0x80 that is a negative
+// index -- confirm against the table's declaration.
+//
+char *
+reverseComplementSequence(char *seq, uint32 seqlen) {
+  char    *s, *e;
+  char     t;
+  uint32   c;
+
+  if (seqlen == 0)
+    return(seq);
+
+  s = seq;
+  e = seq + seqlen - 1;
+  c = seqlen / 2;
+
+  //  Swap from both ends towards the middle, complementing as we go.
+  while (c--) {
+    t      = complementSymbol[*s];
+    *(s++) = complementSymbol[*e];
+    *(e--) = t;
+  }
+
+  //  Odd length: the middle letter stays put, but is still complemented.
+  if (s == e)
+    *s = complementSymbol[*s];
+
+  return(seq);
+}
+
+
+// Inplace reverse a string.  A pointer to the string is returned.
+//
+// An empty string is returned untouched; this also avoids forming a
+// pointer to before the start of the buffer.
+//
+char *
+reverseString(char *seq, uint32 seqlen) {
+  char    *s, *e;
+  char     t;
+  uint32   c;
+
+  if (seqlen == 0)
+    return(seq);
+
+  s = seq;
+  e = seq + seqlen - 1;
+  c = seqlen / 2;
+
+  //  Swap from both ends towards the middle.
+  while (c--) {
+    t      = *s;
+    *(s++) = *e;
+    *(e--) = t;
+  }
+
+  return(seq);
+}
diff --git a/libbio/test/Makefile b/libbio/test/Makefile
new file mode 100644
index 0000000..445ffdb
--- /dev/null
+++ b/libbio/test/Makefile
@@ -0,0 +1,98 @@
+
+
+# Builds the libbio test programs.  Most targets also run the program
+# they build, so 'make all' both compiles and executes the tests.
+
+#PROG = test-merstream-from-seqstore
+
+PROG = dump-merstreamfile \
+ test-seqStream \
+ test-chainedSequence \
+ test-fasta-accessor \
+ test-merstream \
+ test-merstreamfile \
+ test-setbits \
+ halign-test
+
+# Tests that have rules below but are not part of 'all'.
+DEAD = test-merstream-speed \
+ test-bigmer-msf
+
+
+INCLUDE = -I.. -I../../libutil
+LIBS = -L.. -L../../libutil -lbio -lutil -lm
+OBJS =
+
+include ../../Make.compilers
+
+# Neither target names a file it creates.
+.PHONY: all clean
+
+all: $(PROG)
+ @echo Tests passed!
+
+dump-merstreamfile: dump-merstreamfile.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o dump-merstreamfile.o dump-merstreamfile.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o dump-merstreamfile dump-merstreamfile.o $(LIBS)
+
+test-merstream-from-seqstore: test-merstream-from-seqstore.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-merstream-from-seqstore.o test-merstream-from-seqstore.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-merstream-from-seqstore test-merstream-from-seqstore.o $(LIBS)
+
+test-seqStream: test-seqStream.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-seqStream.o test-seqStream.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-seqStream test-seqStream.o $(LIBS)
+ ../../leaff/leaff -G 3 30 40 > junk2.fasta
+ ./test-seqStream junk2.fasta
+ rm -f junk*
+
+test-chainedSequence: test-chainedSequence.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-chainedSequence.o test-chainedSequence.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-chainedSequence test-chainedSequence.o $(LIBS)
+ ./test-chainedSequence
+ ../../leaff/leaff -G 1000 1000 3000 > junk2.fasta
+ ./test-chainedSequence junk2.fasta
+ rm -f junk*
+
+test-fasta-accessor: test-fasta-accessor.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-fasta-accessor.o test-fasta-accessor.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-fasta-accessor test-fasta-accessor.o $(LIBS)
+ ./test-fasta-accessor
+
+test-merstream: test-merstream.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-merstream.o test-merstream.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-merstream test-merstream.o $(LIBS)
+ ../../leaff/leaff -G 1 10000 30000 > junk.fasta
+ ./test-merstream junk.fasta
+ rm -f junk*
+ ../../leaff/leaff -G 1000 10000 30000 > junk.fasta
+ ./test-merstream junk.fasta
+ rm -f junk*
+
+# The generated junk.fasta is left behind here (the rm is commented
+# out); 'make clean' removes it.
+test-merstreamfile: test-merstreamfile.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-merstreamfile.o test-merstreamfile.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-merstreamfile test-merstreamfile.o $(LIBS)
+ #../../leaff/leaff -G 2 50 50 > junk.fasta
+ ../../leaff/leaff -G 100000 10 600 > junk.fasta
+ #../../leaff/leaff -G 10000 10 10000 > junk.fasta
+ #../../leaff/leaff -G 30000 10000 10000 > junk.fasta
+ ./test-merstreamfile junk.fasta
+ #rm -f junk.fasta junk.fastaidx
+
+test-merstream-speed: test-merstream-speed.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-merstream-speed.o test-merstream-speed.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-merstream-speed test-merstream-speed.o $(LIBS)
+ ../../leaff/leaff -G 10000 1000 10000 > junk.fasta
+ cat junk.fasta > /dev/null
+ ./test-merstream-speed junk.fasta
+ rm -f junk*
+
+test-setbits: test-setbits.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-setbits.o test-setbits.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-setbits test-setbits.o $(LIBS)
+ ./test-setbits
+
+test-bigmer-msf: test-bigmer-msf.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bigmer-msf.o test-bigmer-msf.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bigmer-msf test-bigmer-msf.o $(LIBS)
+ ./test-bigmer-msf
+
+halign-test: halign-test.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o halign-test.o halign-test.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o halign-test halign-test.o $(LIBS)
+
+clean:
+ rm -f $(PROG) *.o *junk*
diff --git a/libbio/test/halign-test.C b/libbio/test/halign-test.C
new file mode 100644
index 0000000..f7e380b
--- /dev/null
+++ b/libbio/test/halign-test.C
@@ -0,0 +1,48 @@
+#include "bio++.H"
+
+int
+main(int argc, char **argv) {
+
+ const char *s1 = "gattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctaaaattaaccgtcagaatatttatgcctgattcatggctgaaattgtgtttgaccagctatgtgtgtctcttaatccactcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcacgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctgaaattaaccatcagaatatttatgcctgattcatggctgaaatttcaggatgaaagctatgaaatctctatttgtgtttgtgtatctattaatgtatgttatgtatatgtgatattttcttaactcc [...]
+ const char *s2 = "gattcatggctgaaatcatgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctcttactccactcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcatgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctgaaattaaccatcagaatatttatgcctgattcatggctgaaatttcaggatgacagctatgaaatctctatttgtgtttgtatatctattaatgtatgttatgtatatgtgatattttcttaactcc [...]
+
+ s1 = "gATTCATGGCTgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctaaaattaaccgtcaGAATATTTATGCCTGATTCATGGCTgaaattgtgtttgaccagctatgtgtgtctcttaatccactcaagtagatgtctaaaattaaccatcaGAATATTTATGCCTGATTCATGGCTgaaatcacgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcaGAATATTTATGCCTGATTCATGGCTgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctgaaattaaccatcaGAAtatttatgcctgattcatggctgaaatttcaggatgaaagctatgaaatctctatttgtgtttgtgtatctattaatgtatgttatgtatatgtgatattttcttaactcc [...]
+ s2 = "gATTCATGGCTGAAATCATGTTTGACCAGCTATGTGTGTCTCTTAATCCAGTCAAGTAGATGTCTAAAATTAACCATCAGAATATTTATGCCTGATTCATGGCTGAAATCGTGTTTGACCAGCTATGTGTGTCTCTTACTCCACTCAAGTAGATGTCTAAAATTAACCATCAGAATATTTATGCCTGATTCATGGCTGAAATCATGTTTGACCAGCTATGTGTGTCTCTTAATCCAGTCAAGTAGATGTCTAAAATTAACCATCAGAATATTTATGCCTGATTCATGGCTGAAATCGTGTTTGACCAGCTATGTGTGTCTCTCAATCCGATCAAGTAGATGTCTGAAATTAACCATCAGAATATTTATGCCTGATTCATGGCTGAAATTTCAGGATGACAGCTATGAAATCTCTATTTGTGTTTGTATATCTATTAATGTATGTTATGTATATGTGATATTTTCTTAACTCC [...]
+
+ s1 = "gattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctaaaattaaccgtcagaatatttatgcctgattcatggctgaaattgtgtttgaccagctatgtgtgtctcttaatccactcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcacgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctgaaattaaccatcagaatatttatgcctgattcatggctgaaatttcaggatgaaagctatgaaatctctatttgtgtttgtgtatctattaatgtatgttatgtatatgtgatattttcttaactcc [...]
+ s2 = "gattcatggctgaaatcatgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctcttactccactcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcatgtttgaccagctatgtgtgtctcttaatccagtcaagtagatgtctaaaattaaccatcagaatatttatgcctgattcatggctgaaatcgtgtttgaccagctatgtgtgtctctcaatccgatcaagtagatgtctgaaattaaccatcagaatatttatgcctgattcatggctgaaatttcaggatgacagctatgaaatctctatttgtgtttgtatatctattaatgtatgttatgtatatgtgatattttcttaactcc [...]
+
+ char *a1 = new char [10240];
+ char *a2 = new char [10240];
+
+ halign(s1, s2,
+ strlen(s1), strlen(s2),
+ a1, a2);
+
+ int match = 0;
+ int mismatch = 0;
+ int gap = 0;
+
+ for (int i=0; a1[i]; i++) {
+ if ((a1[i] == '-') || (a2[i] == '-')) {
+ gap++;
+ a1[i] = toupper(a1[i]);
+ a2[i] = toupper(a2[i]);
+ } else if (a1[i] != a2[i]) {
+ mismatch++;
+ a1[i] = toupper(a1[i]);
+ a2[i] = toupper(a2[i]);
+ } else {
+ match++;
+ a1[i] = tolower(a1[i]);
+ a2[i] = tolower(a2[i]);
+ }
+ }
+
+ fprintf(stdout, "a1 = %s\n", a1);
+ fprintf(stdout, "a2 = %s\n", a2);
+
+ fprintf(stdout, "mismatch=%d gap=%d match=%d\n", mismatch, gap, match);
+}
+
+
diff --git a/libbio/test/test-bigmer-msf.C b/libbio/test/test-bigmer-msf.C
new file mode 100644
index 0000000..cb31fe3
--- /dev/null
+++ b/libbio/test/test-bigmer-msf.C
@@ -0,0 +1,222 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#include "bio++.H"
+
+// Build a merStreamFile using small mers, read it back using bigger mers.
+//
+// construct a fasta sequence:
+// small sequence
+// big sequence, multiple of mersize
+// small sequence
+// big sequence
+// etc
+// small sequence
+//
+// Then we reconstruct the sequences using mers. All three merstream
+// sources are tested (the character string source is not tested).
+// The merStreamFile is tested both forwards (nextMer(), via the
+// merstream interface) and backwards (setIterationStart()).
+
+#define BUILD_SIZE 88
+#define TEST_SIZE 403
+#define MERS_PER_SEQ 37
+#define TEST_ITERATIONS 300
+
+#define MSF_FILENAME "junk.bigmer"
+#define FASTA_FILENAME "junk.bigmer.fasta"
+
+// Construct a multi-fasta file, FASTA_FILENAME, alternating short and
+// long sequences, TEST_ITERATIONS of each.  Short sequences are less
+// than TEST_SIZE long (most, not all, longer than BUILD_SIZE).  Long
+// sequences are exactly TEST_SIZE * MERS_PER_SEQ long -- this lets us
+// dump the mers to reconstruct the sequence.
+//
+// Exits if the file cannot be opened for writing.
+//
+// NOTE(review): mtctx is not released here; whether the random number
+// library has a matching release function is not visible in this file.
+//
+void
+buildFastA(void) {
+  mt_s   *mtctx  = mtInit(time(0L));
+  char   *seq    = new char [TEST_SIZE * MERS_PER_SEQ + 1];
+  char    dna[4] = { 'A', 'C', 'G', 'T' };
+
+  FILE   *F      = fopen(FASTA_FILENAME, "w");
+
+  if (F == 0L) {
+    fprintf(stderr, "buildFastA()-- failed to open '%s' for writing.\n", FASTA_FILENAME);
+    exit(1);
+  }
+
+  for (uint32 i=0; i<TEST_ITERATIONS; i++) {
+    uint32 len;
+
+    //  A short sequence, 1 to TEST_SIZE-1 letters.
+    fprintf(F, ">" uint32FMT "short\n", i);
+    len = mtRandom32(mtctx) % (TEST_SIZE-1) + 1;
+    for (uint32 s=0; s<len; s++)
+      seq[s] = dna[ mtRandom32(mtctx) % 4 ];
+    seq[len] = 0;
+    fprintf(F, "%s\n", seq);
+
+    //  A long sequence, exactly MERS_PER_SEQ mers of TEST_SIZE letters.
+    fprintf(F, ">" uint32FMT "long\n", i);
+    len = TEST_SIZE * MERS_PER_SEQ;
+    for (uint32 s=0; s<len; s++)
+      seq[s] = dna[ mtRandom32(mtctx) % 4 ];
+    seq[len] = 0;
+    fprintf(F, "%s\n", seq);
+  }
+
+  fclose(F);
+
+  delete [] seq;
+}
+
+
+// Reads mers from one of three sources, chains them into a sequence,
+// and compares that against the matching long sequence in the fasta
+// file.  Exits with status 1 on the first mismatch.
+//
+//   style 0 -- merStreamFileReader wrapped in a merStream
+//   style 1 -- chainedSequence wrapped in a merStream
+//   style 2 -- merStreamFileReader used directly, seeking to each mer
+//              from last to first; also cross-checks the forward and
+//              reverse mers against reverseComplement()
+//
+// The fasta file holds (short, long) sequence pairs; both are loaded
+// each round but only the long one is compared.
+//
+void
+test1(uint32 style) {
+  seqCache     *fasta = new seqCache(FASTA_FILENAME);
+  seqInCore    *sseq  = fasta->getSequenceInCore();
+  seqInCore    *lseq  = fasta->getSequenceInCore();
+
+  char          mseq[TEST_SIZE * MERS_PER_SEQ + 1];
+
+  //  Construct a reader.
+
+  merStream            *MS = 0L;
+  merStreamFileReader  *RD = 0L;
+  chainedSequence      *CS = 0L;
+
+  switch (style) {
+    case 0:
+      fprintf(stderr, "test1(0)-- Testing merStreamFileReader -> merStream\n");
+      RD = new merStreamFileReader(MSF_FILENAME, TEST_SIZE);
+      MS = new merStream(RD);
+      break;
+    case 1:
+      fprintf(stderr, "test1(1)-- Testing chainedSequence -> merStream\n");
+      CS = new chainedSequence();
+      CS->setSource(FASTA_FILENAME);
+      CS->finish();
+      MS = new merStream(TEST_SIZE, CS);
+      break;
+    case 2:
+      fprintf(stderr, "test1(2)-- Testing merStreamFileReader (backwards)\n");
+      RD = new merStreamFileReader(MSF_FILENAME, TEST_SIZE);
+      break;
+    default:
+      break;
+  }
+
+
+  for (uint32 s=0; fasta->eof() == false; s++) {
+    for (uint32 i=0; i<TEST_SIZE * MERS_PER_SEQ + 1; i++)
+      mseq[i] = 0;
+
+    switch (style) {
+      case 0:
+      case 1:
+        //  Fill the sequence using non-overlapping mers, skipping
+        //  intermediate mers (there aren't intermediate mers if we're the
+        //  last mer in the sequence!)
+        //
+        MS->nextMer();
+        for (uint32 i=0; i<MERS_PER_SEQ; i++) {
+          MS->theFMer().merToString(mseq + i * TEST_SIZE);
+          if (i != MERS_PER_SEQ-1)
+            MS->nextMer(TEST_SIZE - 1);
+        }
+        break;
+      case 2:
+        //  Same thing, but read the mers backwards -- we could read
+        //  the sequences backwards, too, but that doesn't gain us
+        //  anything (we still seek to every location).
+        //
+        for (uint32 i=MERS_PER_SEQ; i--; ) {
+          char  copy[TEST_SIZE + 1];
+          RD->setIterationStart(s * (MERS_PER_SEQ * TEST_SIZE - TEST_SIZE + 1) + i * (TEST_SIZE));
+          RD->nextMer();
+          RD->theFMer().merToString(copy);
+
+          //  Copy the letters only; the terminator is written once,
+          //  after the loop.
+          strncpy(mseq + i * TEST_SIZE, copy, TEST_SIZE);
+
+          //  Aww, what the hell!  Test reverse complement stuff too!
+          //
+          kMer  f = RD->theFMer();
+          kMer  r = RD->theRMer();
+          f.reverseComplement();
+
+          if (f != r) {
+            char str[1025];
+            fprintf(stderr, "Reverse Complement mismatch:\n");
+            fprintf(stderr, "  reversed fwd = '%s'\n", f.merToString(str));
+            fprintf(stderr, "           rev = '%s'\n", r.merToString(str));
+            exit(1);
+          }
+
+          f = RD->theFMer();
+          r = RD->theRMer();
+          r.reverseComplement();
+
+          if (f != r) {
+            char str[1025];
+            fprintf(stderr, "Reverse Complement mismatch:\n");
+            fprintf(stderr, "           fwd = '%s'\n", f.merToString(str));
+            fprintf(stderr, "  reversed rev = '%s'\n", r.merToString(str));
+            exit(1);
+          }
+
+
+        }
+        mseq[MERS_PER_SEQ * TEST_SIZE] = 0;
+        break;
+      default:
+        break;
+    }
+
+    //  Compare our mer-constructed sequence to the long sequence in
+    //  the file
+    //
+    if (strcmp(mseq, lseq->sequence()) != 0) {
+      fprintf(stderr, "FAIL:  seq=" uint32FMT "\nmseq=%s\nlseq=%s\n", s, mseq, lseq->sequence());
+      exit(1);
+    }
+
+    delete sseq;
+    delete lseq;
+
+    sseq = fasta->getSequenceInCore();
+    lseq = fasta->getSequenceInCore();
+  }
+
+  delete sseq;
+  delete lseq;
+
+  delete CS;
+  delete RD;
+  delete MS;
+
+  //  The sequence cache was allocated above and is ours to release.
+  delete fasta;
+
+  fprintf(stderr, "  OK!\n");
+}
+
+
+
+
+// Builds the test fasta and its merStreamFile, runs all three styles
+// of test1(), then removes the files that were created.  The test is
+// skipped (exit status 0) when kMer is too small for TEST_SIZE.
+//
+int
+main(int argc, char **argv) {
+
+  //  Minimum KMER_WORDS is 13 -- mersizes up to 416 bases
+  if (!(KMER_WORDS >= 13)) {
+    fprintf(stderr, "I need at least KMER_WORDS == 13; test not run.\n");
+    exit(0);
+  }
+
+  buildFastA();
+
+  merStreamFileBuilder *builder = new merStreamFileBuilder(BUILD_SIZE, FASTA_FILENAME, MSF_FILENAME);
+  builder->build(true);
+  delete builder;
+
+  for (uint32 style=0; style<3; style++)
+    test1(style);
+
+  const char *junk[3] = { FASTA_FILENAME,
+                          FASTA_FILENAME "idx",
+                          MSF_FILENAME ".merStream" };
+
+  for (uint32 f=0; f<3; f++)
+    unlink(junk[f]);
+
+  exit(0);
+}
diff --git a/libbio/test/test-setbits.C b/libbio/test/test-setbits.C
new file mode 100644
index 0000000..1cb1990
--- /dev/null
+++ b/libbio/test/test-setbits.C
@@ -0,0 +1,28 @@
+#include "bio++.H"
+
+//g++ -o test-setbits test-setbits.C -I../libutil -I. -L../libutil -L. -lbio -lutil
+
+// Writes the value 0x535 at every bit offset 0..167 of a 96-base mer
+// and checks that it reads back unchanged.  Exits 1 on a decode error;
+// the test is skipped (exit status 0) when kMer is too small.
+//
+int
+main(int argc, char **argv) {
+
+  //  Check before constructing the mer: a 96-base mer is exactly what
+  //  this guard says we cannot hold.
+  //
+  if (KMER_WORDS < 3) {
+    fprintf(stderr, "I need at least KMER_WORDS == 3; test not run.\n");
+    exit(0);
+  }
+
+  kMer   x(96);
+  char   str[256];
+
+  for (uint32 i=0; i<168; i++) {
+    x.clear();
+
+    //  24 bits are written but only 16 are read back; 0x535 fits in
+    //  either width, so the comparison below is still exact.
+    //
+    x.setBits(i, 24, 0x535);
+    fprintf(stderr, uint32FMTW(3) " -- %s -- " uint64HEX "\n", i, x.merToString(str), x.getBits(i, 16));
+
+    if (x.getBits(i, 16) != 0x535) {
+      fprintf(stderr, "decode error.\n");
+      exit(1);
+    }
+  }
+
+  exit(0);
+}
diff --git a/libkmer/Make.include b/libkmer/Make.include
new file mode 100644
index 0000000..36ee1c5
--- /dev/null
+++ b/libkmer/Make.include
@@ -0,0 +1,35 @@
+# -*- makefile -*-
+
+# Build description for libkmer: the libkmer.a library and four
+# executables linked against it.
+#
+# NOTE(review): '$/' is presumably set by the including makefile to
+# this directory's path prefix -- confirm in Make.rules.
+
+# Resolved paths (with a trailing slash) of the sibling libraries.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+
+# Sources and headers of the library itself.  The .C files here become
+# the objects of libkmer.a (see the libkmer.a rule below).
+src := $/existDB-create-from-fasta.C \
+ $/existDB-create-from-meryl.C \
+ $/existDB-create-from-sequence.C \
+ $/existDB-state.C \
+ $/existDB.C \
+ $/existDB.H \
+ $/merTable.H \
+ $/positionDB-access.C \
+ $/positionDB-dump.C \
+ $/positionDB-file.C \
+ $/positionDB-mismatch.C \
+ $/positionDB-sort.C \
+ $/positionDB.C \
+ $/positionDB.H
+
+# Per-directory lists.  CXX_SRCS adds the four program sources, which
+# are not part of the library, to the library sources.
+$/.CXX_SRCS := $(filter %.C,${src}) $/driver-existDB.C $/driver-posDB.C $/percentCovered.C $/kmer-mask.C
+$/.CXX_INCS := $(filter %.H,${src})
+$/.CXX_EXES := $/existDB $/positionDB $/percentCovered $/kmer-mask
+$/.CXX_LIBS := $/libkmer.a
+$/.CLEAN := $/*.o
+
+# Prerequisites only; no recipes are given here.
+$/libkmer.a: $(filter %.o,${src:.C=.o})
+$/existDB: $/driver-existDB.o $/libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/positionDB: $/driver-posDB.o $/libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/percentCovered: $/percentCovered.o $/libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/kmer-mask: $/kmer-mask.o $/libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the sibling libraries' include paths to CXXFLAGS for this
+# directory's .d and .o targets.
+$(eval $/%.d $/%.o: CXXFLAGS+= -I${LIBMERYL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/libkmer/driver-existDB.C b/libkmer/driver-existDB.C
new file mode 100644
index 0000000..1ad7230
--- /dev/null
+++ b/libkmer/driver-existDB.C
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio++.H"
+#include "existDB.H"
+#include "libmeryl.H"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+// Driver for the existDB creation. Reads a sequence.fasta, builds
+// an existDB for the mers in the file, and then writes the internal
+// structures to disk.
+//
+// The existDB constructor is smart enough to read either a pre-built
+// image or a regular multi-fasta file.
+
+
+//  Round-trip test of existDB persistence.
+//
+//  Builds an existDB (with counts) from 'filename', saves it as
+//  '<prefix>.1', loads that image back, builds a second fresh table
+//  from 'filename', and then compares the existence and the count of
+//  every possible mer of size 'merSize' across the three tables.
+//
+//  Every disagreement is reported on stderr.  Returns 0 if the three
+//  tables agree on every mer, 1 otherwise.
+//
+int
+testFiles(char *filename, char *prefix, uint32 merSize) {
+  char    *prefixfilename = new char [strlen(prefix) + 32];
+  uint64   differences    = 0;
+
+  //  Create existDB e and save it to disk
+  //
+  existDB *e = new existDB(filename, merSize, existDBnoFlags | existDBcounts, 0, ~uint32ZERO);
+  sprintf(prefixfilename, "%s.1", prefix);
+  e->saveState(prefixfilename);
+
+  //  Create existDB f by loading the saved copy from disk
+  //
+  existDB *f = new existDB(prefixfilename);
+
+  //  Create a fresh existDB g (to check if we corrupt the original when saved)
+  //
+  existDB *g = new existDB(filename, merSize, existDBnoFlags | existDBcounts, 0, ~uint32ZERO);
+
+  speedCounter *C = new speedCounter("    %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, true);
+  fprintf(stderr, "Need to iterate over %7.2f Mmers.\n", (uint64MASK(2 * merSize) + 1) / 1000000.0);
+
+  //  Visit every mer value from the all-ones mask down to zero,
+  //  inclusive.  The do/while form tests both end points and cannot
+  //  wrap, even when the mask uses all 64 bits.
+  //
+  uint64 d = 0;
+  uint64 m = uint64MASK(2 * merSize);
+
+  do {
+    bool   ee = e->exists(m);
+    bool   ef = f->exists(m);
+    bool   eg = g->exists(m);
+
+    uint32 ce = e->count(m);
+    uint32 cf = f->count(m);
+    uint32 cg = g->count(m);
+
+    if ((ee != ef) || (ef != eg) || (ee != eg)) {
+      fprintf(stderr, "mer "uint64HEX" not found : e=%d  f=%d  g=%d\n", m, ee, ef, eg);
+      differences++;
+    }
+
+    if ((ce != cf) || (cf != cg) || (ce != cg)) {
+      fprintf(stderr, "mer "uint64HEX" count differs : e=%u  f=%u  g=%u (exists=%d)\n", m, ce, cf, cg, ee);
+      differences++;
+    }
+
+    if ((m & 0xffffff) == 0) {
+      //  Been a while since a report, so report.
+      d = 1;
+    }
+
+    if ((ce > 1) && (d == 1)) {
+      //  Report anything not unique, to make sure that we're testing real counts and not just existence.
+      fprintf(stderr, "mer "uint64HEX" : e=%u  f=%u  g=%u (exists=%d)\n", m, ce, cf, cg, ee);
+      d = 0;
+    }
+
+    C->tick();
+  } while (m-- != 0);
+
+  delete    e;
+  delete    f;
+  delete    g;
+  delete    C;
+  delete [] prefixfilename;
+
+  return((differences > 0) ? 1 : 0);
+}
+
+
+//  Builds an existDB from 'filename', then streams the same file again
+//  and looks up the forward mer at every position.  Returns 0 when every
+//  streamed mer is present in the table; otherwise reports the tallies
+//  on stderr and returns 1.
+//
+int
+testExistence(char *filename, uint32 merSize) {
+  existDB    *table  = new existDB(filename, merSize, existDBnoFlags, 0, ~uint32ZERO);
+  merStream  *stream = new merStream(new kMerBuilder(merSize), new seqStream(filename), true, true);
+
+  uint64      numChecked = 0;
+  uint64      numMissing = 0;
+
+  for (; stream->nextMer(); numChecked++)
+    if (table->exists(stream->theFMer()) == false)
+      numMissing++;
+
+  delete stream;
+  delete table;
+
+  //  Nothing missing means success.
+  if (numMissing == 0)
+    return(0);
+
+  fprintf(stderr, "Tried "uint64FMT", didn't find "uint64FMT" merStream mers in the existDB.\n", numChecked, numMissing);
+  return(1);
+}
+
+
+
+//  Exhaustive cross-check of an existDB against a meryl database.
+//
+//  Pass 1 walks the meryl database 'merylname' and counts how many of
+//  its mers exist in an existDB built from 'filename' (writing each mer,
+//  and any miss, to testExhaustive.ms.dump).  Pass 2 probes the existDB
+//  with every possible mer of size 'merSize' and counts the hits
+//  (written to testExhaustive.ck.dump).
+//
+//  Returns 0 if both counts agree, 1 if they differ or if a dump file
+//  cannot be opened.
+//
+int
+testExhaustive(char *filename, char *merylname, uint32 merSize) {
+  existDB           *E        = new existDB(filename, merSize, existDBnoFlags, 0, ~uint32ZERO);
+  merylStreamReader *M        = new merylStreamReader(merylname);
+  speedCounter      *C        = new speedCounter("    %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, true);
+  uint64             found    = uint64ZERO;
+  uint64             expected = uint64ZERO;
+
+  FILE              *DUMP     = 0L;
+
+  errno = 0;
+  DUMP  = fopen("testExhaustive.ms.dump", "w");
+
+  if (DUMP == 0L) {
+    fprintf(stderr, "Failed to open 'testExhaustive.ms.dump' for writing: %s\n", strerror(errno));
+    delete C;
+    delete E;
+    delete M;
+    return(1);
+  }
+
+  while (M->nextMer()) {
+    if (E->exists(M->theFMer())) {
+      expected++;
+      fprintf(DUMP, uint64HEX"\n", (uint64)M->theFMer());
+    } else {
+      fprintf(DUMP, uint64HEX" MISSED!\n", (uint64)M->theFMer());
+    }
+  }
+
+  fclose(DUMP);
+
+  fprintf(stderr, "Found "uint64FMT" mers in the meryl database.\n", expected);
+  fprintf(stderr, "Need to iterate over %7.2f Mmers.\n", (uint64MASK(2 * merSize) + 1) / 1000000.0);
+
+  errno = 0;
+  DUMP  = fopen("testExhaustive.ck.dump", "w");
+
+  if (DUMP == 0L) {
+    fprintf(stderr, "Failed to open 'testExhaustive.ck.dump' for writing: %s\n", strerror(errno));
+    delete C;
+    delete E;
+    delete M;
+    return(1);
+  }
+
+  //  Probe every mer value from the all-ones mask down to zero,
+  //  inclusive.  Skipping the all-ones mer would make 'found' fall
+  //  short of 'expected' whenever that mer is in the input.
+  //
+  uint64 m = uint64MASK(2 * merSize);
+
+  do {
+    if (E->exists(m)) {
+      found++;
+      fprintf(DUMP, uint64HEX"\n", m);
+    }
+    C->tick();
+  } while (m-- != 0);
+
+  fclose(DUMP);
+
+  delete C;
+  delete E;
+  delete M;
+
+  if (expected != found) {
+    fprintf(stderr, "Expected to find "uint64FMT" mers, but found "uint64FMT" instead.\n",
+            expected, found);
+    return(1);
+  } else {
+    return(0);
+  }
+}
+
+
+const char *usage = // Help text for this driver; a printf-style format whose single %s receives the program name (main() passes argv[0]).
+"usage: %s [stuff]\n"
+" -mersize mersize\n"
+" -- Use the specified mersize when building existDB tables.\n"
+"\n"
+" -build some.fasta prefix\n"
+" -- Build an existDB on all mers in some.fasta and save\n"
+" the tables into prefix.\n"
+"\n"
+" -describe prefix\n"
+" -- Reports the state of some existDB file.\n"
+"\n"
+" -testfiles some.fasta prefix\n"
+" -- Build an existDB table from some.fasta. Write that table to disk.\n"
+" Load the table back. Compare that each mer in some.fasta is present\n"
+" in all three existDB tables created earlier.\n"
+"\n"
+" -testexistence some.fasta\n"
+" -- Build an existDB table from some.fasta, check that every\n"
+" mer in some.fasta can be found in the table. Does not\n"
+" guarantee that every mer in the table is found in the file.\n"
+"\n"
+" -testexhaustive some.fasta some.meryl\n"
+" -- Build an existDB table from some.fasta, check _EVERY_ mer\n"
+" for existance. Complain if a mer exists in the table but\n"
+" not in the meryl database. Assumes 'some.meryl' is the\n"
+" mercount of some.fasta.\n"
+"\n";
+
+//  Exit with a diagnostic unless the option at argv[arg] is followed
+//  by at least 'needed' more words on the command line.
+//
+static
+void
+requireOptionValues(int argc, char **argv, int arg, int needed) {
+  if (arg + needed < argc)
+    return;
+
+  fprintf(stderr, "ERROR: option '%s' needs %d more argument%s.\n",
+          argv[arg], needed, (needed == 1) ? "" : "s");
+  exit(1);
+}
+
+
+//  Command line driver for existDB.  Options are processed left to
+//  right; -mersize only records a value, every other recognized option
+//  runs its action and exits with that action's status.  Options are
+//  matched on a prefix only (two characters, or eight for the -test*
+//  family), and unrecognized words are silently skipped.
+//
+int
+main(int argc, char **argv) {
+  uint32    mersize = 20;
+
+  if (argc < 3) {
+    fprintf(stderr, usage, argv[0]);
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-mersize", 2) == 0) {
+      requireOptionValues(argc, argv, arg, 1);
+      arg++;
+
+      //  strtoul, unlike atoi, lets us reject words that are not a
+      //  complete positive number.
+      char          *end   = 0L;
+      unsigned long  value = strtoul(argv[arg], &end, 10);
+
+      if ((end == argv[arg]) || (*end != 0) || (value == 0)) {
+        fprintf(stderr, "ERROR: invalid mersize '%s'.\n", argv[arg]);
+        exit(1);
+      }
+
+      mersize = (uint32)value;
+
+    } else if (strncmp(argv[arg], "-describe", 2) == 0) {
+      requireOptionValues(argc, argv, arg, 1);
+
+      //  The image to describe is always the last word.
+      existDB *e = new existDB(argv[argc-1], false);
+      e->printState(stdout);
+      delete e;
+      exit(0);
+
+    } else if (strncmp(argv[arg], "-testfiles", 8) == 0) {
+      requireOptionValues(argc, argv, arg, 2);
+      exit(testFiles(argv[arg+1], argv[arg+2], mersize));
+
+    } else if (strncmp(argv[arg], "-testexistence", 8) == 0) {
+      requireOptionValues(argc, argv, arg, 1);
+      exit(testExistence(argv[arg+1], mersize));
+
+    } else if (strncmp(argv[arg], "-testexhaustive", 8) == 0) {
+      requireOptionValues(argc, argv, arg, 2);
+      exit(testExhaustive(argv[arg+1], argv[arg+2], mersize));
+
+    } else if (strncmp(argv[arg], "-build", 2) == 0) {
+      requireOptionValues(argc, argv, arg, 2);
+
+      //  Input and output are always the last two words.
+      existDB  *e = new existDB(argv[argc-2], mersize, existDBnoFlags, 0, ~uint32ZERO);
+      e->saveState(argv[argc-1]);
+      delete e;
+      exit(0);
+    }
+
+    arg++;
+  }
+
+  exit(0);
+}
diff --git a/libkmer/driver-posDB.C b/libkmer/driver-posDB.C
new file mode 100644
index 0000000..b78cb91
--- /dev/null
+++ b/libkmer/driver-posDB.C
@@ -0,0 +1,287 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio++.H"
+#include "existDB.H"
+#include "positionDB.H"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+// Driver for the positionDB creation. Reads a sequence.fasta, builds
+// a positionDB for the mers in the file, and then writes the internal
+// structures to disk.
+//
+// The positionDB constructor is smart enough to read either a pre-built
+// image or a regular multi-fasta file.
+
+
+#define MERSIZE 20
+
+
+//  Builds a positionDB from 'filename' using MERSIZE-mers, rewinds the
+//  mer stream, and verifies that the table reports each forward mer at
+//  exactly the stream position where it occurs.
+//
+//  A mer with no table entry, or whose stream position is not listed
+//  exactly once, is reported on stdout.  Returns 0 if no mer failed,
+//  1 otherwise.
+//
+int
+test1(char *filename) {
+  merStream         *T       = new merStream(new kMerBuilder(MERSIZE), new seqStream(filename), true, true);
+  positionDB        *M       = new positionDB(T, MERSIZE, 0, 0L, 0L, 0L, 0, 0, 0, 0, true);
+  uint64            *posn    = new uint64 [1024];
+  uint64             posnMax = 1024;
+  uint64             posnLen = uint64ZERO;
+  uint64             count   = uint64ZERO;
+  uint32             matches = uint32ZERO;
+  uint32             failed  = uint32ZERO;
+  char               str[33];
+
+  T->rewind();
+
+  while (T->nextMer()) {
+    if (M->getExact(T->theFMer(),
+                    posn,
+                    posnMax,
+                    posnLen,
+                    count)) {
+
+      //  Count how many of the returned positions equal our position.
+      matches = uint32ZERO;
+      for (uint64 i=0; i<posnLen; i++)
+        if (posn[i] == T->thePositionInStream())
+          matches++;
+
+      if (matches != 1) {
+        failed++;
+
+        fprintf(stdout, "%s @ "uint64FMT"/"uint64FMT": Found "uint64FMT" table entries, and "uint32FMT" matching positions (",
+                T->theFMer().merToString(str), T->theSequenceNumber(), T->thePositionInStream(), posnLen, matches);
+
+        //  Space separated list; the line is always closed, even if
+        //  the table returned no positions at all.
+        for (uint64 i=0; i<posnLen; i++) {
+          if (i > 0)
+            fprintf(stdout, " ");
+          fprintf(stdout, uint64FMT, posn[i]);
+        }
+        fprintf(stdout, ")\n");
+      }
+    } else {
+      failed++;
+
+      fprintf(stdout, "Found no matches for mer=%s at pos="uint64FMT"\n",
+              T->theFMer().merToString(str), T->thePositionInStream());
+    }
+  }
+
+  delete [] posn;
+  delete    M;
+  delete    T;
+
+  return(failed != 0);
+}
+
+
+
+//  Builds a positionDB from 'filename' using MERSIZE-mers, then streams
+//  the mers of 'query' and prints, on stdout, every forward or reverse
+//  mer for which the table has an entry.  Always returns 0.
+//
+int
+test2(char *filename, char *query) {
+  merStream         *T       = new merStream(new kMerBuilder(MERSIZE), new seqStream(filename), true, true);
+  positionDB        *M       = new positionDB(T, MERSIZE, 0, 0L, 0L, 0L, 0, 0, 0, 0, true);
+  uint64            *posn    = new uint64 [1024];
+  uint64             posnMax = 1024;
+  uint64             posnLen = uint64ZERO;
+  uint64             count   = uint64ZERO;
+  char               str[33];
+
+  //  The table is built; swap the database stream for the query stream.
+  delete T;
+
+  T = new merStream(new kMerBuilder(MERSIZE), new seqStream(query), true, true);
+
+  while (T->nextMer()) {
+    if (M->getExact(T->theFMer(),
+                    posn,
+                    posnMax,
+                    posnLen,
+                    count)) {
+      fprintf(stdout, "Got a F match for mer=%s at "uint64FMT"/"uint64FMT" (in mers), numMatches="uint64FMT"\n",
+              T->theFMer().merToString(str), T->theSequenceNumber(), T->thePositionInStream(), posnLen);
+    }
+
+    if (M->getExact(T->theRMer(),
+                    posn,
+                    posnMax,
+                    posnLen,
+                    count)) {
+      fprintf(stdout, "Got a R match for mer=%s at "uint64FMT"/"uint64FMT" (in mers), numMatches="uint64FMT"\n",
+              T->theRMer().merToString(str), T->theSequenceNumber(), T->thePositionInStream(), posnLen);
+    }
+  }
+
+  delete [] posn;
+  delete    M;
+  delete    T;
+
+  return(0);
+}
+
+
+
+// Builds a positionDB possibly using a subset of the file.
+//
+// Subset on entire sequences:
+// -use x-y,a,b
+//
+// Subset on a range of mers, in this case, use only the 1000th
+// through 1999th (inclusive) mer:
+// -merbegin 1000 -merend 2000
+//
+// Or do both, use the first 1000 mers from the 3rd sequence:
+// -use 3 -merbegin 0 -merend 1000
+
+
+
+//  Exit with a diagnostic unless the option at argv[arg] is followed
+//  by at least 'needed' more words on the command line.
+//
+static
+void
+needOptionValues(int argc, char **argv, int arg, int needed) {
+  if (arg + needed < argc)
+    return;
+
+  fprintf(stderr, "ERROR: option '%s' needs %d more argument%s.\n",
+          argv[arg], needed, (needed == 1) ? "" : "s");
+  exit(1);
+}
+
+
+//  Command line driver for positionDB.
+//
+//  Parses the build options, then builds a positionDB over the mers of
+//  the -sequence file (optionally restricted with -merbegin / -merend,
+//  -mask and -only) and saves it to the -output file.  The -dump,
+//  -test1 and -test2 options run immediately and exit.  Exits 0 without
+//  doing any work if the output file already exists.
+//
+int
+main(int argc, char **argv) {
+  uint32           mersize = 20;
+  uint32           merskip = 0;
+
+  char            *maskF = 0L;
+  char            *onlyF = 0L;
+
+  uint64           merBegin = ~uint64ZERO;
+  uint64           merEnd   = ~uint64ZERO;
+
+  char            *sequenceFile = 0L;
+  char            *outputFile   = 0L;
+
+  if (argc < 3) {
+    fprintf(stderr, "usage: %s [args]\n", argv[0]);
+    fprintf(stderr, " -mersize k The size of the mers, default=20.\n");
+    fprintf(stderr, " -merskip k The skip between mers, default=0\n");
+    fprintf(stderr, " -use a-b,c Specify which sequences to use, default=all\n");
+    fprintf(stderr, " -merbegin b Build on a subset of the mers, starting at mer #b, default=all mers\n");
+    fprintf(stderr, " -merend e Build on a subset of the mers, ending at mer #e, default=all mers\n");
+    fprintf(stderr, " -sequence s.fasta Input sequences.\n");
+    fprintf(stderr, " -output p.posDB Output filename.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " To dump information about an image:\n");
+    fprintf(stderr, " -dump datafile\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " To run sanity tests:\n");
+    fprintf(stderr, " -buildonly [build opts] sequence.fasta\n");
+    fprintf(stderr, " -- just builds a table and exits\n");
+    fprintf(stderr, " -existence [build opts] sequence.fasta\n");
+    fprintf(stderr, " -- builds (or reads) a table reports if any mers\n");
+    fprintf(stderr, " in sequence.fasta cannot be found\n");
+    fprintf(stderr, " -extra [build opts] sequence.fasta\n");
+    fprintf(stderr, " -- builds (or reads) a table reports if any mers\n");
+    fprintf(stderr, " NOT in sequence.fasta are be found\n");
+    fprintf(stderr, " -test1 sequence.fasta\n");
+    fprintf(stderr, " -- Tests if each and every mer is found in the\n");
+    fprintf(stderr, " positionDB. Reports if it doesn't find a mer\n");
+    fprintf(stderr, " at the correct position. Doesn't report if table\n");
+    fprintf(stderr, " has too much stuff.\n");
+    fprintf(stderr, " -test2 db.fasta sequence.fasta\n");
+    fprintf(stderr, " -- Builds a positionDB from db.fasta, then searches\n");
+    fprintf(stderr, " the table for each mer in sequence.fasta. Reports\n");
+    fprintf(stderr, " all mers it finds.\n");
+    fprintf(stderr, " -- This is a silly test and you shouldn't do it.\n");
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-mersize") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      mersize = strtouint32(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-merskip") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      merskip = strtouint32(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-mask") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      maskF = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-only") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      onlyF = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-merbegin") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      merBegin = strtouint64(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-merend") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      merEnd = strtouint64(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-sequence") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      sequenceFile = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-output") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      outputFile = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-dump") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+
+      positionDB *e = new positionDB(argv[++arg], 0, 0, 0, false);
+      e->printState(stdout);
+      delete e;
+      exit(0);
+
+    } else if (strcmp(argv[arg], "-test1") == 0) {
+      needOptionValues(argc, argv, arg, 1);
+      exit(test1(argv[arg+1]));
+
+    } else if (strcmp(argv[arg], "-test2") == 0) {
+      needOptionValues(argc, argv, arg, 2);
+      exit(test2(argv[arg+1], argv[arg+2]));
+
+    } else {
+      fprintf(stderr, "ERROR: unknown arg '%s'\n", argv[arg]);
+      exit(1);
+    }
+
+    arg++;
+  }
+
+  //  A build needs both file names; without them the code below would
+  //  be handed null pointers.
+  //
+  if ((sequenceFile == 0L) || (outputFile == 0L)) {
+    fprintf(stderr, "ERROR: both -sequence and -output must be supplied.\n");
+    exit(1);
+  }
+
+  //  Exit quickly if the output file exists.
+  //
+  if (fileExists(outputFile)) {
+    fprintf(stderr, "Output file '%s' exists already!\n", outputFile);
+    exit(0);
+  }
+
+
+  //  The stream must produce mers of the size the table is built for,
+  //  so use the requested mersize here, not the MERSIZE test default.
+  //
+  merStream *MS = new merStream(new kMerBuilder(mersize),
+                                new seqStream(sequenceFile),
+                                true, true);
+
+  //  Approximate the number of mers in the sequences.
+  //
+  uint64     numMers = MS->approximateNumberOfMers();
+
+  //  Reset the limits.
+  //
+  //  XXX: If the user somehow knows how many mers are in the input
+  //  file, and specifies an end between there and the amount of
+  //  sequence, we'll pointlessly still make a merStreamFile, even
+  //  though we shouldn't.
+  //
+  if (merBegin == ~uint64ZERO)   merBegin = 0;
+  if (merEnd   == ~uint64ZERO)   merEnd   = numMers;
+
+  if (merBegin >= merEnd) {
+    fprintf(stderr, "ERROR: merbegin="uint64FMT" and merend="uint64FMT" are incompatible.\n",
+            merBegin, merEnd);
+    exit(1);
+  }
+
+  if ((merBegin > 0) || (merEnd < numMers))
+    MS->setBaseRange(merBegin, merEnd);
+
+  existDB *maskDB = 0L;
+  if (maskF) {
+    fprintf(stderr, "Building maskDB from '%s'\n", maskF);
+    maskDB = new existDB(maskF, mersize, existDBnoFlags, 0, ~uint32ZERO);
+  }
+
+  existDB *onlyDB = 0L;
+  if (onlyF) {
+    fprintf(stderr, "Building onlyDB from '%s'\n", onlyF);
+    onlyDB = new existDB(onlyF, mersize, existDBnoFlags, 0, ~uint32ZERO);
+  }
+
+  fprintf(stderr, "Building table with merSize "uint32FMT", merSkip "uint32FMT"\n", mersize, merskip);
+
+  positionDB *positions = new positionDB(MS, mersize, merskip, maskDB, onlyDB, 0L, 0, 0, 0, 0, true);
+
+  fprintf(stderr, "Dumping positions table to '%s'\n", outputFile);
+
+  positions->saveState(outputFile);
+
+  delete MS;
+  delete positions;
+
+  exit(0);
+}
diff --git a/libkmer/existDB-create-from-fasta.C b/libkmer/existDB-create-from-fasta.C
new file mode 100644
index 0000000..6d2def4
--- /dev/null
+++ b/libkmer/existDB-create-from-fasta.C
@@ -0,0 +1,271 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "existDB.H"
+#include "bio++.H"
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+//  Builds the existDB tables from the mers of size 'merSize' found in
+//  the sequence file 'filename'.
+//
+//  flags:  exactly one of existDBcanonical / existDBforward must be set
+//          (asserted); existDBcounts additionally allocates the counts
+//          array.  Any of the existDBcompress* flags is fatal.
+//
+//  The number of table bits is first estimated from the size of the
+//  file.  If, after the build, that estimate turns out to be larger
+//  than what the bucket position needs, the tables are discarded and
+//  rebuilt once with the smaller size.
+//
+//  Always returns true; exits the process on unsupported flags.
+//
+bool
+existDB::createFromFastA(char const  *filename,
+                         uint32       merSize,
+                         uint32       flags) {
+
+  bool               beVerbose  = false;
+  bool               rebuilding = false;
+
+  _hashTable = 0L;
+  _buckets   = 0L;
+  _counts    = 0L;
+
+  _merSizeInBases        = merSize;
+
+  _searchForDupe = true;
+
+  if ((flags & existDBcompressHash) ||
+      (flags & existDBcompressBuckets) ||
+      (flags & existDBcompressCounts))
+    fprintf(stderr, "existDB::createFromFastA: compression not supported.\n"), exit(1);
+
+  //  This (at =22) eats up 16MB, and should allow a lot of mers at big sizes.  Unfortunately, we
+  //  know nothing about how many mers are going to be in the input.
+  //
+  //  Setting this too high drastically reduces performance, suspected because of cache misses.
+  //  Setting this too low will also reduce performance, by increasing the search time in a bucket.
+  //
+  uint32             tblBits = logBaseTwo64(sizeOfFile(filename));
+
+ rebuild:
+  //  Everything from here down depends on tblBits, and is redone if
+  //  we decide to rebuild.
+  //
+  _shift1                = 2 * _merSizeInBases - tblBits;
+  _shift2                = _shift1 / 2;
+  _mask1                 = uint64MASK(tblBits);
+  _mask2                 = uint64MASK(_shift1);
+
+  _hshWidth              = uint32ZERO;
+  _chkWidth              = 2 * merSize - tblBits;
+  _cntWidth              = 16;
+
+  uint64  tableSizeInEntries = uint64ONE << tblBits;
+  uint64  numberOfMers       = uint64ZERO;
+  uint64 *countingTable      = new uint64 [tableSizeInEntries + 1];
+
+  for (uint64 i=tableSizeInEntries+1; i--; )
+    countingTable[i] = 0;
+
+  _isCanonical = flags & existDBcanonical;
+  _isForward   = flags & existDBforward;
+
+  assert(_isCanonical + _isForward == 1);
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //
+  //  1)  Count bucket sizes
+  //
+  merStream    *M = new merStream(new kMerBuilder(_merSizeInBases),
+                                  new seqStream(filename),
+                                  true, true);
+
+  while (M->nextMer()) {
+    if (_isForward) {
+      countingTable[ HASH(M->theFMer()) ]++;
+      numberOfMers++;
+    }
+
+    if (_isCanonical) {
+      countingTable[ HASH(M->theCMer()) ]++;
+      numberOfMers++;
+    }
+  }
+
+  delete M;
+
+#ifdef STATS
+  //  Histogram of bucket sizes (sizes below 32 only) and the largest bucket.
+  uint64  dist[32] = {0};
+  uint64  maxcnt = 0;
+  for (uint64 i=tableSizeInEntries+1; i--; ) {
+    if (countingTable[i] > maxcnt)
+      maxcnt = countingTable[i];
+
+    if (countingTable[i] < 32)
+      dist[countingTable[i]]++;
+  }
+
+  for(uint64 i=0; i<32; i++)
+    fprintf(stderr, "existDB::usage[%2d] = "uint64FMT"\n", (int)i, dist[i]);
+  fprintf(stderr, "existDB::maxcnt    = "uint64FMT"\n", maxcnt);
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //
+  //  Determine how many bits we need to hold the value
+  //  numberOfMers.....then....
+  //
+  //  This is numberOfMers+1 because we need to store the
+  //  first position after the last mer.  That is, if there are two
+  //  mers, we will store that the first mer is at position 0, the
+  //  second mer is at position 1, and the end of the second mer is at
+  //  position 2.
+  //
+  if (_compressedHash) {
+    _hshWidth = 1;
+    while ((numberOfMers+1) > (uint64ONE << _hshWidth))
+      _hshWidth++;
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //
+  //  2)  Allocate a hash table and some mer storage buckets.
+  //
+  _hashTableWords = tableSizeInEntries + 2;
+  if (_compressedHash)
+    _hashTableWords = _hashTableWords * _hshWidth / 64 + 1;
+
+  _bucketsWords = numberOfMers + 2;
+  if (_compressedBucket)
+    _bucketsWords = _bucketsWords * _chkWidth / 64 + 1;
+
+  _countsWords = numberOfMers + 2;
+  if (_compressedCounts)
+    _countsWords = _countsWords * _cntWidth / 64 + 1;
+
+  if (beVerbose) {
+    fprintf(stderr, "existDB::createFromFastA()-- hashTable is "uint64FMT"MB\n", _hashTableWords >> 17);
+    fprintf(stderr, "existDB::createFromFastA()-- buckets is "uint64FMT"MB\n", _bucketsWords >> 17);
+    if (flags & existDBcounts)
+      fprintf(stderr, "existDB::createFromFastA()-- counts is "uint64FMT"MB\n", _countsWords >> 17);
+  }
+
+  _hashTable   = new uint64 [_hashTableWords];
+  _buckets     = new uint64 [_bucketsWords];
+  _countsWords = (flags & existDBcounts) ?             _countsWords  : 0;
+  _counts      = (flags & existDBcounts) ? new uint64 [_countsWords] : 0L;
+
+  //  These aren't strictly needed.  _buckets is cleared as it is initialized.  _hashTable
+  //  is also cleared as it is initialized, but in the _compressedHash case, the last
+  //  few words might be uninitialized.  They're unused.
+  //
+  //memset(_hashTable, 0, sizeof(uint64) * _hashTableWords);
+  //memset(_buckets,   0, sizeof(uint64) * _bucketsWords);  //  buckets is cleared as it is built
+  //memset(_counts,    0, sizeof(uint64) * _countsWords);
+
+  _hashTable[_hashTableWords-1] = 0;
+  _hashTable[_hashTableWords-2] = 0;
+  _hashTable[_hashTableWords-3] = 0;
+  _hashTable[_hashTableWords-4] = 0;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //
+  //  Make the hash table point to the start of the bucket, and reset
+  //  the counting table -- we're going to use it to fill the buckets.
+  //
+  uint64  tmpPosition = 0;
+  uint64  begPosition = 0;
+  uint64  ptr         = 0;
+
+  if (_compressedHash) {
+    for (uint64 i=0; i<tableSizeInEntries; i++) {
+      tmpPosition    = countingTable[i];
+      countingTable[i] = begPosition;
+
+      setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+      ptr         += _hshWidth;
+
+      begPosition += tmpPosition;
+    }
+
+    setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+  } else {
+    for (uint64 i=0; i<tableSizeInEntries; i++) {
+      tmpPosition    = countingTable[i];
+      countingTable[i] = begPosition;
+
+      _hashTable[i] = begPosition;
+
+      begPosition += tmpPosition;
+    }
+
+    //  Set the last position in the hash, but we don't care about
+    //  the temporary counting table.
+    //
+    _hashTable[tableSizeInEntries] = begPosition;
+  }
+
+
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //
+  //  3)  Build list of mers, placed into buckets
+  //
+  M = new merStream(new kMerBuilder(_merSizeInBases),
+                    new seqStream(filename),
+                    true, true);
+
+  while (M->nextMer()) {
+    if (_isForward)
+      insertMer(HASH(M->theFMer()), CHECK(M->theFMer()), 1, countingTable);
+
+    if (_isCanonical)
+      insertMer(HASH(M->theCMer()), CHECK(M->theCMer()), 1, countingTable);
+  }
+
+  delete M;
+
+  //  Compress out the gaps we have from redundant kmers.  After the
+  //  inserts, countingTable[i] is used as the end of the filled part of
+  //  bucket i; slide each filled part down so the buckets are adjacent.
+
+  uint64  pos = 0;
+  uint64  frm = 0;
+  uint64  len = 0;
+
+  for (uint64 i=0; i<tableSizeInEntries; i++) {
+    frm = _hashTable[i];
+    len = countingTable[i] - _hashTable[i];
+
+    _hashTable[i] = pos;
+
+    for (uint64 j=0; j<len; j++) {
+      if (_counts)
+        _counts[pos] = _counts[frm];
+
+      _buckets[pos++] = _buckets[frm++];
+    }
+  }
+
+  if (beVerbose)
+    fprintf(stderr, "Compressed from "uint64FMT" to "uint64FMT" ("uint64FMT" bits)\n",
+            _hashTable[tableSizeInEntries], pos, logBaseTwo64(pos));
+
+  while (pos < _bucketsWords)
+    _buckets[pos++] = 0;
+
+  //  NOTE(review): 'pos' was just advanced to _bucketsWords by the
+  //  zero-fill above, so this end marker (and the rebuild test below)
+  //  see the allocated size, not the compacted size -- confirm that
+  //  this is intended.
+  //
+  _hashTable[tableSizeInEntries] = pos;
+
+  //  All done.  Delete temporary stuff
+  //
+  delete [] countingTable;
+
+  //  But if we horribly screwed up the estimate of tblBits, reset and recompute
+
+  if ((logBaseTwo64(pos) < tblBits) &&
+      (rebuilding == false)) {
+    rebuilding = true;
+
+    delete [] _hashTable;
+    delete [] _buckets;
+    delete [] _counts;
+
+    _hashTable = 0L;
+    _buckets   = 0L;
+    _counts    = 0L;
+
+    tblBits = logBaseTwo64(pos);
+
+    goto rebuild;
+  }
+
+  return(true);
+}
diff --git a/libkmer/existDB-create-from-meryl.C b/libkmer/existDB-create-from-meryl.C
new file mode 100644
index 0000000..b30db0c
--- /dev/null
+++ b/libkmer/existDB-create-from-meryl.C
@@ -0,0 +1,230 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "existDB.H"
+#include "libmeryl.H"
+
+
+bool
+existDB::createFromMeryl(char const *prefix,
+ uint32 merSize,
+ uint32 lo,
+ uint32 hi,
+ uint32 flags) {
+ // Builds the tables from the meryl database 'prefix', keeping only mers whose count is within [lo, hi]. Exits if 'merSize' differs from the database's mer size; otherwise always returns true.
+ merylStreamReader *M = new merylStreamReader(prefix);
+
+ bool beVerbose = false;
+
+ _hashTable = 0L;
+ _buckets = 0L;
+ _counts = 0L;
+
+ _merSizeInBases = M->merSize();
+ // The mer size is taken from the database; the caller's value is only checked against it.
+ if (merSize != _merSizeInBases) {
+ fprintf(stderr, "createFromMeryl()-- ERROR: requested merSize ("uint32FMT") is different than merSize in meryl database ("uint32FMT").\n",
+ merSize, _merSizeInBases);
+ exit(1);
+ }
+
+ // We can set this exactly, but not memory optimal (see meryl/estimate.C:optimalNumberOfBuckets()).
+ // Instead, we just blindly use whatever meryl used.
+ //
+ uint32 tblBits = M->prefixSize();
+
+ // But it is faster to reset to this. Might use 2x the memory.
+ //uint32 tblBits = logBaseTwo64(M->numberOfDistinctMers() + 1);
+
+ _shift1 = 2 * _merSizeInBases - tblBits;
+ _shift2 = _shift1 / 2;
+ _mask1 = uint64MASK(tblBits);
+ _mask2 = uint64MASK(_shift1);
+
+ _hshWidth = uint32ZERO;
+ _chkWidth = 2 * _merSizeInBases - tblBits;
+ _cntWidth = 16;
+
+ uint64 tableSizeInEntries = uint64ONE << tblBits;
+ uint64 numberOfMers = uint64ZERO;
+ uint64 *countingTable = new uint64 [tableSizeInEntries + 1];
+
+ if (beVerbose) {
+ fprintf(stderr, "createFromMeryl()-- tableSizeInEntries "uint64FMT"\n", tableSizeInEntries);
+ fprintf(stderr, "createFromMeryl()-- count range "uint32FMT"-"uint32FMT"\n", lo, hi);
+ }
+
+ for (uint64 i=tableSizeInEntries+1; i--; )
+ countingTable[i] = 0;
+
+ _isCanonical = flags & existDBcanonical;
+ _isForward = flags & existDBforward;
+
+ if (beVerbose) {
+ fprintf(stderr, "createFromMeryl()-- canonical %c\n", (_isCanonical) ? 'T' : 'F');
+ fprintf(stderr, "createFromMeryl()-- forward %c\n", (_isForward) ? 'T' : 'F');
+ }
+ // Exactly one of the two orientation flags must be set.
+ assert(_isCanonical + _isForward == 1);
+
+ // 1) Count bucket sizes
+ // We don't know the bucket sizes right now, but we do know
+ // how many buckets and how many mers.
+ //
+ // Because we could be inserting both forward and reverse, we can't
+ // really move the direction testing outside the loop, unless we
+ // want to do two iterations over M.
+ //
+ speedCounter *C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+ while (M->nextMer()) {
+ if ((lo <= M->theCount()) && (M->theCount() <= hi)) {
+ if (_isForward) {
+ countingTable[ HASH(M->theFMer()) ]++;
+ numberOfMers++;
+ }
+
+ if (_isCanonical) {
+ kMer r = M->theFMer();
+ r.reverseComplement();
+ // Count the smaller of the mer and its reverse complement.
+ if (M->theFMer() < r)
+ countingTable[ HASH(M->theFMer()) ]++;
+ else
+ countingTable[ HASH(r) ]++;
+ numberOfMers++;
+ }
+
+ C->tick();
+ }
+ }
+
+ if (beVerbose)
+ fprintf(stderr, "createFromMeryl()-- numberOfMers "uint64FMT"\n", numberOfMers);
+
+ delete C;
+ delete M;
+ // A compressed hash needs enough bits per entry to hold the value numberOfMers+1.
+ if (_compressedHash) {
+ _hshWidth = 1;
+ while ((numberOfMers+1) > (uint64ONE << _hshWidth))
+ _hshWidth++;
+ }
+
+ if (beVerbose) {
+ fprintf(stderr, "existDB::createFromMeryl()-- Found "uint64FMT" mers between count of "uint32FMT" and "uint32FMT"\n",
+ numberOfMers, lo, hi);
+ }
+
+ // 2) Allocate hash table, mer storage buckets
+ //
+ _hashTableWords = tableSizeInEntries + 2;
+ if (_compressedHash)
+ _hashTableWords = _hashTableWords * _hshWidth / 64 + 1;
+
+ _bucketsWords = numberOfMers + 2;
+ if (_compressedBucket)
+ _bucketsWords = _bucketsWords * _chkWidth / 64 + 1;
+
+ _countsWords = numberOfMers + 2;
+ if (_compressedCounts)
+ _countsWords = _countsWords * _cntWidth / 64 + 1;
+
+ if (beVerbose) {
+ fprintf(stderr, "existDB::createFromMeryl()-- hashTable is "uint64FMT"MB\n", _hashTableWords >> 17);
+ fprintf(stderr, "existDB::createFromMeryl()-- buckets is "uint64FMT"MB\n", _bucketsWords >> 17);
+ if (flags & existDBcounts)
+ fprintf(stderr, "existDB::createFromMeryl()-- counts is "uint64FMT"MB\n", _countsWords >> 17);
+ }
+ // The counts array is only allocated when existDBcounts is requested.
+ _hashTable = new uint64 [_hashTableWords];
+ _buckets = new uint64 [_bucketsWords];
+ _countsWords = (flags & existDBcounts) ? _countsWords : 0;
+ _counts = (flags & existDBcounts) ? new uint64 [_countsWords] : 0L;
+
+ // These aren't strictly needed. _buckets is cleared as it is initialized. _hashTable
+ // is also cleared as it is initialized, but in the _compressedHash case, the last
+ // few words might be uninitialized. They're unused.
+
+ //memset(_hashTable, 0, sizeof(uint64) * _hashTableWords);
+ //memset(_buckets, 0, sizeof(uint64) * _bucketsWords); // buckets is cleared as it is built
+ //memset(_counts, 0, sizeof(uint64) * _countsWords);
+ // Instead, zero only the last four words of the hash table.
+ _hashTable[_hashTableWords-1] = 0;
+ _hashTable[_hashTableWords-2] = 0;
+ _hashTable[_hashTableWords-3] = 0;
+ _hashTable[_hashTableWords-4] = 0;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Make the hash table point to the start of the bucket, and reset
+ // the counting table -- we're going to use it to fill the buckets.
+ //
+ uint64 tmpPosition = 0;
+ uint64 begPosition = 0;
+ uint64 ptr = 0;
+
+ if (_compressedHash) {
+ for (uint64 i=0; i<tableSizeInEntries; i++) {
+ tmpPosition = countingTable[i];
+ countingTable[i] = begPosition;
+
+ setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+ ptr += _hshWidth;
+
+ begPosition += tmpPosition;
+ }
+
+ setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+ } else {
+ for (uint64 i=0; i<tableSizeInEntries; i++) {
+ tmpPosition = countingTable[i];
+ countingTable[i] = begPosition;
+
+ _hashTable[i] = begPosition;
+
+ begPosition += tmpPosition;
+ }
+
+ // Set the last position in the hash, but we don't care about
+ // the temporary counting table.
+ //
+ _hashTable[tableSizeInEntries] = begPosition;
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // 3) Build list of mers, placed into buckets
+ //
+ M = new merylStreamReader(prefix);
+ C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+ // Second pass over the database, applying the same count filter and orientation choice as pass 1.
+ while (M->nextMer()) {
+ if ((lo <= M->theCount()) && (M->theCount() <= hi)) {
+ if (_isForward)
+ insertMer(HASH(M->theFMer()), CHECK(M->theFMer()), M->theCount(), countingTable);
+
+ if (_isCanonical) {
+ kMer r = M->theFMer();
+ r.reverseComplement();
+
+ if (M->theFMer() < r)
+ insertMer(HASH(M->theFMer()), CHECK(M->theFMer()), M->theCount(), countingTable);
+ else
+ insertMer(HASH(r), CHECK(r), M->theCount(), countingTable);
+ numberOfMers++; // NOTE(review): numberOfMers is not read again in this function, so this increment looks redundant -- confirm.
+ }
+
+
+ C->tick();
+ }
+ }
+
+ delete C;
+ delete M;
+ delete [] countingTable;
+
+ return(true);
+}
diff --git a/libkmer/existDB-create-from-sequence.C b/libkmer/existDB-create-from-sequence.C
new file mode 100644
index 0000000..ca2fc76
--- /dev/null
+++ b/libkmer/existDB-create-from-sequence.C
@@ -0,0 +1,271 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "existDB.H"
+#include "bio++.H"
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+// Builds this existDB from the mers of the NUL-terminated string 'sequence'; exits if any compression flag is set.
+bool
+existDB::createFromSequence(char const *sequence,
+ uint32 merSize,
+ uint32 flags) {
+ // Always returns true. The code from 'rebuild:' onward can execute a second time (see the goto at the end).
+ bool beVerbose = false;
+ bool rebuilding = false;
+
+ _hashTable = 0L;
+ _buckets = 0L;
+ _counts = 0L;
+
+ _merSizeInBases = merSize;
+ // With _searchForDupe set, insertMer() folds a repeated mer into its existing bucket entry.
+ _searchForDupe = true;
+
+ if ((flags & existDBcompressHash) ||
+ (flags & existDBcompressBuckets) ||
+ (flags & existDBcompressCounts))
+ fprintf(stderr, "existDB::createFromSequence: compression not supported.\n"), exit(1);
+
+ // This (at =22) eats up 16MB, and should allow a lot of mers at big sizes. Unfortunately, we
+ // know nothing about how many mers are going to be in the input.
+ //
+ // Setting this too high drastically reduces performance, suspected because of cache misses.
+ // Setting this too low will also reduce performance, by increasing the search time in a bucket.
+ //
+ uint32 tblBits = logBaseTwo64(strlen(sequence));
+
+ rebuild: // Re-entered at most once, with tblBits recomputed from the final value of 'pos'.
+ _shift1 = 2 * _merSizeInBases - tblBits;
+ _shift2 = _shift1 / 2;
+ _mask1 = uint64MASK(tblBits);
+ _mask2 = uint64MASK(_shift1);
+
+ _hshWidth = uint32ZERO;
+ _chkWidth = 2 * merSize - tblBits;
+ _cntWidth = 16;
+
+ uint64 tableSizeInEntries = uint64ONE << tblBits;
+ uint64 numberOfMers = uint64ZERO;
+ uint64 *countingTable = new uint64 [tableSizeInEntries + 1];
+ // Clear all tableSizeInEntries+1 counters; the post-decrement loop visits indices tableSizeInEntries down to 0.
+ for (uint64 i=tableSizeInEntries+1; i--; )
+ countingTable[i] = 0;
+
+ _isCanonical = flags & existDBcanonical;
+ _isForward = flags & existDBforward;
+ // Exactly one of the two orientation flags has to be set.
+ assert(_isCanonical + _isForward == 1);
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 1) Count bucket sizes
+ //
+ merStream *M = new merStream(new kMerBuilder(_merSizeInBases),
+ new seqStream(sequence, strlen(sequence)),
+ true, true);
+ // Each mer adds one to the counter of the bucket it hashes to, and one to numberOfMers.
+ while (M->nextMer()) {
+ if (_isForward) {
+ countingTable[ HASH(M->theFMer()) ]++;
+ numberOfMers++;
+ }
+
+ if (_isCanonical) {
+ countingTable[ HASH(M->theCMer()) ]++;
+ numberOfMers++;
+ }
+ }
+
+ delete M;
+
+#ifdef STATS
+ uint64 dist[32] = {0};
+ uint64 maxcnt = 0;
+ for (uint64 i=tableSizeInEntries+1; i--; ) {
+ if (countingTable[i] > maxcnt)
+ maxcnt = countingTable[i];
+
+ if (countingTable[i] < 32)
+ dist[countingTable[i]]++;
+ }
+
+ for(uint64 i=0; i<32; i++)
+ fprintf(stderr, "existDB::usage[%2d] = %d\n", i, dist[i]);
+ fprintf(stderr, "existDB::maxcnt = %d\n", maxcnt);
+#endif
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Determine how many bits we need to hold the value
+ // numberOfMers.....then....
+ //
+ // This is numberOfMers+1 because we need to store the
+ // first position after the last mer. That is, if there are two
+ // mers, we will store that the first mer is at position 0, the
+ // second mer is at position 1, and the end of the second mer is at
+ // position 2.
+ //
+ if (_compressedHash) {
+ _hshWidth = 1;
+ while ((numberOfMers+1) > (uint64ONE << _hshWidth))
+ _hshWidth++;
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 2) Allocate a hash table and some mer storage buckets.
+ //
+ _hashTableWords = tableSizeInEntries + 2;
+ if (_compressedHash)
+ _hashTableWords = _hashTableWords * _hshWidth / 64 + 1;
+
+ _bucketsWords = numberOfMers + 2;
+ if (_compressedBucket)
+ _bucketsWords = _bucketsWords * _chkWidth / 64 + 1;
+
+ _countsWords = numberOfMers + 2;
+ if (_compressedCounts)
+ _countsWords = _countsWords * _cntWidth / 64 + 1;
+
+ if (beVerbose) {
+ fprintf(stderr, "existDB::createFromSequence()-- hashTable is "uint64FMT"MB\n", _hashTableWords >> 17);
+ fprintf(stderr, "existDB::createFromSequence()-- buckets is "uint64FMT"MB\n", _bucketsWords >> 17);
+ if (flags & existDBcounts)
+ fprintf(stderr, "existDB::createFromSequence()-- counts is "uint64FMT"MB\n", _countsWords >> 17);
+ }
+
+ _hashTable = new uint64 [_hashTableWords];
+ _buckets = new uint64 [_bucketsWords];
+ _countsWords = (flags & existDBcounts) ? _countsWords : 0;
+ _counts = (flags & existDBcounts) ? new uint64 [_countsWords] : 0L;
+
+ // These aren't strictly needed. _buckets is cleared as it is initialized. _hashTable
+ // is also cleared as it is initialized, but in the _compressedHash case, the last
+ // few words might be uninitialized. They're unused.
+
+ //memset(_hashTable, 0, sizeof(uint64) * _hashTableWords);
+ //memset(_buckets, 0, sizeof(uint64) * _bucketsWords); // buckets is cleared as it is built
+ //memset(_counts, 0, sizeof(uint64) * _countsWords);
+ // Zero the last four words of the hash table (see the note above about trailing words).
+ _hashTable[_hashTableWords-1] = 0;
+ _hashTable[_hashTableWords-2] = 0;
+ _hashTable[_hashTableWords-3] = 0;
+ _hashTable[_hashTableWords-4] = 0;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Make the hash table point to the start of the bucket, and reset
+ // the counting table -- we're going to use it to fill the buckets.
+ //
+ uint64 tmpPosition = 0;
+ uint64 begPosition = 0;
+ uint64 ptr = 0;
+
+ if (_compressedHash) {
+ for (uint64 i=0; i<tableSizeInEntries; i++) {
+ tmpPosition = countingTable[i];
+ countingTable[i] = begPosition;
+
+ setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+ ptr += _hshWidth;
+
+ begPosition += tmpPosition;
+ }
+
+ setDecodedValue(_hashTable, ptr, _hshWidth, begPosition);
+ } else {
+ for (uint64 i=0; i<tableSizeInEntries; i++) {
+ tmpPosition = countingTable[i];
+ countingTable[i] = begPosition;
+
+ _hashTable[i] = begPosition;
+
+ begPosition += tmpPosition;
+ }
+
+ // Set the last position in the hash, but we don't care about
+ // the temporary counting table.
+ //
+ _hashTable[tableSizeInEntries] = begPosition;
+ }
+
+ // In the uncompressed case, _hashTable[i] and countingTable[i] now both hold the first slot of bucket i.
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 3) Build list of mers, placed into buckets
+ //
+ M = new merStream(new kMerBuilder(_merSizeInBases),
+ new seqStream(sequence, strlen(sequence)),
+ true, true);
+ // insertMer() advances countingTable[hash] for every new entry, so it acts as the fill pointer of that bucket.
+ while (M->nextMer()) {
+ if (_isForward)
+ insertMer(HASH(M->theFMer()), CHECK(M->theFMer()), 1, countingTable);
+
+ if (_isCanonical)
+ insertMer(HASH(M->theCMer()), CHECK(M->theCMer()), 1, countingTable);
+ }
+
+ delete M;
+
+ // Compress out the gaps we have from redundant kmers.
+ // For bucket i, 'len' is the number of entries actually inserted; they are slid down so buckets are contiguous.
+ uint64 pos = 0;
+ uint64 frm = 0;
+ uint64 len = 0;
+
+ for (uint64 i=0; i<tableSizeInEntries; i++) {
+ frm = _hashTable[i];
+ len = countingTable[i] - _hashTable[i];
+
+ _hashTable[i] = pos;
+
+ for (uint64 j=0; j<len; j++) {
+ if (_counts)
+ _counts[pos] = _counts[frm];
+
+ _buckets[pos++] = _buckets[frm++];
+ }
+ }
+
+ if (beVerbose)
+ fprintf(stderr, "Compressed from "uint64FMT" to "uint64FMT" ("uint64FMT" bits)\n",
+ _hashTable[tableSizeInEntries], pos, logBaseTwo64(pos));
+ // NOTE(review): the loop below leaves pos == _bucketsWords, so that value -- not the compacted entry count -- becomes the end marker and feeds the rebuild test. Confirm this is intended.
+ while (pos < _bucketsWords)
+ _buckets[pos++] = 0;
+
+ _hashTable[tableSizeInEntries] = pos;
+
+ // All done. Delete temporary stuff
+ //
+ delete [] countingTable;
+
+ // But if we horribly screwed up the estimate of tblBits, reset and recompute
+ // At most one rebuild is attempted; 'rebuilding' guards the goto.
+ if ((logBaseTwo64(pos) < tblBits) &&
+ (rebuilding == false)) {
+ rebuilding = true;
+
+ delete [] _hashTable;
+ delete [] _buckets;
+ delete [] _counts;
+
+ _hashTable = 0L;
+ _buckets = 0L;
+ _counts = 0L;
+
+ tblBits = logBaseTwo64(pos);
+
+ goto rebuild;
+ }
+
+ return(true);
+}
diff --git a/libkmer/existDB-state.C b/libkmer/existDB-state.C
new file mode 100644
index 0000000..7c854c0
--- /dev/null
+++ b/libkmer/existDB-state.C
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "existDB.H"
+#include "bio++.H"
+
+// 16-byte file signature. saveState() overwrites bytes 8..11 with flag characters; loadState() blanks them again before comparing.
+const char magic[16] = { 'e', 'x', 'i', 's', 't', 'D', 'B', '2',
+ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' };
+
+
+// Write the complete database to 'filename' in the binary layout that
+// loadState() reads back: a 16-byte signature (the magic string with the
+// compression flags in bytes 8..10 and the orientation in byte 11), the
+// scalar parameters, the three word counts, and finally the three tables.
+//
+// Exits the process if the file cannot be opened, or if any write or the
+// final close fails.
+//
+void
+existDB::saveState(char const *filename) {
+  char   cigam[16] = { 0 };
+
+  // Test the returned stream rather than errno: fopen() is allowed to
+  // modify errno even when it succeeds.
+  FILE  *F = fopen(filename, "wb");
+  if (F == NULL) {
+    fprintf(stderr, "Can't open '%s' for writing\n%s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  // From here on, a non-zero errno means that one of the writes failed.
+  errno = 0;
+
+  strncpy(cigam, magic, 16);
+
+  if (_compressedHash)
+    cigam[8] = 'h';
+  if (_compressedBucket)
+    cigam[9] = 'b';
+  if (_compressedCounts)
+    cigam[10] = 'c';
+
+  // Canonical is tested last, so it wins if both orientations are set.
+  if (_isForward)
+    cigam[11] = 'F';
+  if (_isCanonical)
+    cigam[11] = 'C';
+
+  fwrite(cigam, sizeof(char), 16, F);
+
+  fwrite(&_merSizeInBases, sizeof(uint32), 1, F);
+  fwrite(&_shift1,         sizeof(uint32), 1, F);
+  fwrite(&_shift2,         sizeof(uint32), 1, F);
+  fwrite(&_mask1,          sizeof(uint64), 1, F);
+  fwrite(&_mask2,          sizeof(uint64), 1, F);
+  fwrite(&_hshWidth,       sizeof(uint32), 1, F);  // only valid if _compressedHash
+  fwrite(&_chkWidth,       sizeof(uint32), 1, F);  // only valid if _compressedBucket
+  fwrite(&_cntWidth,       sizeof(uint32), 1, F);  // only valid if _compressedCounts
+
+  fwrite(&_hashTableWords, sizeof(uint64), 1, F);
+  fwrite(&_bucketsWords,   sizeof(uint64), 1, F);
+  fwrite(&_countsWords,    sizeof(uint64), 1, F);
+
+  fwrite(_hashTable, sizeof(uint64), _hashTableWords, F);
+  fwrite(_buckets,   sizeof(uint64), _bucketsWords,   F);
+  fwrite(_counts,    sizeof(uint64), _countsWords,    F);
+
+  // fclose() flushes whatever is still buffered, so a failure to close is a
+  // write failure too.
+  bool  closeFailed = (fclose(F) != 0);
+
+  if (closeFailed || errno) {
+    fprintf(stderr, "existDB::saveState()-- Write failure.\n%s\n", strerror(errno));
+    exit(1);
+  }
+}
+
+
+
+// Read a database previously written by saveState().
+//
+//   filename  file to read
+//   beNoisy   when true, report on stderr what was read and what was
+//             expected if the file does not carry the existDB signature
+//   loadData  when false, only the header is read and the three table
+//             pointers are left null
+//
+// Returns false if the file cannot be opened or does not start with the
+// existDB signature.  Exits the process if the signature matches but the
+// rest of the file cannot be read completely.
+//
+bool
+existDB::loadState(char const *filename,
+                   bool        beNoisy,
+                   bool        loadData) {
+  char   cigam[16] = { 0 };
+
+  // Test the returned stream rather than errno.  Nothing is printed here:
+  // a constructor of this class probes for saved state first and falls back
+  // to other kinds of input when this returns false.
+  FILE  *F = fopen(filename, "rb");
+  if (F == NULL)
+    return(false);
+
+  // From here on, a non-zero errno means that one of the reads failed.
+  errno = 0;
+
+  // If the file is shorter than the signature, the unread tail of cigam
+  // stays zero and therefore cannot match the magic string below.
+  fread(cigam, sizeof(char), 16, F);
+
+  _compressedHash   = false;
+  _compressedBucket = false;
+  _compressedCounts = false;
+  _isForward        = false;
+  _isCanonical      = false;
+
+  if (cigam[8] == 'h')
+    _compressedHash = true;
+  if (cigam[9] == 'b')
+    _compressedBucket = true;
+  if (cigam[10] == 'c')
+    _compressedCounts = true;
+
+  if (cigam[11] == 'F')
+    _isForward = true;
+  if (cigam[11] == 'C')
+    _isCanonical = true;
+
+  // Blank the flag bytes so that what remains can be compared to the magic.
+  cigam[ 8] = ' ';
+  cigam[ 9] = ' ';
+  cigam[10] = ' ';
+  cigam[11] = ' ';
+
+  if (strncmp(magic, cigam, 16) != 0) {
+    if (beNoisy) {
+      fprintf(stderr, "existDB::loadState()-- Not an existDB binary file, maybe a sequence file?\n");
+      fprintf(stderr, "existDB::loadState()-- Read '%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c'\n",
+              cigam[0],  cigam[1],  cigam[2],  cigam[3],
+              cigam[4],  cigam[5],  cigam[6],  cigam[7],
+              cigam[8],  cigam[9],  cigam[10], cigam[11],
+              cigam[12], cigam[13], cigam[14], cigam[15]);
+      fprintf(stderr, "existDB::loadState()-- Expected '%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c'\n",
+              magic[0],  magic[1],  magic[2],  magic[3],
+              magic[4],  magic[5],  magic[6],  magic[7],
+              magic[8],  magic[9],  magic[10], magic[11],
+              magic[12], magic[13], magic[14], magic[15]);
+    }
+
+    fclose(F);
+    return(false);
+  }
+
+  // Count the header items actually delivered, so that a truncated file is
+  // noticed before its (garbage) sizes are used to allocate the tables.
+  size_t  headerItems = 0;
+
+  headerItems += fread(&_merSizeInBases, sizeof(uint32), 1, F);
+  headerItems += fread(&_shift1,         sizeof(uint32), 1, F);
+  headerItems += fread(&_shift2,         sizeof(uint32), 1, F);
+  headerItems += fread(&_mask1,          sizeof(uint64), 1, F);
+  headerItems += fread(&_mask2,          sizeof(uint64), 1, F);
+  headerItems += fread(&_hshWidth,       sizeof(uint32), 1, F);  // only valid if _compressedHash
+  headerItems += fread(&_chkWidth,       sizeof(uint32), 1, F);  // only valid if _compressedBucket
+  headerItems += fread(&_cntWidth,       sizeof(uint32), 1, F);  // only valid if _compressedCounts
+
+  headerItems += fread(&_hashTableWords, sizeof(uint64), 1, F);
+  headerItems += fread(&_bucketsWords,   sizeof(uint64), 1, F);
+  headerItems += fread(&_countsWords,    sizeof(uint64), 1, F);
+
+  bool  shortRead = (headerItems != 11);
+
+  _hashTable = 0L;
+  _buckets   = 0L;
+  _counts    = 0L;
+
+  if ((loadData) && (shortRead == false)) {
+    _hashTable = new uint64 [_hashTableWords];
+    _buckets   = new uint64 [_bucketsWords];
+
+    if (_countsWords > 0)
+      _counts = new uint64 [_countsWords];
+
+    if (fread(_hashTable, sizeof(uint64), _hashTableWords, F) != _hashTableWords)
+      shortRead = true;
+    if (fread(_buckets, sizeof(uint64), _bucketsWords, F) != _bucketsWords)
+      shortRead = true;
+
+    if (_countsWords > 0)
+      if (fread(_counts, sizeof(uint64), _countsWords, F) != _countsWords)
+        shortRead = true;
+  }
+
+  fclose(F);
+
+  if (errno || shortRead) {
+    fprintf(stderr, "existDB::loadState()-- Read failure.\n%s\n",
+            (errno) ? strerror(errno) : "unexpected end of file");
+    exit(1);
+  }
+
+  return(true);
+}
+
+
+// Print a human-readable summary of the table geometry, the hashing
+// parameters and the three compression settings to 'stream'.  A width is
+// reported as "undefined" when the corresponding compression is off.
+//
+void
+existDB::printState(FILE *stream) {
+  const char  *rule = "-----------------\n";
+
+  fprintf(stream, "merSizeInBases: "uint32FMT"\n", _merSizeInBases);
+  fprintf(stream, "tableBits "uint32FMT"\n", 2 * _merSizeInBases - _shift1);
+
+  fputs(rule, stream);
+
+  // The second number of each line is the word count shifted right by 7.
+  fprintf(stream, "_hashTableWords "uint64FMT" ("uint64FMT" KB)\n", _hashTableWords, _hashTableWords >> 7);
+  fprintf(stream, "_bucketsWords "uint64FMT" ("uint64FMT" KB)\n", _bucketsWords, _bucketsWords >> 7);
+  fprintf(stream, "_countsWords "uint64FMT" ("uint64FMT" KB)\n", _countsWords, _countsWords >> 7);
+
+  fputs(rule, stream);
+
+  fprintf(stream, "_shift1: "uint32FMT"\n", _shift1);
+  fprintf(stream, "_shift2 "uint32FMT"\n", _shift2);
+  fprintf(stream, "_mask1 "uint64HEX"\n", _mask1);
+  fprintf(stream, "_mask2 "uint64HEX"\n", _mask2);
+
+  if (_compressedHash == false) {
+    fputs("_compressedHash false\n", stream);
+    fputs("_hshWidth undefined\n", stream);
+  } else {
+    fputs("_compressedHash true\n", stream);
+    fprintf(stream, "_hshWidth "uint32FMT"\n", _hshWidth);
+  }
+
+  if (_compressedBucket == false) {
+    fputs("_compressedBucket false\n", stream);
+    fputs("_chkWidth undefined\n", stream);
+  } else {
+    fputs("_compressedBucket true\n", stream);
+    fprintf(stream, "_chkWidth "uint32FMT"\n", _chkWidth);
+  }
+
+  if (_compressedCounts == false) {
+    fputs("_compressedCount false\n", stream);
+    fputs("_cntWidth undefined\n", stream);
+  } else {
+    fputs("_compressedCount true\n", stream);
+    fprintf(stream, "_cntWidth "uint32FMT"\n", _cntWidth);
+  }
+}
+
diff --git a/libkmer/existDB.C b/libkmer/existDB.C
new file mode 100644
index 0000000..0173a4a
--- /dev/null
+++ b/libkmer/existDB.C
@@ -0,0 +1,182 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include "existDB.H"
+#include "bio++.H"
+
+
+// Construct from a file written by saveState().  The file must load; if it
+// does not, a message is printed and the process exits.  'loadData' is
+// passed through to loadState().
+//
+existDB::existDB(char const  *filename,
+                 bool         loadData) {
+  clear();
+
+  _compressedBucket = false;
+  _compressedHash   = false;
+
+  // Noisy mode: loadState() explains a signature mismatch on stderr.
+  bool  loaded = loadState(filename, true, loadData);
+
+  if (loaded)
+    return;
+
+  fprintf(stderr, "existDB::existDB()-- Tried to read state from '%s', but failed.\n", filename);
+  exit(1);
+}
+
+
+// Construct from 'filename', trying three interpretations in turn:
+//
+//   1) a saved existDB state; its mer size must equal 'merSize', otherwise
+//      the process exits;
+//   2) a file that exists on disk, handed to createFromFastA();
+//   3) anything else, handed to createFromMeryl() as a database prefix,
+//      together with the 'lo' and 'hi' arguments.
+//
+existDB::existDB(char const   *filename,
+                 uint32        merSize,
+                 existDBflags  flags,
+                 uint32        lo,
+                 uint32        hi) {
+  clear();
+
+  _searchForDupe    = false;
+
+  _compressedCounts = flags & existDBcompressCounts;
+  _compressedBucket = flags & existDBcompressBuckets;
+  _compressedHash   = flags & existDBcompressHash;
+
+  // A saved state, if there is one, takes priority; all it has to do is
+  // agree with the caller about the mer size.
+  if (loadState(filename)) {
+    if (_merSizeInBases == merSize)
+      return;
+
+    fprintf(stderr, "existDB::existDB()-- Read state from '%s', but got different mer sizes\n", filename);
+    fprintf(stderr, "existDB::existDB()-- Got "uint32FMT", expected "uint32FMT"\n", _merSizeInBases, merSize);
+    exit(1);
+  }
+
+  // No orientation requested: default to forward mers.
+  if ((flags & (existDBcanonical | existDBforward)) == uint32ZERO)
+    flags |= existDBforward;
+
+  // A file that exists is assumed to be multi-fasta; otherwise the name is
+  // assumed to be the prefix of a meryl database.
+  if (fileExists(filename) == false) {
+    createFromMeryl(filename, merSize, lo, hi, flags);
+    return;
+  }
+
+  createFromFastA(filename, merSize, flags);
+}
+
+
+// Construct from the mers of an in-memory character string.  Forward
+// orientation is assumed when 'flags' names neither orientation.
+//
+existDB::existDB(char const   *sequence,
+                 uint32        merSize,
+                 existDBflags  flags) {
+  clear();
+
+  _compressedCounts = flags & existDBcompressCounts;
+  _compressedBucket = flags & existDBcompressBuckets;
+  _compressedHash   = flags & existDBcompressHash;
+
+  existDBflags  direction = flags & (existDBcanonical | existDBforward);
+
+  if (direction == uint32ZERO)
+    flags |= existDBforward;
+
+  createFromSequence(sequence, merSize, flags);
+}
+
+
+// Release the three tables; each pointer may be null.
+existDB::~existDB() {
+  delete [] _counts;
+  delete [] _buckets;
+  delete [] _hashTable;
+}
+
+
+
+
+
+// Return true if 'mer' is stored in the table.
+//
+// The hash of the mer selects a bucket, delimited by two consecutive hash
+// table entries; the bucket is then scanned for the mer's check value.
+//
+bool
+existDB::exists(uint64 mer) {
+  uint64  first;
+  uint64  limit;
+
+  if (_compressedHash == false) {
+    uint64  slot = HASH(mer);
+
+    first = _hashTable[slot];
+    limit = _hashTable[slot + 1];
+  } else {
+    uint64  bit = HASH(mer) * _hshWidth;
+
+    first = getDecodedValue(_hashTable, bit,             _hshWidth);
+    limit = getDecodedValue(_hashTable, bit + _hshWidth, _hshWidth);
+  }
+
+  // Empty bucket.
+  if (first == limit)
+    return(false);
+
+  uint64  wanted = CHECK(mer);
+
+  if (_compressedBucket == false) {
+    for (uint64 idx=first; idx<limit; idx++)
+      if (_buckets[idx] == wanted)
+        return(true);
+
+    return(false);
+  }
+
+  // Packed buckets: entry idx starts at bit idx * _chkWidth.
+  for (uint64 idx=first; idx<limit; idx++)
+    if (getDecodedValue(_buckets, idx * _chkWidth, _chkWidth) == wanted)
+      return(true);
+
+  return(false);
+}
+
+
+// Return the count stored for 'mer', or 0 if the mer is not in the table or
+// the table was built without counts.
+//
+// The bucket is scanned by slot index.  The same index addresses _counts,
+// which is how insertMer() stores counts, so the index is only converted to
+// a bit offset at the point where a packed array is actually read.  (The
+// bucket scan used to scale the index by _chkWidth in place and then reuse
+// the scaled value to address _counts, which read the wrong entry whenever
+// buckets were compressed.)
+//
+uint64
+existDB::count(uint64 mer) {
+  uint64  h, st, ed;
+
+  if (_counts == 0L)
+    return(0);
+
+  if (_compressedHash) {
+    h  = HASH(mer) * _hshWidth;
+    st = getDecodedValue(_hashTable, h,             _hshWidth);
+    ed = getDecodedValue(_hashTable, h + _hshWidth, _hshWidth);
+  } else {
+    h  = HASH(mer);
+    st = _hashTable[h];
+    ed = _hashTable[h+1];
+  }
+
+  if (st == ed)
+    return(0);
+
+  uint64  c = CHECK(mer);
+
+  for (uint64 idx=st; idx<ed; idx++) {
+    uint64  stored = (_compressedBucket) ? getDecodedValue(_buckets, idx * _chkWidth, _chkWidth)
+                                         : _buckets[idx];
+
+    if (stored != c)
+      continue;
+
+    if (_compressedCounts)
+      return(getDecodedValue(_counts, idx * _cntWidth, _cntWidth));
+
+    return(_counts[idx]);
+  }
+
+  return(0);
+}
diff --git a/libkmer/existDB.H b/libkmer/existDB.H
new file mode 100644
index 0000000..7577e81
--- /dev/null
+++ b/libkmer/existDB.H
@@ -0,0 +1,151 @@
+#ifndef EXISTDB_H
+#define EXISTDB_H
+
+// Used by wgs-assembler, to determine if a rather serious bug was patched.
+#define EXISTDB_H_VERSION 1960
+
+#include "bio++.H"
+
+// Takes as input a list of mers (in a file) and builds a searchable
+// structure listing those mers. Duplicate mers are not removed and
+// will be stored multiple times.
+//
+// Using a compressed hash is allowed, but somewhat useless -- it is
+// really slow and doesn't save that much.
+//
+// If existDBcanonical is requested, this will store only the
+// canonical mer. It is up to the client to be sure that is
+// appropriate! See positionDB.H for more.
+
+//#define STATS
+// Bit flags accepted by the existDB constructors and create functions; each constant is a distinct bit.
+typedef uint32 existDBflags;
+const existDBflags existDBnoFlags = 0x0000;
+const existDBflags existDBcompressHash = 0x0001; // constructors copy this bit into _compressedHash
+const existDBflags existDBcompressBuckets = 0x0002; // constructors copy this bit into _compressedBucket
+const existDBflags existDBcompressCounts = 0x0004; // constructors copy this bit into _compressedCounts
+const existDBflags existDBcanonical = 0x0008; // store only the canonical mer
+const existDBflags existDBforward = 0x0010; // store the forward mer; the default when neither direction is given
+const existDBflags existDBcounts = 0x0020; // createFromSequence() allocates _counts only when this is set
+
+// A searchable set of mers (each packed into a uint64), optionally carrying
+// a count for every mer.
+//
+// HASH(mer) selects a bucket.  _hashTable[h] and _hashTable[h+1] are the
+// first and one-past-last slots of bucket h in _buckets, and every slot
+// holds CHECK(mer).  When counts are kept, _counts is addressed by the same
+// slot number.  Each of the three arrays can instead be bit-packed; that is
+// selected by the matching _compressed* flag and uses the matching *Width.
+//
+class existDB {
+public:
+
+  // Read state from an existDB file
+  existDB(char const  *filename,
+          bool         loadData=true);
+
+  // Load mers from an existing existDB file, a fastafile, or a meryl database
+  existDB(char const    *filename,
+          uint32         merSize,
+          existDBflags   flags,
+          uint32         lo,
+          uint32         hi);
+
+  // Load mers from a character string
+  existDB(char const    *sequence,
+          uint32         merSize,
+          existDBflags   flags);
+
+  ~existDB();
+
+  void        saveState(char const *filename);
+
+  void        printState(FILE *stream);
+
+  bool        isForward(void)    { return(_isForward);   };
+  bool        isCanonical(void)  { return(_isCanonical); };
+
+  bool        exists(uint64 mer);
+  uint64      count(uint64 mer);
+
+private:
+  bool        loadState(char const *filename, bool beNoisy=false, bool loadData=true);
+  bool        createFromFastA(char const  *filename,
+                              uint32       merSize,
+                              uint32       flags);
+  bool        createFromMeryl(char const  *filename,
+                              uint32       merSize,
+                              uint32       lo,
+                              uint32       hi,
+                              uint32       flags);
+  bool        createFromSequence(char const  *sequence,
+                                 uint32       merSize,
+                                 uint32       flags);
+
+  // Bucket number of mer 'k': three shifted copies of the mer are XORed
+  // together and the result is masked with _mask1.
+  uint64       HASH(uint64 k) {
+    return(((k >> _shift1) ^ (k >> _shift2) ^ k) & _mask1);
+  };
+
+  // Value stored in a bucket slot for mer 'k': the bits selected by _mask2.
+  uint64       CHECK(uint64 k) {
+    return(k & _mask2);
+  };
+
+  // Add one mer, given as its bucket 'hsh' and check value 'chk', with count
+  // 'cnt'.  countingTable[hsh] is the next free slot of that bucket and is
+  // advanced whenever a new slot is used.
+  void         insertMer(uint64 hsh, uint64 chk, uint64 cnt, uint64 *countingTable) {
+
+    // If the mer is already here, just update the count.  This only
+    // works if not _compressedBucket, and only makes sense for loading from
+    // fasta or sequence.
+    //
+    // NOTE(review): this update writes _counts[st] directly, without looking
+    // at _compressedCounts -- confirm that duplicate search is never combined
+    // with compressed counts.
+
+    if ((_compressedBucket == false) &&
+        (_searchForDupe)) {
+      uint64 st = _hashTable[hsh];
+      uint64 ed = countingTable[hsh];
+
+      for (; st<ed; st++) {
+        if (_buckets[st] == chk) {
+          if (_counts)
+            _counts[st] += cnt;
+          return;
+        }
+      }
+    }
+
+    if (_compressedBucket)
+      setDecodedValue(_buckets, countingTable[hsh] * _chkWidth, _chkWidth, chk);
+    else
+      _buckets[countingTable[hsh]] = chk;
+
+    if (_counts) {
+      if (_compressedCounts) {
+        setDecodedValue(_counts, countingTable[hsh] * _cntWidth, _cntWidth, cnt);
+      } else {
+        _counts[countingTable[hsh]] = cnt;
+      }
+    }
+
+    countingTable[hsh]++;
+  };
+
+  bool         _compressedHash;
+  bool         _compressedBucket;
+  bool         _compressedCounts;
+  bool         _isForward;
+  bool         _isCanonical;
+
+  bool         _searchForDupe;
+
+  uint32       _merSizeInBases;
+
+  uint32       _shift1;
+  uint32       _shift2;
+  uint64       _mask1;
+  uint64       _mask2;
+
+  uint32       _hshWidth;  // Only for the compressed hash
+  uint32       _chkWidth;  // Only for the compressed bucket
+  uint32       _cntWidth;  // Only for the compressed counts
+
+  uint64       _hashTableWords;
+  uint64       _bucketsWords;
+  uint64       _countsWords;
+
+  uint64      *_hashTable;
+  uint64      *_buckets;
+  uint64      *_counts;
+
+  // Put every member into a known state.  Each constructor calls this
+  // first, so neither the destructor (which deletes the three tables) nor a
+  // query function can ever see an indeterminate value.
+  void clear(void) {
+    _compressedHash   = false;
+    _compressedBucket = false;
+    _compressedCounts = false;
+    _isForward        = false;
+    _isCanonical      = false;
+
+    _searchForDupe    = false;
+
+    _merSizeInBases   = 0;
+
+    _shift1           = 0;
+    _shift2           = 0;
+    _mask1            = 0;
+    _mask2            = 0;
+
+    _hshWidth         = 0;
+    _chkWidth         = 0;
+    _cntWidth         = 0;
+
+    _hashTableWords   = 0;
+    _bucketsWords     = 0;
+    _countsWords      = 0;
+
+    _hashTable        = 0L;
+    _buckets          = 0L;
+    _counts           = 0L;
+  };
+};
+
+#endif // EXISTDB_H
diff --git a/libkmer/kmer-mask.C b/libkmer/kmer-mask.C
new file mode 100644
index 0000000..ed4b463
--- /dev/null
+++ b/libkmer/kmer-mask.C
@@ -0,0 +1,716 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util++.H"
+#include "bio++.H"
+#include "libmeryl.H"
+#include "existDB.H"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+#include "sweatShop.H"
+
+
+
+
+// One FASTQ record from each of up to two input files (mate 'a' and mate
+// 'b'), together with the work buffers used while masking it.
+//
+// For mate a (and likewise for b):
+//   a1, a2, a3, a4   the four lines of the record, in file order
+//   af               work buffer, passed by maskWorker() as the 'found' flags
+//   am               work buffer, receives the masked copy of a2; this is
+//                    what write() emits as the second line
+//
+// Lines 2 and 4 are limited by 'maxLength'; lines 1 and 3 are limited to
+// 1024 characters.
+//
+class fastqRecord {
+public:
+  // A single allocation of 8 * ml characters is carved into the eight
+  // variable-length buffers (four per mate).
+  fastqRecord(uint32 ml) {
+    maxLength = ml;
+    alloc     = new char [maxLength * 8];
+
+    a2 = alloc + 0 * maxLength;
+    af = alloc + 1 * maxLength;
+    am = alloc + 2 * maxLength;
+    a4 = alloc + 3 * maxLength;
+
+    a1[0] = 0;
+    a2[0] = 0;
+    af[0] = 0;
+    am[0] = 0;
+    a3[0] = 0;
+    a4[0] = 0;
+
+    aLength   = 0;
+    aRetained = 0.0;
+    aLabel    = 0;
+
+    b2 = alloc + 4 * maxLength;
+    bf = alloc + 5 * maxLength;
+    bm = alloc + 6 * maxLength;
+    b4 = alloc + 7 * maxLength;
+
+    b1[0] = 0;
+    b2[0] = 0;
+    bf[0] = 0;
+    bm[0] = 0;
+    b3[0] = 0;
+    b4[0] = 0;
+
+    bLength   = 0;
+    bRetained = 0.0;
+    bLabel    = 0;
+  };
+
+  ~fastqRecord() {
+    delete [] alloc;
+  };
+
+
+  // Read the next record from each stream that is not NULL.
+  //
+  // Exits with an error if a sequence line reaches the last usable position
+  // of its buffer, meaning that the maximum length is too small.
+  //
+  // Returns false once the input has hit end-of-file, and also when neither
+  // stream is open.  End-of-file is tested on FASTQ1 when it is open and on
+  // FASTQ2 otherwise, so that feof() is never handed a NULL stream.
+  bool load(FILE *FASTQ1, FILE *FASTQ2) {
+    bool  tooShort = false;
+
+    // Sentinels: if the next-to-last character of a sequence buffer is
+    // non-zero after reading, the line filled the buffer.
+    a2[maxLength - 2] = 0;
+    b2[maxLength - 2] = 0;
+
+    if (FASTQ1) {
+      fgets(a1, 1024,      FASTQ1);  chomp(a1);
+      fgets(a2, maxLength, FASTQ1);  chomp(a2);
+      fgets(a3, 1024,      FASTQ1);  chomp(a3);
+      fgets(a4, maxLength, FASTQ1);  chomp(a4);
+
+      aLength   = strlen(a2);
+      aRetained = 0.0;
+      aLabel    = 0;
+
+      if (a2[maxLength - 2] != 0)
+        tooShort = true;
+    }
+
+    if (FASTQ2) {
+      fgets(b1, 1024,      FASTQ2);  chomp(b1);
+      fgets(b2, maxLength, FASTQ2);  chomp(b2);
+      fgets(b3, 1024,      FASTQ2);  chomp(b3);
+      fgets(b4, maxLength, FASTQ2);  chomp(b4);
+
+      bLength   = strlen(b2);
+      bRetained = 0.0;
+      bLabel    = 0;
+
+      if (b2[maxLength - 2] != 0)
+        tooShort = true;
+    }
+
+    if (tooShort) {
+      fprintf(stderr, "ERROR: -l too small for reads:\n");
+      fprintf(stderr, " a = '%s'\n", a1);
+      fprintf(stderr, " b = '%s'\n", b1);
+      exit(1);
+    }
+
+    FILE  *primary = (FASTQ1 != NULL) ? FASTQ1 : FASTQ2;
+
+    if (primary == NULL)
+      return(false);
+
+    return(!feof(primary));
+  };
+
+
+  // Write each mate to its stream (skipped when the stream is NULL): the
+  // header line with the retained fraction appended, the masked sequence,
+  // and then lines three and four unchanged.
+  void write(FILE *FASTQ1, FILE *FASTQ2) {
+
+    if (FASTQ1)
+      fprintf(FASTQ1, "%s fractionRetained=%.3f\n%s\n%s\n%s\n", a1, aRetained, am, a3, a4);
+
+    if (FASTQ2)
+      fprintf(FASTQ2, "%s fractionRetained=%.3f\n%s\n%s\n%s\n", b1, bRetained, bm, b3, b4);
+  };
+
+
+
+public:
+  uint32   maxLength;
+  char    *alloc;
+
+  char     a1[1024];
+  char    *a2;
+  char    *af;
+  char    *am;
+  char     a3[1024];
+  char    *a4;
+
+  uint32   aLength;
+  double   aRetained;
+  uint32   aLabel;
+
+  char     b1[1024];
+  char    *b2;
+  char    *bf;
+  char    *bm;
+  char     b3[1024];
+  char    *b4;
+
+  uint32   bLength;
+  double   bRetained;
+  uint32   bLabel;
+};
+
+
+
+
+// Parameters, shared tables and open files of one kmer-mask run.  A single
+// instance is handed, as an untyped pointer, to the loader, worker and
+// writer functions.
+//
+// OUTPUT1 and OUTPUT2 hold one stream per read label.  Labels run from 0 to
+// 3 (3 is assigned by maskWorker() to pairs whose mates disagree), so all
+// four entries of both arrays start out NULL.
+//
+class maskGlobal {
+public:
+  maskGlobal() {
+    merName          = NULL;
+
+    seq1Name         = NULL;
+    seq2Name         = NULL;
+
+    outPrefix        = NULL;
+
+    merSize          = 0;
+    maxLength        = 512;
+
+    existName        = NULL;
+    minSize          = 0;
+    extend           = 0;
+    keepNovel        = false;
+    keepConfirmed    = false;
+
+    demote           = false;
+    promote          = false;
+    discard          = true;
+
+    lowThreshold     = 1. / 3.;
+    highThreshold    = 2. / 3.;
+
+    for (uint32 ii=0; ii<1001; ii++)
+      scoreHistogram[ii] = 0;
+
+    for (uint32 ii=0; ii<4; ii++)
+      for (uint32 jj=0; jj<4; jj++)
+        thresholdCounts[ii][jj] = 0;
+
+    outputHistogram  = NULL;
+
+    exist            = NULL;
+
+    FASTQ1           = NULL;
+    FASTQ1pipe       = false;
+
+    FASTQ2           = NULL;
+    FASTQ2pipe       = false;
+
+    // Every label, including 3, needs a defined (NULL) stream: the record
+    // writer skips a mate whose stream is NULL.
+    for (uint32 ii=0; ii<4; ii++) {
+      OUTPUT1[ii] = NULL;
+      OUTPUT2[ii] = NULL;
+    }
+  };
+
+  ~maskGlobal() {
+  };
+
+
+public:
+  char         *merName;
+
+  char         *seq1Name;
+  char         *seq2Name;
+
+  char         *outPrefix;
+
+  uint32        merSize;
+  uint32        maxLength;
+
+  char         *existName;
+  uint32        minSize;
+  uint32        extend;
+  bool          keepNovel;
+  bool          keepConfirmed;
+
+  bool          demote;
+  bool          promote;
+  bool          discard;
+
+  double        lowThreshold;
+  double        highThreshold;
+
+  uint32        scoreHistogram[1001];    // indexed by 1000 * fraction retained
+  uint64        thresholdCounts[4][4];   // indexed by [label of mate a][label of mate b]
+
+  char         *outputHistogram;
+
+  existDB      *exist;
+
+  FILE         *FASTQ1;
+  bool          FASTQ1pipe;
+
+  FILE         *FASTQ2;
+  bool          FASTQ2pipe;
+
+  FILE         *OUTPUT1[4];
+  FILE         *OUTPUT2[4];
+};
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Masks mers present in the database from the input sequences. Chains together
+// across small bits of missing mer.
+
+// Debugging aid: render the first Slen flags of 'found' as a string of '1'
+// and '0' characters in 'display' (which needs room for Slen+1 characters)
+// and print it to stdout below 'label'.  'S' is not used.
+//
+void
+printBits(char *S, uint32 Slen, char *found, char *display, const char *label) {
+  uint32  pos = 0;
+
+  while (pos < Slen) {
+    if (found[pos])
+      display[pos] = '1';
+    else
+      display[pos] = '0';
+
+    pos++;
+  }
+
+  display[pos] = 0;
+
+  fprintf(stdout, "%s\n%s\n", label, display);
+}
+
+
+
+
+// Scan the read for kmers that exist in the DB. Set a bit for each kmer that exists.
+// Flag the start position of every mer of S that is in the database, in
+// either orientation.  found[0 .. Slen-1] is cleared first; positions where
+// no matching mer starts stay false.  'keepNovel' is not used here.
+//
+void
+buildMask(char *S, uint32 Slen, char *found, bool keepNovel, existDB *exist, uint32 merSize) {
+  merStream  mers(new kMerBuilder(merSize),
+                  new seqStream(S, Slen),
+                  true, true);
+
+  uint32  pos = 0;
+
+  while (pos < Slen)
+    found[pos++] = false;
+
+  for (; mers.nextMer(); ) {
+    // The reverse mer is only looked up when the forward mer is absent.
+    bool  present = exist->exists(mers.theFMer()) || exist->exists(mers.theRMer());
+
+    if (present)
+      found[mers.thePositionInSequence()] = true;
+  }
+}
+
+
+
+
+// Searches for isolated 'true' bits, and removes them. Isolated means fewer than minSize true
+// bits are adjacent.
+//
+// Clear every run of consecutive set flags in found[0 .. Slen-1] that is
+// shorter than minSize.
+//
+// A run that extends to the very end of the array is treated like any other
+// run.  (Previously a run was only examined when a cleared flag followed
+// it, so a short run at the end of the array survived.)
+//
+// 'S' is not used.
+//
+void
+removeIsolatedMers(char *S, uint32 Slen, char *found, uint32 minSize) {
+  uint32  ii = 0;
+
+  while (ii < Slen) {
+
+    // Skip over cleared flags.
+    if (!found[ii]) {
+      ii++;
+      continue;
+    }
+
+    // found[ii] starts a run; advance ii to one past its last flag.
+    uint32  bgn = ii;
+
+    while ((ii < Slen) && (found[ii]))
+      ii++;
+
+    // If small, destroy it.
+    if (ii - bgn < minSize)
+      for (uint32 jj=bgn; jj<ii; jj++)
+        found[jj] = false;
+  }
+}
+
+
+
+// Convert the mer-start-based mask to a base-covering mask, allowing an extra uncovered
+// 'extension' bases in between two blocks to join.
+// Turn a mask of mer start positions into a mask of covered bases: each set
+// flag is spread over the merSize positions beginning at it.
+//
+// With extend > 0, when the last base of a block is reached the positions
+// from there up to extend+1 further on are inspected; if a mer starts at any
+// of them, the block is prolonged so that it runs into that mer.  'S' is not
+// used.
+//
+void
+convertToBases(char *S, uint32 Slen, char *found, uint32 merSize, uint32 extend) {
+  uint32  remaining = 0;   // bases still to be flagged, counting the current one
+
+  for (uint32 pos=0; pos<Slen; pos++) {
+
+    // A mer starts here; it covers this base and the merSize-1 after it.
+    if (found[pos])
+      remaining = merSize;
+
+    // On the last base of a block, look ahead for a mer to join up with.  The
+    // farthest one inside the window decides the new length.
+    if ((remaining == 1) &&
+        (extend > 0)) {
+      uint32  last = pos + extend + 1;
+
+      for (uint32 nxt=pos; (nxt<Slen) && (nxt <= last); nxt++)
+        if (found[nxt] == true)
+          remaining = nxt - pos + 2;
+    }
+
+    if (remaining == 0)
+      continue;
+
+    found[pos] = true;
+    remaining--;
+  }
+}
+
+
+
+// Assumes the found[] array represents base-based masking.
+// Returns the fraction of the sequence that is not masked.
+// Copy S into 'display', writing 'n' at every position whose flag in 'found'
+// equals keepNovel.  'display' needs room for Slen+1 characters and is
+// NUL-terminated.
+//
+// Returns the number of bases copied unchanged divided by Slen.
+// NOTE(review): no guard for Slen == 0, where this is a 0/0 division.
+//
+double
+maskSequence(char *S, uint32 Slen, char *found, bool keepNovel, char *display) {
+  uint32  kept = 0;
+  uint32  pos  = 0;
+
+  for (; pos<Slen; pos++) {
+    bool  masked = (found[pos] == keepNovel);
+
+    display[pos] = (masked) ? 'n' : S[pos];
+
+    if (masked == false)
+      kept++;
+  }
+
+  display[Slen] = 0;
+
+  return((double)kept / Slen);
+}
+
+
+
+
+
+
+
+
+// Loader callback: allocate a record sized by the global maxLength and fill
+// it from the two input streams.  Returns the record, or NULL (after
+// freeing it) when load() reports that there is no more input.
+//
+void *
+fastqLoader(void *G) {
+  maskGlobal   *global = (maskGlobal *)G;
+  fastqRecord  *record = new fastqRecord(global->maxLength);
+
+  if (record->load(global->FASTQ1, global->FASTQ2))
+    return(record);
+
+  delete record;
+
+  return(NULL);
+}
+
+
+// Worker callback: mask both mates of one record and assign each a label.
+//
+// Each mate goes through the same four steps (flag database mers, drop
+// isolated hits, convert to base coverage, mask) and is then labelled from
+// the fraction retained: 0 below lowThreshold, 1 below highThreshold, 2
+// otherwise.  If the labels of the mates differ, they are reconciled by the
+// demote (take the smaller), promote (take the larger) and discard (both
+// become 3) options, tested in that order.  'T' is not used.
+//
+void
+maskWorker(void *G, void *T, void *S) {
+  maskGlobal   *global = (maskGlobal  *)G;
+  fastqRecord  *rec    = (fastqRecord *)S;
+
+  // Mate a.  The two mates use separate buffers, so each can be taken
+  // through all four steps on its own.
+  buildMask(rec->a2, rec->aLength, rec->af, global->keepNovel, global->exist, global->merSize);
+  removeIsolatedMers(rec->a2, rec->aLength, rec->af, global->minSize);
+  convertToBases(rec->a2, rec->aLength, rec->af, global->merSize, global->extend);
+  rec->aRetained = maskSequence(rec->a2, rec->aLength, rec->af, global->keepNovel, rec->am);
+
+  // Mate b.
+  buildMask(rec->b2, rec->bLength, rec->bf, global->keepNovel, global->exist, global->merSize);
+  removeIsolatedMers(rec->b2, rec->bLength, rec->bf, global->minSize);
+  convertToBases(rec->b2, rec->bLength, rec->bf, global->merSize, global->extend);
+  rec->bRetained = maskSequence(rec->b2, rec->bLength, rec->bf, global->keepNovel, rec->bm);
+
+  uint32  labelA = 2;
+  uint32  labelB = 2;
+
+  if (rec->aRetained < global->lowThreshold)
+    labelA = 0;
+  else if (rec->aRetained < global->highThreshold)
+    labelA = 1;
+
+  if (rec->bRetained < global->lowThreshold)
+    labelB = 0;
+  else if (rec->bRetained < global->highThreshold)
+    labelB = 1;
+
+  if ((labelA != labelB) && (global->demote))
+    labelA = labelB = (labelA < labelB) ? labelA : labelB;
+
+  if ((labelA != labelB) && (global->promote))
+    labelA = labelB = (labelA > labelB) ? labelA : labelB;
+
+  if ((labelA != labelB) && (global->discard))
+    labelA = labelB = 3;
+
+  rec->aLabel = labelA;
+  rec->bLabel = labelB;
+}
+
+
+// Writer callback: send each mate to the output stream selected by its
+// label, update the label-pair table and the histogram of retained
+// fractions, then free the record.
+//
+// NOTE(review): the histogram bin is 1000 * fraction retained, computed for
+// both mates even if only one input file is in use -- confirm that the
+// fraction of an absent mate always yields a bin inside the table.
+//
+void
+fastqWriter(void *G, void *S) {
+  maskGlobal   *global = (maskGlobal  *)G;
+  fastqRecord  *rec    = (fastqRecord *)S;
+
+  uint32  labelA = rec->aLabel;
+  uint32  labelB = rec->bLabel;
+
+  rec->write(global->OUTPUT1[labelA], global->OUTPUT2[labelB]);
+
+  global->thresholdCounts[labelA][labelB] += 1;
+
+  uint32  binA = (uint32)(1000 * rec->aRetained);
+  uint32  binB = (uint32)(1000 * rec->bRetained);
+
+  global->scoreHistogram[binA] += 1;
+  global->scoreHistogram[binB] += 1;
+
+  delete rec;
+}
+
+
+
+
+
+
+// Open a read file, choosing the method from the suffix of its name:
+//
+//   .fastq   opened directly
+//   .gz      read through 'gzip -dc'
+//   .bz2     read through 'bzip2 -dc'
+//   .xz      read through 'xz -dc'
+//
+// P is set to true if the returned stream is a pipe (to be closed with
+// pclose) and to false otherwise, including on failure.  Returns NULL if the
+// suffix is not recognized, if the file or pipe cannot be opened, or if the
+// decompression command does not fit into the command buffer.
+//
+// NOTE(review): the name is placed into the shell command unquoted, so a
+// name with spaces or shell metacharacters will not behave as a single
+// argument.
+//
+FILE *
+openInput(char *filename, bool &P) {
+  char         C[2 * FILENAME_MAX];
+  size_t       L    = strlen(filename);
+  const char  *tool = NULL;
+
+  P = false;
+
+  if ((L > 6) && (strcmp(filename + L - 6, ".fastq") == 0))
+    return(fopen(filename, "r"));
+
+  if ((L > 3) && (strcmp(filename + L - 3, ".gz") == 0))
+    tool = "gzip -dc";
+
+  if ((L > 4) && (strcmp(filename + L - 4, ".bz2") == 0))
+    tool = "bzip2 -dc";
+
+  if ((L > 3) && (strcmp(filename + L - 3, ".xz") == 0))
+    tool = "xz -dc";
+
+  if (tool == NULL)
+    return(NULL);
+
+  // Bounded formatting; a command that had to be cut short must not be run.
+  int  len = snprintf(C, sizeof(C), "%s %s", tool, filename);
+
+  if ((len < 0) || ((size_t)len >= sizeof(C)))
+    return(NULL);
+
+  FILE  *F = popen(C, "r");
+
+  P = (F != NULL);
+
+  return(F);
+}
+
+// Close a stream returned by openInput(): pclose() if P says it is a pipe,
+// fclose() otherwise.  A NULL stream is ignored; 'filename' is not used.
+//
+void
+closeInput(FILE *F, char *filename, bool P) {
+  if (F == NULL)
+    return;
+
+  if (P) {
+    pclose(F);
+  } else {
+    fclose(F);
+  }
+}
+
+
+
+// Open '<prefix>.<extension>.fastq' for writing and return the stream.
+//
+// Exits with an error message if the name does not fit into the name buffer
+// or if the file cannot be opened.  Failure is detected from the value
+// returned by fopen(); errno is only consulted for the message, so an errno
+// left behind by some earlier call cannot be mistaken for a failure here.
+//
+FILE *
+openOutput(char *prefix, const char *extension) {
+  char   N[FILENAME_MAX];
+
+  int  len = snprintf(N, sizeof(N), "%s.%s.fastq", prefix, extension);
+
+  if ((len < 0) || ((size_t)len >= sizeof(N)))
+    fprintf(stderr, "ERROR: output name '%s.%s.fastq' is too long\n", prefix, extension), exit(1);
+
+  FILE  *F = fopen(N, "w");
+
+  if (F == NULL)
+    fprintf(stderr, "ERROR: failed to open '%s': %s\n", N, strerror(errno)), exit(1);
+
+  return(F);
+}
+
+// Close a stream returned by openOutput().  A NULL stream is ignored;
+// 'prefix' and 'extension' are not used.
+//
+void
+closeOutput(FILE *F, char *prefix, const char *extension) {
+  if (F == NULL)
+    return;
+
+  fclose(F);
+}
+
+
+
+// Driver for the read masker.  Parses the command line, opens the FASTQ
+// inputs and the eight classified outputs, loads (or builds and optionally
+// saves) the kmer existDB, runs the loader/worker/writer pipeline, and then
+// reports the pair classification counts and the optional histogram.
+//
+// Exits 1 on a usage error or when an output file cannot be opened.
+int
+main(int argc, char **argv) {
+  maskGlobal  *g = new maskGlobal();
+
+  uint32  numWorkers = 1;
+  bool    beVerbose  = false;
+
+  int32 arg=1;
+  int32 err=0;
+  while (arg < argc) {
+    // Options that consume a value only match when a value follows them.  A
+    // trailing value-less option falls through to the usage error instead of
+    // handing argv[argc] (NULL) to atoi()/atof().
+    bool  hasValue = (arg + 1 < argc);
+
+    if        (hasValue && (strcmp(argv[arg], "-mdb") == 0)) {
+      g->merName = argv[++arg];
+
+    } else if (hasValue && (strcmp(argv[arg], "-ms") == 0)) {
+      g->merSize = atoi(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-l") == 0)) {
+      g->maxLength = atoi(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-edb") == 0)) {
+      g->existName = argv[++arg];
+
+    } else if (hasValue && (strcmp(argv[arg], "-1") == 0)) {
+      g->seq1Name = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-2") == 0)) {
+      g->seq2Name = argv[++arg];
+
+    } else if (hasValue && (strcmp(argv[arg], "-o") == 0)) {
+      g->outPrefix = argv[++arg];
+
+    } else if (hasValue && (strcmp(argv[arg], "-m") == 0)) {
+      g->minSize = atoi(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-e") == 0)) {
+      g->extend = atoi(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-t") == 0)) {
+      numWorkers = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+
+    } else if (strcmp(argv[arg], "-novel") == 0) {
+      g->keepNovel = true;
+    } else if (strcmp(argv[arg], "-confirmed") == 0) {
+      g->keepConfirmed = true;
+
+    } else if (strcmp(argv[arg], "-demote") == 0) {
+      g->demote = true;
+    } else if (strcmp(argv[arg], "-promote") == 0) {
+      g->promote = true;
+    } else if (strcmp(argv[arg], "-discard") == 0) {
+      g->discard = true;
+
+    // Only the first three characters are compared, so any option starting
+    // with "-lo" or "-hi" selects the corresponding threshold.
+    } else if (hasValue && (strncmp(argv[arg], "-lowthreshold", 3) == 0)) {
+      g->lowThreshold = atof(argv[++arg]);
+    } else if (hasValue && (strncmp(argv[arg], "-highthreshold", 3) == 0)) {
+      g->highThreshold = atof(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-h") == 0)) {
+      g->outputHistogram = argv[++arg];
+
+    } else {
+      err++;
+    }
+
+    arg++;
+  }
+  if ((g->keepNovel == false) && (g->keepConfirmed == false))
+    err++;
+  if (err) {
+    fprintf(stderr, "usage: %s [-novel | -confirmed] ...\n", argv[0]);
+    fprintf(stderr, " -mdb mer-database load masking kmers from meryl 'mer-database'\n");
+    fprintf(stderr, " -ms mer-size \n");
+    fprintf(stderr, " -edb exist-database save masking kmers to 'exist-database' for faster restarts\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -1 in.1.fastq input reads - fastq, fastq.gz, fastq.bz2 or fastq.xz\n");
+    fprintf(stderr, " -2 in.2.fastq - (optional, but if not present, messes up the output classification)\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -o out output reads:\n");
+    fprintf(stderr, " out.fullymasked.[12].fastq - reads with below 'lowthreshold' bases retained\n");
+    fprintf(stderr, " out.partiallymasked.[12].fastq - reads in between\n");
+    fprintf(stderr, " out.retained.[12].fastq - reads with more than 'highthreshold' bases retained\n");
+    fprintf(stderr, " out.discarded.[12].fastq - reads with conflicting status\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -m min-size ignore database hits below this many consecutive kmers (%d)\n", g->minSize);
+    fprintf(stderr, " -e extend-size extend database hits across this many missing kmers (%d)\n", g->extend);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -novel RETAIN novel sequence not present in the database\n");
+    fprintf(stderr, " -confirmed RETAIN confirmed sequence present in the database\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -promote promote the lesser RETAINED read to the status of the more RETAINED read\n");
+    fprintf(stderr, " read1=fullymasked and read2=partiallymasked -> both are partiallymasked\n");
+    fprintf(stderr, " -demote demote the more RETAINED read to the status of the lesser RETAINED read\n");
+    fprintf(stderr, " read1=fullymasked and read2=partiallymasked -> both are fullymasked\n");
+    fprintf(stderr, " -discard discard pairs with conflicting status (DEFAULT)\n");
+    fprintf(stderr, " read1=fullymasked and read2=partiallymasked -> both are discarded\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "stats on stderr, number of sequences with amount RETAINED:\n");
+    fprintf(stderr, " -lowthreshold t (%.4f)\n", g->lowThreshold);
+    fprintf(stderr, " -highthreshold t (%.4f)\n", g->highThreshold);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -h histogram write a histogram of the amount of sequence RETAINED\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -t t use 't' compute threads\n");
+    fprintf(stderr, " -v show progress\n");
+
+    if ((g->keepNovel == false) && (g->keepConfirmed == false))
+      fprintf(stderr, "ERROR: exactly one of -novel and -confirmed must be supplied.\n");
+
+    exit(1);
+  }
+
+  // Open inputs
+
+  g->FASTQ1 = openInput(g->seq1Name, g->FASTQ1pipe);
+  g->FASTQ2 = openInput(g->seq2Name, g->FASTQ2pipe);
+
+  g->OUTPUT1[0] = openOutput(g->outPrefix, "fullymasked.1");
+  g->OUTPUT1[1] = openOutput(g->outPrefix, "partiallymasked.1");
+  g->OUTPUT1[2] = openOutput(g->outPrefix, "retained.1");
+  g->OUTPUT1[3] = openOutput(g->outPrefix, "discarded.1");
+
+  g->OUTPUT2[0] = openOutput(g->outPrefix, "fullymasked.2");
+  g->OUTPUT2[1] = openOutput(g->outPrefix, "partiallymasked.2");
+  g->OUTPUT2[2] = openOutput(g->outPrefix, "retained.2");
+  g->OUTPUT2[3] = openOutput(g->outPrefix, "discarded.2");
+
+
+  // Load data: reuse a saved existDB when one is present, otherwise build it
+  // from the mer database and save it if a name was supplied.
+
+  if ((g->existName != NULL) && (fileExists(g->existName))) {
+    if (beVerbose)
+      fprintf(stderr, "Load existDB existName='%s'.\n", g->existName);
+    g->exist = new existDB(g->existName);
+
+  } else {
+    if (beVerbose)
+      fprintf(stderr, "Build existDB from merName='%s'.\n", g->merName);
+    g->exist = new existDB(g->merName, g->merSize, existDBnoFlags, 0, ~uint32ZERO);
+
+    if (g->existName != NULL) {
+      if (beVerbose)
+        fprintf(stderr, "Save existDB into existName='%s'.\n", g->existName);
+      g->exist->saveState(g->existName);
+    }
+  }
+
+  // Process!
+
+  sweatShop *ss = new sweatShop(fastqLoader, maskWorker, fastqWriter);
+
+  ss->setNumberOfWorkers(numWorkers);
+
+  ss->setWorkerBatchSize(1024);
+
+  ss->setLoaderQueueSize(numWorkers * 81920);
+  ss->setWriterQueueSize(numWorkers * 81920);
+
+  ss->run(g, beVerbose);
+
+  closeInput(g->FASTQ1, g->seq1Name, g->FASTQ1pipe);
+  closeInput(g->FASTQ2, g->seq2Name, g->FASTQ2pipe);
+
+  closeOutput(g->OUTPUT1[0], g->outPrefix, "fullymasked.1");
+  closeOutput(g->OUTPUT1[1], g->outPrefix, "partiallymasked.1");
+  closeOutput(g->OUTPUT1[2], g->outPrefix, "retained.1");
+  closeOutput(g->OUTPUT1[3], g->outPrefix, "discarded.1");
+
+  closeOutput(g->OUTPUT2[0], g->outPrefix, "fullymasked.2");
+  closeOutput(g->OUTPUT2[1], g->outPrefix, "partiallymasked.2");
+  closeOutput(g->OUTPUT2[2], g->outPrefix, "retained.2");
+  closeOutput(g->OUTPUT2[3], g->outPrefix, "discarded.2");
+
+  // Classification of read 1 (rows) against read 2 (columns).
+  fprintf(stderr, " bBelow bNormal bHigh bDiscarded\n");
+  fprintf(stderr, "aBelow %8lu %8lu %8lu %8lu\n", g->thresholdCounts[0][0], g->thresholdCounts[0][1], g->thresholdCounts[0][2], g->thresholdCounts[0][3]);
+  fprintf(stderr, "aNormal %8lu %8lu %8lu %8lu\n", g->thresholdCounts[1][0], g->thresholdCounts[1][1], g->thresholdCounts[1][2], g->thresholdCounts[1][3]);
+  fprintf(stderr, "aHigh %8lu %8lu %8lu %8lu\n", g->thresholdCounts[2][0], g->thresholdCounts[2][1], g->thresholdCounts[2][2], g->thresholdCounts[2][3]);
+  fprintf(stderr, "aDiscarded %8lu %8lu %8lu %8lu\n", g->thresholdCounts[3][0], g->thresholdCounts[3][1], g->thresholdCounts[3][2], g->thresholdCounts[3][3]);
+
+  if (g->outputHistogram != NULL) {
+    errno = 0;
+    FILE *H = fopen(g->outputHistogram, "w");
+
+    // Report an unwritable histogram file instead of dereferencing NULL.
+    if (H == NULL)
+      fprintf(stderr, "ERROR: failed to open '%s': %s\n", g->outputHistogram, strerror(errno)), exit(1);
+
+    // One line per non-empty bin; bin i is the fraction i/1000.
+    fprintf(H, "# amount of sequence retained\n");
+    for (uint32 i=0; i<1001; i++)
+      if (g->scoreHistogram[i] > 0)
+        fprintf(H, "%.4f\t%u\n", i / 1000.0, g->scoreHistogram[i]);
+
+    fclose(H);
+  }
+
+  delete g;
+
+  exit(0);
+}
diff --git a/libkmer/merTable.H b/libkmer/merTable.H
new file mode 100644
index 0000000..7588957
--- /dev/null
+++ b/libkmer/merTable.H
@@ -0,0 +1,76 @@
+#ifndef MERTABLE_H
+#define MERTABLE_H
+
+// The obvious simple small mer table, appropriate for large sequences
+
+#error merTable should be unused
+
+// Direct-indexed table from a mer (used as an integer) to the list of stream
+// positions where it occurs.  The table has 4^merSize + 1 index slots.
+class merTable {
+public:
+  merTable() {
+    // The destructor deletes both arrays, so they must start out null.
+    merToPositions = 0L;
+    positions      = 0L;
+  };
+  ~merTable() {
+    delete [] merToPositions;
+    delete [] positions;
+  };
+
+  // Build the table from the mers of CS, in two passes over the stream:
+  // count the occurrences of each mer, then store every position.
+  void  build(seqStream *CS, uint32 merSize, uint32 merSkip=0) {
+
+    // Release the tables from any previous build.
+    //
+    delete [] merToPositions;
+    delete [] positions;
+
+    merToPositions = 0L;
+    positions      = 0L;
+
+    // Allocate the mer table
+    //
+    // NOTE(review): the shift wraps for merSize >= 16 - confirm callers stay below that.
+    //
+    uint32  tableSize = uint32ONE << (2*merSize);
+    fprintf(stderr, "allocate "uint32FMT" entries for a merTable.\n", tableSize);
+
+    merToPositions = new uint32 [tableSize+1];
+
+    // First pass, count the number of times we see each mer
+    //
+    for (uint32 i=0; i<=tableSize; i++)
+      merToPositions[i] = 0;
+
+    merStream  MS(merSize, CS);
+
+    while (MS.nextMer(merSkip)) {
+      uint64  m = (uint64)MS.theFMer();
+      //fprintf(stderr, "add mer "uint64FMT"\n", m);
+      merToPositions[m]++;
+    }
+
+    // Convert those to indexes into positions - m[i] is the start of
+    // the locations in positions[] where positions are stored.
+    //
+    for (uint32 pos=0, val=0, i=0; i<=tableSize; i++) {
+      val               = merToPositions[i];
+      merToPositions[i] = pos;
+      pos              += val;
+    }
+
+    // Allocate space
+    //
+    fprintf(stderr, "allocate "uint32FMT" entries for positions\n", merToPositions[tableSize]);
+    positions = new uint32 [merToPositions[tableSize]];
+
+    // Second pass, fill in positions
+    //
+    MS.rewind();
+
+    while (MS.nextMer(merSkip))
+      positions[ merToPositions[(uint64)MS.theFMer()]++ ] = MS.thePositionInStream();
+
+    // Filling advanced every merToPositions[m] to the end of its run, which
+    // is the start of the next mer's run.  Shift the index up by one slot so
+    // merToPositions[m] is again the start of m's run, and the last slot
+    // holds the total.
+    //
+    for (uint32 i=tableSize; i>0; i--)
+      merToPositions[i] = merToPositions[i-1];
+    merToPositions[0] = 0;
+  };
+
+  // Number of positions stored for 'mer'.
+  uint32  numberOfPositions(uint64 mer) {
+    return(merToPositions[mer+1] - merToPositions[mer]);
+  };
+
+  // The index'th stored position of 'mer', or ~uint32ZERO if index is past
+  // the end of the list for that mer.
+  uint32  getPosition(uint64 mer, uint32 index) {
+    if (index >= merToPositions[mer+1] - merToPositions[mer])
+      return(~uint32ZERO);
+    return(positions[merToPositions[mer] + index]);
+  };
+
+private:
+  uint32  *merToPositions;  // index into positions[]; merToPositions[mer] is the first entry for the mer
+  uint32  *positions;       // list of positions for mers, sorted by mer
+};
+
+#endif // MERTABLE_H
diff --git a/libkmer/percentCovered.C b/libkmer/percentCovered.C
new file mode 100644
index 0000000..947ff30
--- /dev/null
+++ b/libkmer/percentCovered.C
@@ -0,0 +1,66 @@
+#include "util++.H"
+#include "bio++.H"
+#include "existDB.H"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+// For every sequence in the query file, print the percentage of its bases
+// covered by 22-mers that exist in the mer file.  Sequences with no covered
+// bases print nothing.
+//
+// Usage: -m merfile -q queryfile.  Returns 1 if either file is missing.
+int
+main(int argc, char **argv) {
+  char  *merFile   = 0L;
+  char  *queryFile = 0L;
+
+  // Mer size shared by the database, the mer stream and the covered intervals.
+  const int  merSize = 22;
+
+  int arg=1;
+  while (arg < argc) {
+    // Both options need a value; without one they are reported as unknown
+    // instead of reading argv[argc].
+    bool  hasValue = (arg + 1 < argc);
+
+    if        (hasValue && (strcmp(argv[arg], "-m") == 0)) {
+      merFile = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-q") == 0)) {
+      queryFile = argv[++arg];
+    } else {
+      fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  // Both files are required; do not hand NULL names to the constructors.
+  if ((merFile == 0L) || (queryFile == 0L)) {
+    fprintf(stderr, "usage: %s -m merfile -q queryfile\n", argv[0]);
+    return(1);
+  }
+
+  existDB   *E = new existDB(merFile, merSize, existDBnoFlags, 0, ~uint32ZERO);
+  seqCache  *Q = new seqCache(queryFile);
+  seqInCore *S = Q->getSequenceInCore();
+
+  intervalList<uint64>  IL;
+  speedCounter          SC(" %8f frags (%8.5f frags/sec)\r", 1, 1000, true);
+
+  while (S) {
+    // NOTE(review): the two 'true' flags presumably hand ownership of the
+    // builder and the stream to the merStream - confirm in merStream.H.
+    merStream  *MS = new merStream(new kMerBuilder(merSize),
+                                   new seqStream(S->sequence(), S->sequenceLength()),
+                                   true, true);
+
+    IL.clear();
+
+    // Mark the span of every mer found in the database.
+    while (MS->nextMer()) {
+      if (E->exists(MS->theFMer())) {
+        IL.add(MS->thePositionInSequence(), merSize);
+      }
+    }
+
+    // Merge the spans so overlapping bases are counted once.
+    IL.merge();
+
+    if (IL.sumOfLengths() > 0) {
+      fprintf(stdout, "%5.2f\n",
+              100.0 * IL.sumOfLengths() / (double)S->sequenceLength());
+    }
+
+    delete MS;
+    delete S;
+
+    SC.tick();
+
+    S = Q->getSequenceInCore();
+  }
+
+  delete Q;
+  delete E;
+
+  return(0);
+}
+
diff --git a/libkmer/positionDB-access.C b/libkmer/positionDB-access.C
new file mode 100644
index 0000000..de3cb58
--- /dev/null
+++ b/libkmer/positionDB-access.C
@@ -0,0 +1,344 @@
+#include "bio++.H"
+#include "positionDB.H"
+
+
+// Make sure posn[] has room for 'len' more values after the posnLen already
+// stored.  When it does not, a larger array is allocated (posnMax is
+// updated), the stored values are copied over and the old array is freed.
+// Aborts if the allocation fails.
+void
+positionDB::reallocateSpace(uint64*&  posn,
+                            uint64&   posnMax,
+                            uint64&   posnLen,
+                            uint64    len) {
+
+  // Enough room already.
+  if (posnMax >= posnLen + len)
+    return;
+
+  uint64  *pp = 0L;
+
+  // Grow to what is needed plus a quarter of the request as slack.
+  posnMax = posnLen + len + (len >> 2);
+
+  if (posnMax == 0)
+    posnMax = 16384;
+
+  try {
+    pp = new uint64 [posnMax];
+  } catch (...) {
+    fprintf(stderr, "positionDB::get()-- Can't allocate space for more positions, requested "uint64FMT" uint64's.\n", posnMax);
+    abort();
+  }
+
+  // Copy only when something is stored: memcpy() must not be given a null
+  // source, and posn can still be unallocated the first time through.
+  if (posnLen > 0)
+    memcpy(pp, posn, sizeof(uint64) * posnLen);
+
+  delete [] posn;
+  posn = pp;
+}
+
+
+
+// Append to posn[] the positions of the bucket entry that starts at bit J of
+// _buckets, and set 'count' for that entry.
+//
+// After the check value the entry holds a pointer/position field, a one bit
+// flag and, when _sizeWidth is not zero, a stored count.  If the flag is set
+// the first field is the single position itself; otherwise it indexes a list
+// in _positions whose first value is the list length.
+void
+positionDB::loadPositions(uint64   J,
+                          uint64*& posn,
+                          uint64&  posnMax,
+                          uint64&  posnLen,
+                          uint64&  count) {
+
+  uint64  widths[3] = {_pptrWidth, 1, _sizeWidth};
+  uint64  fields[3] = {0, 0, 1};
+
+  // The count field is only decoded when the table stores one.
+  if (_sizeWidth == 0)
+    getDecodedValues(_buckets, J + _chckWidth, 2, widths, fields);
+  else
+    getDecodedValues(_buckets, J + _chckWidth, 3, widths, fields);
+
+  // With a stored count this is the real count.  Without one it is the
+  // default of 1, replaced below by the list length when there is a list.
+  count = fields[2];
+
+  if (fields[1]) {
+    // Single position stored directly in the entry.
+    reallocateSpace(posn, posnMax, posnLen, 64);
+    posn[posnLen++] = fields[0];
+    return;
+  }
+
+  uint64  bit       = fields[0] * _posnWidth;
+  uint64  remaining = getDecodedValue(_positions, bit, _posnWidth);
+
+  if (_sizeWidth == 0)
+    count = remaining;
+
+  reallocateSpace(posn, posnMax, posnLen, remaining + 64);
+
+  // The positions follow the length, one _posnWidth field each.
+  while (remaining > 0) {
+    bit += _posnWidth;
+    posn[posnLen++] = getDecodedValue(_positions, bit, _posnWidth);
+    remaining--;
+  }
+}
+
+
+
+// Look up 'mer' and load its positions.
+//
+// posnLen is always reset to zero.  On a hit the positions are appended to
+// posn[] by loadPositions(), which also sets count, and true is returned.
+// On a miss false is returned and count is not modified.
+bool
+positionDB::getExact(uint64   mer,
+                     uint64*& posn,
+                     uint64&  posnMax,
+                     uint64&  posnLen,
+                     uint64&  count) {
+  uint64  bucket = HASH(mer);
+  uint64  check  = CHECK(mer);
+  uint64  first  = 0;
+  uint64  last   = 0;
+
+  // The hash table is either bit-packed (_hashTable_BP) or a plain array
+  // (_hashTable_FW); slot h and slot h+1 bound the entries of bucket h.
+  if (_hashTable_BP) {
+    first = getDecodedValue(_hashTable_BP, bucket * _hashWidth,              _hashWidth);
+    last  = getDecodedValue(_hashTable_BP, bucket * _hashWidth + _hashWidth, _hashWidth);
+  } else {
+    first = _hashTable_FW[bucket];
+    last  = _hashTable_FW[bucket+1];
+  }
+
+  posnLen = 0;
+
+  // Each entry is _wFin bits wide and starts with its check value.  An empty
+  // bucket skips the loop entirely.
+  uint64  J = first * _wFin;
+
+  for (uint64 i=first; i<last; i++) {
+    if (check == getDecodedValue(_buckets, J, _chckWidth)) {
+      loadPositions(J, posn, posnMax, posnLen, count);
+      return(true);
+    }
+
+    J += _wFin;
+  }
+
+  return(false);
+}
+
+
+// Return true if 'mer' is stored in the table: some entry in its hash
+// bucket carries the same check value.
+bool
+positionDB::existsExact(uint64 mer) {
+  uint64  bucket = HASH(mer);
+  uint64  check  = CHECK(mer);
+  uint64  first  = 0;
+  uint64  last   = 0;
+
+  // Slot h and slot h+1 of the hash table bound the entries of bucket h.
+  if (_hashTable_BP) {
+    first = getDecodedValue(_hashTable_BP, bucket * _hashWidth,              _hashWidth);
+    last  = getDecodedValue(_hashTable_BP, bucket * _hashWidth + _hashWidth, _hashWidth);
+  } else {
+    first = _hashTable_FW[bucket];
+    last  = _hashTable_FW[bucket+1];
+  }
+
+  // Compare check values entry by entry; entries are _wFin bits apart.  An
+  // empty bucket skips the loop entirely.
+  uint64  J = first * _wFin;
+
+  for (uint64 i=first; i<last; i++) {
+    if (check == getDecodedValue(_buckets, J, _chckWidth))
+      return(true);
+
+    J += _wFin;
+  }
+
+  return(false);
+}
+
+
+// Return the count for 'mer', or 0 if it is not in the table.
+//
+// The count is the value stored in the entry when the table keeps counts
+// (_sizeWidth > 0).  Otherwise it is 1 for an entry flagged as holding a
+// single position, or the length of its position list.
+uint64
+positionDB::countExact(uint64 mer) {
+  uint64  h = HASH(mer);
+  uint64  c = CHECK(mer);
+  uint64 st, ed;
+
+  // Slot h and slot h+1 of the hash table bound the entries of bucket h.
+  if (_hashTable_BP) {
+    st = getDecodedValue(_hashTable_BP, h * _hashWidth,              _hashWidth);
+    ed = getDecodedValue(_hashTable_BP, h * _hashWidth + _hashWidth, _hashWidth);
+  } else {
+    st = _hashTable_FW[h];
+    ed = _hashTable_FW[h+1];
+  }
+
+  if (st == ed)
+    return(0);
+
+  for (uint64 i=st, J=st * _wFin; i<ed; i++, J += _wFin) {
+    if (c == getDecodedValue(_buckets, J, _chckWidth)) {
+      uint64  sizs[3] = {_pptrWidth, 1, _sizeWidth};
+      uint64  vals[3] = {0};
+
+      // Decode the count field only when the table has one, as
+      // loadPositions(), filter() and dump() do; a zero-width field is
+      // never handed to the decoder.
+      getDecodedValues(_buckets, J + _chckWidth, (_sizeWidth == 0) ? 2 : 3, sizs, vals);
+
+      if (_sizeWidth > 0)
+        return(vals[2]);
+
+      if (vals[1])
+        return(1);
+
+      // No stored count: the first value of the position list is its length.
+      return(getDecodedValue(_positions, vals[0] * _posnWidth, _posnWidth));
+    }
+  }
+
+  return(0);
+}
+
+
+// Overwrite the stored count of 'mer' with 'count'.
+//
+// Returns 'count' if the mer was found, 0 if it is not in the table.
+uint64
+positionDB::setCount(uint64 mer, uint64 count) {
+  uint64  bucket = HASH(mer);
+  uint64  check  = CHECK(mer);
+  uint64  first  = 0;
+  uint64  last   = 0;
+
+  // Slot h and slot h+1 of the hash table bound the entries of bucket h.
+  if (_hashTable_BP) {
+    first = getDecodedValue(_hashTable_BP, bucket * _hashWidth,              _hashWidth);
+    last  = getDecodedValue(_hashTable_BP, bucket * _hashWidth + _hashWidth, _hashWidth);
+  } else {
+    first = _hashTable_FW[bucket];
+    last  = _hashTable_FW[bucket+1];
+  }
+
+  // An empty bucket skips the loop entirely.
+  uint64  J = first * _wFin;
+
+  for (uint64 i=first; i<last; i++, J += _wFin) {
+    if (check != getDecodedValue(_buckets, J, _chckWidth))
+      continue;
+
+    // The count field follows the check value, the pointer field and the
+    // one bit flag.
+    setDecodedValue(_buckets, J + _chckWidth + _pptrWidth + 1, _sizeWidth, count);
+    return(count);
+  }
+
+  return(0);
+}
+
+
+
+// Remove from the table every mer whose count is outside [lo, hi).
+//
+// Buckets and position lists are compacted in place: entries are read from
+// their current location and kept ones are written back at the next free
+// location, which is never ahead of the read location.  The hash table is
+// rewritten as each bucket is finished.  Requires a table that stores
+// counts; exits with an error otherwise.  A summary of what was removed is
+// printed to stderr.
+void
+positionDB::filter(uint64 lo,
+                   uint64 hi) {
+  uint64  st=0, ed=0;  // iteration through buckets
+  uint64  nb=0;        // bit position of the next (write) bucket
+  uint64  np=0;        // bit position of the next (write) position
+  uint64  vv;
+
+  uint64  loCount = 0;
+  uint64  okCount = 0;
+  uint64  hiCount = 0;
+
+  uint64  sizs[4] = {_chckWidth, _pptrWidth, 1, _sizeWidth};
+  uint64  vals[4] = {0, 0, 0, 0};
+
+  //dump("posDB.before");
+
+  fprintf(stderr, "positionDB::filter()-- Filtering out kmers less than "uint64FMT" and more than "uint64FMT"\n", lo, hi);
+
+  if (_sizeWidth == 0) {
+    // Single copy mers in a table without counts can be multi-copy
+    // when combined with their reverse-complement mer.
+    fprintf(stderr, "positionDB::filter()-- ERROR!\n");
+    fprintf(stderr, "positionDB::filter()-- ERROR! No count information; filtering will break canonical assumptions.\n");
+    fprintf(stderr, "positionDB::filter()-- ERROR!\n");
+    exit(1);
+  }
+
+  // Grab the start of the first (current) bucket. We reset the
+  // hashTable at the end of the loop, forcing us to keep st
+  // up-to-date, instead of grabbing it anew each iteration.
+  //
+  if (_hashTable_BP)
+    st = getDecodedValue(_hashTable_BP, 0, _hashWidth);
+  else
+    st = _hashTable_FW[0];
+
+  // Over all buckets
+  //
+  for (uint64 h=0; h<_tableSizeInEntries; h++) {
+
+    // Grab the end of this bucket - the end is always for the
+    // current structure. This gets reset at the end of the loop.
+    //
+    if (_hashTable_BP)
+      ed = getDecodedValue(_hashTable_BP, h * _hashWidth + _hashWidth, _hashWidth);
+    else
+      ed = _hashTable_FW[h+1];
+
+    // Over all entries in the bucket
+    //
+    while (st < ed) {
+      uint64  cb = st * _wFin;
+
+      getDecodedValues(_buckets, cb, (_sizeWidth == 0) ? 3 : 4, sizs, vals);
+
+      // Argh. Tricky. We need to grab the count stored in the
+      // table, but if it's a single mer, there is no count.
+
+      uint64  count = 1;            // Real count over the whole data set
+      uint64  len   = 1;            // Number of times the mer occurs in this subset
+      uint64  cp    = ~uint64ZERO;  // current position pointer, used to copy position information
+
+      // If not a unique mer in this table, len and cp are defined.
+      if (vals[2] == 0) {
+        cp    = vals[1] * _posnWidth;
+        len   = getDecodedValue(_positions, cp, _posnWidth);
+        count = len;
+      }
+
+      // The size stored in the bucket is to be believed
+      if (_sizeWidth > 0)
+        count = vals[3];
+
+      // What happened here: By default, the count is 1. If it is
+      // NOT a unique mer in the table, we reset the count to the
+      // number of entries in the table. Then, if there is a count
+      // stored in the table, we reset the count again.
+
+      // Move on to copying the data, if in the correct range.
+
+      if (vals[2] == 1) {
+        // Is a single mer in our table. Copy if the actual count is
+        // acceptable.
+        if ((lo <= count) && (count < hi)) {
+          okCount++;
+          setDecodedValues(_buckets, nb, (_sizeWidth == 0) ? 3 : 4, sizs, vals);
+          nb += _wFin;
+        } else {
+          _numberOfDistinct--;
+          _numberOfMers--;
+
+          // The stored count can exceed the range even for a mer that
+          // occurs once here, so report the side it actually fell on.
+          if (count < lo)
+            loCount++;
+          else
+            hiCount++;
+        }
+      } else {
+        // Mer has more than one location in the table. Copy all
+        // locations if the count is acceptable.
+        if ((lo <= count) && (count < hi)) {
+          okCount++;
+
+          // Copy the bucket
+          vals[1] = np / _posnWidth;
+          setDecodedValues(_buckets, nb, (_sizeWidth == 0) ? 3 : 4, sizs, vals);
+          nb += _wFin;
+
+          // Copy length of the positions
+          if (cp != np)
+            setDecodedValue(_positions, np, _posnWidth, len);
+          np += _posnWidth;
+          cp += _posnWidth;
+
+          // Copy positions
+          while (len > 0) {
+            if (cp != np)
+              setDecodedValue(_positions, np, _posnWidth,
+                              getDecodedValue(_positions, cp, _posnWidth));
+            np += _posnWidth;
+            cp += _posnWidth;
+            len--;
+          }
+        } else {
+          // Not acceptable count.  hi is exclusive, so count == hi is
+          // reported with the high side.
+          //
+          // NOTE(review): this branch adjusts _numberOfEntries while the
+          // single mer branch adjusts _numberOfMers - confirm that the
+          // asymmetry is intended.
+          _numberOfDistinct--;
+          _numberOfEntries -= len;
+
+          if (count < lo)
+            loCount++;
+          else
+            hiCount++;
+        }
+      }
+
+      // Move to the next entry
+      st++;
+    }  // Over all entries in the bucket
+
+    // Update the end position of this bucket
+    if (_hashTable_BP)
+      setDecodedValue(_hashTable_BP, h * _hashWidth + _hashWidth, _hashWidth, nb / _wFin);
+    else
+      _hashTable_FW[h+1] = nb / _wFin;
+
+  }  // Over all buckets
+
+  fprintf(stderr, "positionDB::filter()-- Filtered "uint64FMT" kmers less than "uint64FMT"\n", loCount, lo);
+  fprintf(stderr, "positionDB::filter()-- Filtered "uint64FMT" kmers more than "uint64FMT"\n", hiCount, hi);
+  fprintf(stderr, "positionDB::filter()-- Saved "uint64FMT" kmers with acceptable count\n", okCount);
+
+  //dump("posDB.after");
+}
diff --git a/libkmer/positionDB-dump.C b/libkmer/positionDB-dump.C
new file mode 100644
index 0000000..f6633da
--- /dev/null
+++ b/libkmer/positionDB-dump.C
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "positionDB.H"
+#include "bio++.H"
+
+
+// Write a text listing of the table to the file 'name'.
+//
+// Each hash bucket gets a "B" line with its index and entry range, followed
+// by one line per entry: 'U' or 'D' for the state of the one bit flag, the
+// check value, the pointer/position field and the size field, then the
+// listed positions for 'D' entries.  If the file cannot be opened a message
+// is printed to stderr and nothing is written.
+void
+positionDB::dump(char *name) {
+  uint64  sizs[4] = {_chckWidth, _pptrWidth, 1, _sizeWidth};
+  uint64  vals[4] = {0, 0, 0, 0};
+  FILE   *F       = fopen(name, "w");
+
+  // Do not write through a NULL stream.
+  if (F == NULL) {
+    fprintf(stderr, "positionDB::dump()-- Can't open '%s' for writing.\n", name);
+    return;
+  }
+
+  for (uint64 h=0; h<_tableSizeInEntries; h++) {
+    uint64 st, ed;
+
+    if (_hashTable_BP) {
+      st = getDecodedValue(_hashTable_BP, h * _hashWidth,              _hashWidth);
+      ed = getDecodedValue(_hashTable_BP, h * _hashWidth + _hashWidth, _hashWidth);
+    } else {
+      st = _hashTable_FW[h];
+      ed = _hashTable_FW[h+1];
+    }
+
+    fprintf(F, "B "uint64FMT" "uint64FMT"-"uint64FMT"\n", h, st, ed);
+
+    while (st < ed) {
+      uint64  cb = st * _wFin;
+
+      // The size field is decoded only when the table stores one; vals[3]
+      // keeps its initial zero otherwise.
+      getDecodedValues(_buckets, cb, (_sizeWidth == 0) ? 3 : 4, sizs, vals);
+
+      fprintf(F, "%c chk="uint64HEX" pos="uint64FMT" siz="uint64FMT,
+              (vals[2] == 0) ? 'D' : 'U', vals[0], vals[1], vals[3]);
+
+      if (vals[2] == 0) {
+        // The first value of the list is its length, the positions follow.
+        uint64  pos = vals[1] * _posnWidth;
+        uint64  len = getDecodedValue(_positions, pos, _posnWidth);
+
+        for (pos += _posnWidth; len > 0; pos += _posnWidth, len--)
+          fprintf(F, " "uint64FMT, getDecodedValue(_positions, pos, _posnWidth));
+      }
+
+      fprintf(F, "\n");
+
+      st++;
+    }
+  }
+
+  fclose(F);
+}
diff --git a/libkmer/positionDB-file.C b/libkmer/positionDB-file.C
new file mode 100644
index 0000000..d8f7af2
--- /dev/null
+++ b/libkmer/positionDB-file.C
@@ -0,0 +1,211 @@
+#include "positionDB.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+static
+char magic[16] = { 'p', 'o', 's', 'i', 't', 'i', 'o', 'n', 'D', 'B', '.', 'v', '1', ' ', ' ', ' ' };  // 16-byte signature of a valid file; loadState() rejects anything else
+static
+char faild[16] = { 'p', 'o', 's', 'i', 't', 'i', 'o', 'n', 'D', 'B', 'f', 'a', 'i', 'l', 'e', 'd' };  // placeholder saveState() writes first to seekable files; loadState() reports it as an incomplete file
+
+// Write the table to 'filename': a 16-byte signature, the positionDB object
+// itself (with its pointers cleared), then the hash table, buckets,
+// positions and hashed errors.  Exits with status 1 on failure.
+//
+// For a seekable file the 'faild' signature is written first and replaced
+// by 'magic' once all the data is written, so an interrupted save is
+// recognizable.  A pipe cannot be rewound, so it gets 'magic' up front.
+//
+// NOTE(review): the file is opened without O_TRUNC, so saving over a longer
+// existing file leaves its old tail in place - confirm that is intended.
+void
+positionDB::saveState(char const *filename) {
+
+  fprintf(stderr, "Saving positionDB to '%s'\n", filename);
+
+  // Success of open/lseek/write is judged by the return value; errno is
+  // only consulted afterwards to describe a failure.
+  errno = 0;
+  int F = open(filename, O_RDWR | O_CREAT | O_LARGEFILE,
+               S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+  if (F < 0) {
+    fprintf(stderr, "Can't open '%s' for writing positionDB.\n%s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  bool magicFirst = false;
+
+  // Test if this is a pipe. If so, we write the magic first,
+  // otherwise we write the magic last.
+  //
+  errno = 0;
+  if ((lseek(F, 0, SEEK_SET) < 0) && (errno == ESPIPE))
+    magicFirst = true;
+
+  // Clear the ESPIPE left behind by the probe so that it is not reported as
+  // a failure of the write below.
+  errno = 0;
+  if (write(F, (magicFirst) ? magic : faild, sizeof(char) * 16) != 16) {
+    fprintf(stderr, "positionDB::saveState()-- Write failure on magic first.\n%s\n", strerror(errno));
+    exit(1);
+  }
+
+  // If only to be completely annoying and anal, we clear the
+  // pointers before we write the data. Sure, we could just write
+  // the stuff we care about, but this is easier. This is easier.
+  // Before you go rip out this stuff, remember that you can now
+  // checksum the resulting files. So don't do it.
+  //
+  uint32  *bs = _bucketSizes;
+  uint64  *cb = _countingBuckets;
+  uint64  *hp = _hashTable_BP;
+  uint32  *hw = _hashTable_FW;
+  uint64  *bu = _buckets;
+  uint64  *ps = _positions;
+  uint64  *he = _hashedErrors;
+
+  // The two hash table pointers are replaced by 0/1 flags recording which
+  // representation is in use; loadState() tests them to pick what to read.
+  _bucketSizes     = 0L;
+  _countingBuckets = 0L;
+  _hashTable_BP    = (uint64 *)((_hashTable_BP) ? uint64ONE : uint64ZERO);
+  _hashTable_FW    = (uint32 *)((_hashTable_FW) ? uint32ONE : uint32ZERO);
+  _buckets         = 0L;
+  _positions       = 0L;
+  _hashedErrors    = 0L;
+
+  safeWrite(F, this, "this", sizeof(positionDB) * 1);
+
+  _bucketSizes     = bs;
+  _countingBuckets = cb;
+  _hashTable_BP    = hp;
+  _hashTable_FW    = hw;
+  _buckets         = bu;
+  _positions       = ps;
+  _hashedErrors    = he;
+
+  if (_hashTable_BP) {
+    safeWrite(F, _hashTable_BP, "_hashTable_BP", sizeof(uint64) * (_tableSizeInEntries * _hashWidth / 64 + 1));
+  } else {
+    safeWrite(F, _hashTable_FW, "_hashTable_FW", sizeof(uint32) * (_tableSizeInEntries + 1));
+  }
+
+  safeWrite(F, _buckets,      "_buckets",      sizeof(uint64) * (_numberOfDistinct * _wFin / 64 + 1));
+  safeWrite(F, _positions,    "_positions",    sizeof(uint64) * (_numberOfEntries * _posnWidth / 64 + 1));
+  safeWrite(F, _hashedErrors, "_hashedErrors", sizeof(uint64) * (_hashedErrorsLen));
+
+  // Everything is written; replace the placeholder with the real signature.
+  if (magicFirst == false) {
+    errno = 0;
+    if (lseek(F, 0, SEEK_SET) < 0) {
+      fprintf(stderr, "positionDB::saveState()-- Failed to seek to start of file -- write failed.\n%s\n", strerror(errno));
+      exit(1);
+    }
+
+    errno = 0;
+    if (write(F, magic, sizeof(char) * 16) != 16) {
+      fprintf(stderr, "positionDB::saveState()-- Write failure on magic last.\n%s\n", strerror(errno));
+      exit(1);
+    }
+  }
+
+  close(F);
+}
+
+
+// Print the 16 signature bytes that were read next to the ones expected.
+static
+void
+reportSignature(char const *cigam) {
+  fprintf(stderr, "positionDB::loadState()-- Read '%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c'\n",
+          cigam[0],  cigam[1],  cigam[2],  cigam[3],
+          cigam[4],  cigam[5],  cigam[6],  cigam[7],
+          cigam[8],  cigam[9],  cigam[10], cigam[11],
+          cigam[12], cigam[13], cigam[14], cigam[15]);
+  fprintf(stderr, "positionDB::loadState()-- Expected '%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c'\n",
+          magic[0],  magic[1],  magic[2],  magic[3],
+          magic[4],  magic[5],  magic[6],  magic[7],
+          magic[8],  magic[9],  magic[10], magic[11],
+          magic[12], magic[13], magic[14], magic[15]);
+}
+
+
+// Load a table written by saveState() from 'filename'.
+//
+// Returns false if the file cannot be opened or does not start with the
+// expected signature; with beNoisy set, the reason for a bad signature is
+// printed.  Otherwise the positionDB object is read over 'this' and, when
+// loadData is set, the hash table, buckets, positions and hashed errors are
+// allocated and read, and true is returned.
+//
+// NOTE(review): with loadData false, _hashTable_BP and _hashTable_FW keep
+// the 0/1 flag values stored in the file rather than real pointers -
+// confirm no caller or the destructor treats them as arrays.
+bool
+positionDB::loadState(char const *filename, bool beNoisy, bool loadData) {
+  char   cigam[16] = { 0 };
+
+  fprintf(stderr, "Loading positionDB from '%s'\n", filename);
+
+  // Success is judged by the returned descriptor; errno only describes a failure.
+  errno = 0;
+  int F = open(filename, O_RDONLY | O_LARGEFILE, 0);
+  if (F < 0) {
+    fprintf(stderr, "Can't open '%s' for reading pre-built positionDB: %s\n", filename, strerror(errno));
+    return(false);
+  }
+
+  safeRead(F, cigam, "Magic Number", sizeof(char) * 16);
+
+  // Anything but 'magic' is rejected; 'faild' marks a save that never finished.
+  if (strncmp(magic, cigam, 16) != 0) {
+    if (beNoisy) {
+      if (strncmp(faild, cigam, 16) == 0)
+        fprintf(stderr, "positionDB::loadState()-- Incomplete positionDB binary file.\n");
+      else
+        fprintf(stderr, "positionDB::loadState()-- Not a positionDB binary file, maybe a sequence file?\n");
+
+      reportSignature(cigam);
+    }
+
+    close(F);
+    return(false);
+  }
+
+  safeRead(F, this, "positionDB", sizeof(positionDB) * 1);
+
+  _bucketSizes     = 0L;
+  _countingBuckets = 0L;
+  _buckets         = 0L;
+  _positions       = 0L;
+  _hashedErrors    = 0L;
+
+  if (loadData) {
+    uint64  hs = _tableSizeInEntries * _hashWidth / 64 + 1;
+    uint64  bs = _numberOfDistinct   * _wFin      / 64 + 1;
+    uint64  ps = _numberOfEntries    * _posnWidth / 64 + 1;
+
+    // _hashTable_BP as read from the file is a flag, not a pointer: nonzero
+    // means the bit-packed hash table was saved.
+    if (_hashTable_BP) {
+      _hashTable_BP = new uint64 [hs];
+      _hashTable_FW = 0L;
+      safeRead(F, _hashTable_BP, "_hashTable_BP", sizeof(uint64) * hs);
+    } else {
+      _hashTable_BP = 0L;
+      _hashTable_FW = new uint32 [_tableSizeInEntries + 1];
+      safeRead(F, _hashTable_FW, "_hashTable_FW", sizeof(uint32) * (_tableSizeInEntries + 1));
+    }
+
+    _buckets      = new uint64 [bs];
+    _positions    = new uint64 [ps];
+    _hashedErrors = new uint64 [_hashedErrorsMax];
+
+    safeRead(F, _buckets,      "_buckets",      sizeof(uint64) * bs);
+    safeRead(F, _positions,    "_positions",    sizeof(uint64) * ps);
+    safeRead(F, _hashedErrors, "_hashedErrors", sizeof(uint64) * _hashedErrorsLen);
+  }
+
+  close(F);
+
+  return(true);
+}
+
+
+
+// Print the table's size parameters and counters to 'stream', one
+// "name: value" line each.
+void
+positionDB::printState(FILE *stream) {
+  FILE  *out = stream;
+
+  // Mer and table geometry
+  fprintf(out, "merSizeInBases: "uint32FMT"\n", _merSizeInBases);
+  fprintf(out, "merSkipInBases: "uint32FMT"\n", _merSkipInBases);
+  fprintf(out, "tableSizeInBits: "uint32FMT"\n", _tableSizeInBits);
+  fprintf(out, "tableSizeInEntries: "uint64FMT"\n", _tableSizeInEntries);
+
+  // Field widths
+  fprintf(out, "hashWidth: "uint32FMT"\n", _hashWidth);
+  fprintf(out, "chckWidth: "uint32FMT"\n", _chckWidth);
+  fprintf(out, "posnWidth: "uint32FMT"\n", _posnWidth);
+
+  // Counters
+  fprintf(out, "numberOfMers: "uint64FMT"\n", _numberOfMers);
+  fprintf(out, "numberOfPositions: "uint64FMT"\n", _numberOfPositions);
+  fprintf(out, "numberOfDistinct: "uint64FMT"\n", _numberOfDistinct);
+  fprintf(out, "numberOfUnique: "uint64FMT"\n", _numberOfUnique);
+  fprintf(out, "numberOfEntries: "uint64FMT"\n", _numberOfEntries);
+  fprintf(out, "maximumEntries: "uint64FMT"\n", _maximumEntries);
+}
+
diff --git a/libkmer/positionDB-mismatch.C b/libkmer/positionDB-mismatch.C
new file mode 100644
index 0000000..464b82d
--- /dev/null
+++ b/libkmer/positionDB-mismatch.C
@@ -0,0 +1,388 @@
+#include "positionDB.H"
+#include "bio++.H"
+
+// qsort() comparator: orders two uint64 values ascending, returning -1, 0 or 1.
+static
+int
+stringscmp(const void *A, const void *B) {
+  uint64 const lhs = *(uint64 const *)A;
+  uint64 const rhs = *(uint64 const *)B;
+
+  // Compare explicitly; a 64-bit difference would not fit in the int result.
+  return((lhs < rhs) ? -1 : ((lhs > rhs) ? 1 : 0));
+}
+
+// Sorts strings[0..stringsLen) ascending and squeezes out duplicates in place; returns how many distinct values remain at the front.
+static
+uint32
+makeUnique(uint64 *strings, uint32 stringsLen) {
+  // An empty list has no distinct values; without this guard the scan below would report one.
+  if (stringsLen == 0)
+    return(0);
+
+  qsort(strings, stringsLen, sizeof(uint64), stringscmp);
+
+  uint32 len = 0;  // index of the last distinct value kept so far
+  for (uint32 nxt=1; nxt<stringsLen; nxt++)
+    if (strings[len] != strings[nxt])
+      strings[++len] = strings[nxt];
+  return(len+1);
+}
+
+
+#if 0
+// debug: prints each pattern as ts characters ('1' for a set bit, bit 0 first), a tab, and the count of set bits.
+static
+void
+dumpPatterns(uint64 *strings, uint32 stringsLen, uint32 ts) {
+  for (uint32 s=0; s<stringsLen; s++) {
+    char    bits[1024] = {0};
+    uint32  nSet       = 0;
+    uint64  pattern    = strings[s];
+
+    for (uint32 b=0; b<ts; b++) {
+      bool isSet = (pattern & (uint64ONE << b)) != 0;
+
+      bits[b] = isSet ? '1' : '0';
+      if (isSet)
+        nSet++;
+    }
+
+    fprintf(stdout, "%s\t"uint32FMT"\n", bits, nSet);
+  }
+}
+#endif
+
+
+// Builds _hashedErrors: the sorted list of distinct xor masks (in hash space) covering every
+// substitution pattern of up to nErrorsAllowed bases (6 at most) in a mer of _merSizeInBases bases.
+// Returns an estimate of the work a single mismatch lookup will cost with approxMers mers in the table.
+double
+positionDB::setUpMismatchMatcher(uint32 nErrorsAllowed, uint64 approxMers) {
+
+  // Fail fast: the enumeration below only handles up to six errors, and is expensive.
+  if (7 <= nErrorsAllowed) {
+    fprintf(stderr, "Only 6 errors allowed.\n");
+    exit(1);
+  }
+
+  // Release the list built by any earlier call; the table-size search calls this repeatedly.
+  delete [] _hashedErrors;
+
+  _nErrorsAllowed  = nErrorsAllowed;
+  _hashedErrorsLen = 0;
+  _hashedErrorsMax = 0;
+  _hashedErrors    = 0L;
+
+  uint32   stringsMax = 128 * 1024 * 1024;
+  uint32   stringsLen = 0;
+  uint64  *strings    = new uint64 [stringsMax];
+
+  uint64   totpat = 0;
+  uint64   toterr = 0;
+
+  uint64   m1, m2, m3, m4, m5, m6;
+  uint64  *e1, *e2, *e3, *e4, *e5, *e6;
+
+  {
+    // This can be trivially eliminated by replacing e1[x] with the err[] statement.
+    uint32 ne = 3;
+    for (uint32 x=1; x<_nErrorsAllowed; x++)
+      ne *= 3;
+
+    //fprintf(stderr, "Storing ne="uint32FMT" errors.\n", ne);
+
+    e1 = new uint64 [ne];
+    e2 = new uint64 [ne];
+    e3 = new uint64 [ne];
+    e4 = new uint64 [ne];
+    e5 = new uint64 [ne];
+    e6 = new uint64 [ne];
+
+    // The three non-zero values a two-bit base can be xor'ed with, replicated across the word.
+    uint64 err[3] = { 0x5555555555555555llu, 0xaaaaaaaaaaaaaaaallu, 0xffffffffffffffffllu };
+
+    // Treat x as a base-3 number: digit k selects the substitution applied at the k-th error position.
+    for (uint32 x=0; x<ne; x++) {
+      e1[x] = err[(x/ 1) % 3];
+      e2[x] = err[(x/ 3) % 3];
+      e3[x] = err[(x/ 9) % 3];
+      e4[x] = err[(x/ 27) % 3];
+      e5[x] = err[(x/ 81) % 3];
+      e6[x] = err[(x/243) % 3];
+    }
+  }
+
+
+  // Zero errors
+  strings[stringsLen++] = uint64ZERO;
+
+
+  // One error
+  if (1 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++) {
+      totpat++;
+      toterr += 3;
+      m1 = 0x03llu << (ai * 2);
+
+      for (uint32 x=0; x<3; x++)
+        strings[stringsLen++] = HASH((m1 & e1[x]));
+    }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE1 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Two errors
+  if (2 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++)
+      for (uint32 bi=0; bi<ai; bi++) {
+        totpat++;
+        toterr += 9;
+        m1 = 0x03llu << (ai * 2);
+        m2 = 0x03llu << (bi * 2);
+
+        for (uint32 x=0; x<9; x++)
+          strings[stringsLen++] = HASH((m1 & e1[x]) ^ (m2 & e2[x]));
+      }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE2 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Three errors
+  if (3 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++)
+      for (uint32 bi=0; bi<ai; bi++)
+        for (uint32 ci=0; ci<bi; ci++) {
+          totpat++;
+          toterr += 27;
+          m1 = 0x03llu << (ai * 2);
+          m2 = 0x03llu << (bi * 2);
+          m3 = 0x03llu << (ci * 2);
+
+          for (uint32 x=0; x<27; x++)
+            strings[stringsLen++] = HASH((m1 & e1[x]) ^ (m2 & e2[x]) ^ (m3 & e3[x]));
+        }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE3 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Four errors
+  if (4 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++)
+      for (uint32 bi=0; bi<ai; bi++)
+        for (uint32 ci=0; ci<bi; ci++)
+          for (uint32 di=0; di<ci; di++) {
+            totpat++;
+            toterr += 81;
+            m1 = 0x03llu << (ai * 2);
+            m2 = 0x03llu << (bi * 2);
+            m3 = 0x03llu << (ci * 2);
+            m4 = 0x03llu << (di * 2);
+
+            for (uint32 x=0; x<81; x++)
+              strings[stringsLen++] = HASH((m1 & e1[x]) ^ (m2 & e2[x]) ^ (m3 & e3[x]) ^ (m4 & e4[x]));
+          }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE4 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Five errors
+  if (5 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++)
+      for (uint32 bi=0; bi<ai; bi++)
+        for (uint32 ci=0; ci<bi; ci++)
+          for (uint32 di=0; di<ci; di++)
+            for (uint32 ei=0; ei<di; ei++) {
+              totpat++;
+              toterr += 243;
+              m1 = 0x03llu << (ai * 2);
+              m2 = 0x03llu << (bi * 2);
+              m3 = 0x03llu << (ci * 2);
+              m4 = 0x03llu << (di * 2);
+              m5 = 0x03llu << (ei * 2);
+
+              // Nearly out of buffer: squeeze out duplicates to make room for the next batch.
+              if (stringsLen + 32000 >= stringsMax)
+                stringsLen = makeUnique(strings, stringsLen);
+
+              for (uint32 x=0; x<243; x++)
+                strings[stringsLen++] = HASH((m1 & e1[x]) ^ (m2 & e2[x]) ^ (m3 & e3[x]) ^ (m4 & e4[x]) ^ (m5 & e5[x]));
+            }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE5 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Six errors
+  if (6 <= _nErrorsAllowed) {
+    for (uint32 ai=0; ai<_merSizeInBases; ai++)
+      for (uint32 bi=0; bi<ai; bi++)
+        for (uint32 ci=0; ci<bi; ci++)
+          for (uint32 di=0; di<ci; di++)
+            for (uint32 ei=0; ei<di; ei++)
+              for (uint32 fi=0; fi<ei; fi++) {
+                totpat++;
+                toterr += 729;
+                m1 = 0x03llu << (ai * 2);
+                m2 = 0x03llu << (bi * 2);
+                m3 = 0x03llu << (ci * 2);
+                m4 = 0x03llu << (di * 2);
+                m5 = 0x03llu << (ei * 2);
+                m6 = 0x03llu << (fi * 2);
+
+                if (stringsLen + 32000 >= stringsMax)
+                  stringsLen = makeUnique(strings, stringsLen);
+
+                for (uint32 x=0; x<729; x++)
+                  strings[stringsLen++] = HASH((m1 & e1[x]) ^ (m2 & e2[x]) ^ (m3 & e3[x]) ^ (m4 & e4[x]) ^ (m5 & e5[x]) ^ (m6 & e6[x]));
+              }
+
+    stringsLen = makeUnique(strings, stringsLen);
+    //dumpPatterns(strings, stringsLen, _tableSizeInBits);
+    //fprintf(stderr, "DONE6 totpat="uint64FMT" toterr="uint64FMT" stringsLen="uint32FMT"\n", totpat, toterr, stringsLen);
+  }
+
+
+  // Sanity: entry 0 is the zero-error mask; every other mask is non-zero and fits in the hash.
+  for (uint32 i=1; i<stringsLen; i++) {
+    assert((strings[i] & ~_hashMask) == 0);
+    assert(strings[i] != 0);
+  }
+
+  delete [] e1;
+  delete [] e2;
+  delete [] e3;
+  delete [] e4;
+  delete [] e5;
+  delete [] e6;
+
+  _hashedErrorsLen = stringsLen;
+  _hashedErrorsMax = stringsLen;
+  _hashedErrors    = new uint64 [_hashedErrorsLen];
+
+  memcpy(_hashedErrors, strings, sizeof(uint64) * _hashedErrorsLen);
+
+  delete [] strings;
+
+#ifdef UNCOMPRESS_HASH_TABLE
+  // Cost is just bucket searching.
+  double work = (double)_hashedErrorsLen * approxMers / _tableSizeInEntries;
+#else
+  // Cost is bucket searching + hash table lookups.
+  double work = (double)_hashedErrorsLen * approxMers / _tableSizeInEntries + 2.0 * _hashedErrorsLen;
+#endif
+
+  //fprintf(stderr, "Built "uint32FMT" hashed errors at tableSize "uint32FMT" (work=%f.0).\n",
+  // _hashedErrorsLen,
+  // _tableSizeInBits,
+  // work);
+
+  //for (uint32 i=0; i<_hashedErrorsLen; i++)
+  // fprintf(stderr, "he["uint32FMTW(5)"] = "uint64HEX"\n", i, _hashedErrors[i]);
+
+  return(work);
+}
+
+
+// Looks up every mer in the table within numMismatches substitutions of 'mer' and hands each hit to loadPositions().
+// Returns true when posnLen > 0 afterwards, i.e. hits with _AT_MOST_ numMismatches mistakes exist; posn is allocated here when posnMax == 0.
+bool
+positionDB::getUpToNMismatches(uint64 mer,
+ uint32 numMismatches,
+ uint64*& posn,
+ uint64& posnMax,
+ uint64& posnLen) {
+
+ PREFETCH(_hashedErrors); // Slightly better.
+
+ posnLen = 0;
+
+ if (_hashedErrors == 0L) {
+ fprintf(stderr, "ERROR: Nobody initialized getUpToNMismatches() by calling setUpMismatchMatcher().\n");
+ exit(1);
+ }
+
+ if (posnMax == 0) {  // caller supplied no buffer: start with a default-sized one
+ posnMax = 16384;
+ try {
+ posn = new uint64 [posnMax];
+ } catch (...) {
+ fprintf(stderr, "positionDB::getUpToNMismatches()-- Can't allocate space for initial positions, requested "uint64FMT" uint64's.\n", posnMax);
+ abort();
+ }
+ }
+
+ uint64 orig = HASH(mer);
+
+ // Optimization that didn't work. The idea was to compute all the
+ // hashes with errors, then sort to gain better cache locality in
+ // the lookups. The sort dominated.
+ //
+ // Another: Surprisingly, the two getDecodedValue calls are faster
+ // than a single getDecodedValues.
+
+ // NOTE(review): only the masks built by setUpMismatchMatcher() are tried, so presumably numMismatches should not exceed _nErrorsAllowed -- confirm.
+ for (uint32 e=0; e<_hashedErrorsLen; e++) {
+ uint64 hash = orig ^ _hashedErrors[e];  // hash of a mer with this error pattern applied
+ uint64 st, ed;
+
+ if (_hashTable_BP) {  // packed table: entries are _hashWidth wide
+ st = getDecodedValue(_hashTable_BP, hash * _hashWidth, _hashWidth);
+ ed = getDecodedValue(_hashTable_BP, hash * _hashWidth + _hashWidth, _hashWidth);
+ } else {  // otherwise a plain uint32 array
+ st = _hashTable_FW[hash];
+ ed = _hashTable_FW[hash+1];
+ }
+
+ assert((_hashedErrors[e] & ~_hashMask) == 0);
+ assert((hash & ~_hashMask) == 0);
+
+ // Rebuild the mer from the hash and its check code.
+ //
+ // Compare the rebuilt mer and the original mer -- if they differ
+ // in at most numMismatches bases, it's a hit!  (One pass covers
+ // every error count from zero up to the limit.)
+ //
+ // Before rebuilding, compute diffs on the chckBits only -- if
+ // things are wildly different (the usual case) we'll get
+ // enough difference here to abort. Remember, the chck bits
+ // are not encoded, they're an exact copy from the unhashed
+ // mer.
+
+ if (st != ed) {
+ for (uint64 i=ed-st, J=st * _wFin; i--; J += _wFin) {  // visit the ed-st entries of this bucket, _wFin apart
+ uint64 chck = getDecodedValue(_buckets, J, _chckWidth);
+ uint64 diffs = chck ^ (mer & _mask2);
+ uint64 d1 = diffs & uint64NUMBER(0x5555555555555555);  // low bit of each two-bit base
+ uint64 d2 = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa);  // high bit of each two-bit base
+ uint64 err = countNumberOfSetBits64(d1 | (d2 >> 1));  // one set bit per differing base
+
+ if (err <= numMismatches) {
+ diffs = REBUILD(hash, chck) ^ mer;
+ d1 = diffs & uint64NUMBER(0x5555555555555555);
+ d2 = diffs & uint64NUMBER(0xaaaaaaaaaaaaaaaa);
+ err = countNumberOfSetBits64(d1 | (d2 >> 1));
+
+ if (err <= numMismatches)
+ // err is junk, just need a parameter here
+ loadPositions(J, posn, posnMax, posnLen, err);
+ }
+ }
+ }
+ }
+
+ return(posnLen > 0);
+}
diff --git a/libkmer/positionDB-sort.C b/libkmer/positionDB-sort.C
new file mode 100644
index 0000000..76cb9c5
--- /dev/null
+++ b/libkmer/positionDB-sort.C
@@ -0,0 +1,150 @@
+#include "positionDB.H"
+#include "bio++.H"
+
+// Sifts the pair (C[i], P[i]) down the max-heap held in C[0..n), moving P[] in step with C[].
+void
+adjustHeap(uint64 *C,
+           uint64 *P, int64 i, int64 n) {
+  uint64 const keyC  = C[i];          // the pair being placed
+  uint64 const keyP  = P[i];
+  int64        hole  = i;             // slot currently waiting for a value
+  int64        child = 2 * i + 1;     // left child of the hole
+
+  for (; child < n; child = 2 * child + 1) {
+    if ((child + 1 < n) && (C[child] < C[child+1]))
+      child++;                        // pick the larger of the two children
+
+    if (keyC >= C[child])             // the hole is where the pair belongs
+      break;
+
+    C[hole] = C[child];               // move the larger child up a level
+    P[hole] = P[child];
+    hole    = child;
+  }
+
+  C[hole] = keyC;
+  P[hole] = keyP;
+}
+
+// Heap-sorts counting bucket b by check value (positions move along), updates the distinct/unique/entries/maximum counters, and writes the sorted entries back.
+void
+positionDB::sortAndRepackBucket(uint64 b) {
+ uint64 st = _bucketSizes[b];  // bucket b holds entries [st, ed)
+ uint64 ed = _bucketSizes[b+1];
+ uint32 le = (uint32)(ed - st);
+
+ if (ed < st)  // reported on stdout only; processing continues
+ fprintf(stdout, "ERROR: Bucket "uint64FMT" starts at "uint64FMT" ends at "uint64FMT"?\n", b, st, ed);
+
+ if (le == 0)
+ return;
+
+ // One mer in the list? It's distinct and unique! (and doesn't
+ // contribute to the position list space count)
+ //
+ if (le == 1) {
+ _numberOfDistinct++;
+ _numberOfUnique++;
+ return;
+ }
+
+ // Allocate more space, if we need to.
+ //
+ if (_sortedMax <= le) {
+ _sortedMax = le + 1024;
+ delete [] _sortedChck;
+ delete [] _sortedPosn;
+ _sortedChck = new uint64 [_sortedMax];
+ _sortedPosn = new uint64 [_sortedMax];
+ }
+
+ // Unpack the bucket
+ //
+ uint64 lens[3] = {_chckWidth, _posnWidth, 1 + _sizeWidth};
+ uint64 vals[3] = {0};
+ for (uint64 i=st, J=st * _wCnt; i<ed; i++, J += _wCnt) {  // entries are _wCnt apart
+ getDecodedValues(_countingBuckets, J, 2, lens, vals);  // only the first two fields (chck, posn) are needed
+ _sortedChck[i-st] = vals[0];
+ _sortedPosn[i-st] = vals[1];
+ }
+
+ // Create the heap of lines.
+ //
+ int unsetBucket = 0;
+
+ for (int64 t=(le-2)/2; t>=0; t--) {  // from the last parent down to the root
+ if (_sortedPosn[t] == uint64MASK(_posnWidth)) {  // all-ones position: presumably a slot that was never filled
+ unsetBucket = 1;
+ fprintf(stdout, "ERROR: unset posn bucket="uint64FMT" t="int64FMT" le="uint32FMT"\n", b, t, le);
+ }
+
+ adjustHeap(_sortedChck, _sortedPosn, t, le);
+ }
+
+ if (unsetBucket)  // dump the whole bucket to help diagnose the problem
+ for (uint32 t=0; t<le; t++)
+ fprintf(stdout, uint32FMTW(4)"] chck="uint64HEX" posn="uint64FMT"\n", t, _sortedChck[t], _sortedPosn[t]);
+
+ // Interchange the new maximum with the element at the end of the tree
+ //
+ for (int64 t=le-1; t>0; t--) {
+ uint64 tc = _sortedChck[t];
+ uint64 tp = _sortedPosn[t];
+
+ _sortedChck[t] = _sortedChck[0];
+ _sortedPosn[t] = _sortedPosn[0];
+
+ _sortedChck[0] = tc;
+ _sortedPosn[0] = tp;
+
+ adjustHeap(_sortedChck, _sortedPosn, 0, t);  // restore the heap over the remaining t elements
+ }
+
+ // Scan the list of sorted mers, counting the number of distinct and unique,
+ // and the space needed in the position list.
+
+ uint64 entries = 1; // For t=0
+
+ for (uint32 t=1; t<le; t++) {
+ if (_sortedChck[t-1] > _sortedChck[t])  // the sort above should make this impossible
+ fprintf(stdout, "ERROR: bucket="uint64FMT" t="uint32FMT" le="uint32FMT": "uint64HEX" > "uint64HEX"\n",
+ b, t, le, _sortedChck[t-1], _sortedChck[t]);
+
+ if (_sortedChck[t-1] != _sortedChck[t]) {  // a run of equal check values just ended
+ _numberOfDistinct++;
+
+ if (_maximumEntries < entries)
+ _maximumEntries = entries;
+
+ if (entries == 1)
+ _numberOfUnique++;
+ else
+ _numberOfEntries += entries + 1; // +1 for the length
+
+ entries = 0;
+ }
+
+ entries++;
+ }
+
+ // Don't forget the last mer!
+ //
+ _numberOfDistinct++;
+ if (_maximumEntries < entries)
+ _maximumEntries = entries;
+ if (entries == 1)
+ _numberOfUnique++;
+ else
+ _numberOfEntries += entries + 1;
+
+
+ // Repack the sorted entries
+ //
+ for (uint64 i=st, J=st * _wCnt; i<ed; i++, J += _wCnt) {
+ vals[0] = _sortedChck[i-st];
+ vals[1] = _sortedPosn[i-st];
+ vals[2] = 0;  // zero the third field (1 + _sizeWidth wide)
+ setDecodedValues(_countingBuckets, J, 3, lens, vals);
+ }
+}
+
diff --git a/libkmer/positionDB.C b/libkmer/positionDB.C
new file mode 100644
index 0000000..a9e927e
--- /dev/null
+++ b/libkmer/positionDB.C
@@ -0,0 +1,1125 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <new>
+
+#include "bio++.H"
+#include "positionDB.H"
+#include "existDB.H"
+#include "libmeryl.H"
+
+#undef ERROR_CHECK_COUNTING
+#undef ERROR_CHECK_COUNTING_ENCODING
+#undef ERROR_CHECK_EMPTY_BUCKETS
+
+// This tests Chunlin Xiao's discovered bug -- if there are a small
+// number of unique mers, compared to distinct mers (2 * #unique_mers
+// < #distinct_mers), we would overflow the position pointer in
+// buckets. This enables a check that it doesn't occur.
+//
+// This has a fixed allocation size, and crashes on larger inputs.
+//
+#undef TEST_NASTY_BUGS
+
+// Tests that mers are masked out properly. Doesn't handle canonical
+// mers though.
+//
+#undef MER_REMOVAL_TEST
+
+// Loads a positionDB from 'filename'.  Exits if the file cannot be read or, when loadData is set, if the stored mer size, mer skip or mismatch limit differ from the ones requested.
+positionDB::positionDB(char const *filename,
+                       uint32      merSize,
+                       uint32      merSkip,
+                       uint32      maxMismatch,
+                       bool        loadData) {
+  memset(this, 0, sizeof(positionDB));
+
+  // Read the state without the tables first, so the stored parameters can
+  // be checked.  loadData == false only for driver-posDB.C, and only so it
+  // can dump stats on a posDB file.
+  if (!loadState(filename, true, false)) {
+    fprintf(stderr, "positionDB()-- Tried to read state from '%s', but failed.\n", filename);
+    exit(1);
+  }
+
+  if (loadData) {
+    if (merSize != _merSizeInBases) {
+      fprintf(stderr, "positionDB()-- Tried to read state from '%s', but mer size is wrong (found "uint32FMT", wanted "uint32FMT").\n",
+              filename, _merSizeInBases, merSize);
+      exit(1);
+    }
+    if (merSkip != _merSkipInBases) {
+      fprintf(stderr, "positionDB()-- Tried to read state from '%s', but mer skip is wrong (found "uint32FMT", wanted "uint32FMT").\n",
+              filename, _merSkipInBases, merSkip);
+      exit(1);
+    }
+    if (maxMismatch != _nErrorsAllowed) {
+      fprintf(stderr, "positionDB()-- Tried to read state from '%s', but max number of mismatches is wrong (found "uint32FMT", wanted "uint32FMT").\n",
+              filename, _nErrorsAllowed, maxMismatch);
+      exit(1);
+    }
+  }
+
+  if (!loadState(filename, true, loadData)) {
+    fprintf(stderr, "positionDB()-- Tried to read state from '%s', but failed.\n", filename);
+    exit(1);
+  }
+}
+
+// Builds a positionDB from the mers in MS: picks a table size (tblBits) that fits within maxMemory MB (0 = no limit), sets up the derived widths and masks, then calls build().
+positionDB::positionDB(merStream *MS,
+ uint32 merSize,
+ uint32 merSkip,
+ existDB *mask,
+ existDB *only,
+ merylStreamReader *counts,
+ uint32 minCount,
+ uint32 maxCount,
+ uint32 maxMismatch,
+ uint32 maxMemory,
+ bool beVerbose) {
+
+ memset(this, 0, sizeof(positionDB));
+
+ // Guesstimate a nice table size based on the number of input mers
+ // and the mersize, unless the user gave us a table size.
+ //
+ // We need to ensure that
+ // 2 * merSize + posnWidth + 1 - 64 <= tblBits <= 2 * merSize - 4
+ //
+ // The catch is that we don't exactly know posnWidth right now. We
+ // can overestimate it, though, based on the size of the sequence
+ // that is backing the merStream.
+ //
+ // The second catch is that we don't want to make tblBits too big
+ // or too small. If too big, we waste a lot of memory in the hash
+ // table pointers, and if too small, we waste even more memory in
+ // the data table (not to mention the algorithm dies because it
+ // assumed buckets in the data table are small).
+ //
+ // The memory size is (roughly):
+ //
+ // 2^tblBits * log(numDistinctMers) +
+ // numDistinctMers * (2*merSize - tblBits + 1 + log(numMers)) +
+ // (numMers - numUniqueMers) * log(numMers)
+ //
+ // this is approximately proportional to:
+ //
+ // 2^tblBits * posnWidth +
+ // approxMers * (2*merSize - tblBits + 1 + posnWidth)
+ //
+ uint64 approxMers = MS->approximateNumberOfMers();
+ uint64 posnWidth = logBaseTwo64(approxMers + 1);
+
+ // Find the smallest and largest tblBits we could possibly use.
+ //
+ uint64 sm = 2 * merSize + posnWidth + 1 - 64;  // wraps around when the sum is below 64; corrected just below
+ uint64 lg = 2 * merSize - 4;
+
+ if (2 * merSize + posnWidth + 1 < 64)
+ sm = 2;
+
+ if (sm < 16)  // never consider fewer than 16 table bits
+ sm = 16;
+
+ if (sm > lg) {
+ fprintf(stderr, "ERROR: too many mers for this mersize!\n");
+ fprintf(stderr, " sm = "uint64FMT"\n", sm);
+ fprintf(stderr, " lg = "uint64FMT"\n", lg);
+ fprintf(stderr, " merSize = "uint32FMT" bits\n", 2 * merSize);
+ fprintf(stderr, " approxMers = "uint64FMT" mers\n", approxMers);
+ fprintf(stderr, " posnWidth = "uint64FMT" bits\n", posnWidth);
+ exit(1);
+ }
+
+
+ // Iterate through all the choices, picking the one with the
+ // smallest expected footprint.
+ //
+ {
+
+ if (beVerbose) {
+ fprintf(stderr, "potential configurations for approximately "uint64FMT" "uint32FMT"-mers (posnW="uint64FMT").\n",
+ approxMers, merSize, posnWidth);
+ }
+
+ uint64 mini = 0; // tblSize of the smallest found
+ uint64 minm = ~mini; // memory size of the smallest found
+ double minw = 0.0; // work of the smallest found
+
+ uint64 memory = 0;
+ double effort = 0;
+
+ if (maxMemory == 0)  // zero means "no limit"
+ maxMemory = ~uint32ZERO;
+
+ for (uint64 i=sm; i<=lg; i++) {
+
+ // These are only needed if maxMismatch is set, but it's
+ // simpler to always set.
+ //
+ _merSizeInBases = merSize;
+ _merSizeInBits = 2 * _merSizeInBases;
+ _merSkipInBases = merSkip;
+ _tableSizeInBits = i;
+ _tableSizeInEntries = uint64ONE << _tableSizeInBits;
+ _hashWidth = uint32ZERO;
+ _hashMask = uint64MASK(_tableSizeInBits);
+ _chckWidth = _merSizeInBits - _tableSizeInBits;
+ _posnWidth = uint64ZERO;
+ _sizeWidth = 0;
+
+ _shift1 = _merSizeInBits - _tableSizeInBits;
+ _shift2 = _shift1 / 2;
+ _mask1 = uint64MASK(_tableSizeInBits);
+ _mask2 = uint64MASK(_shift1);
+
+ // Everyone wants to know the memory size (in MB).
+ //
+ memory = ((uint64ONE << i) * posnWidth + approxMers * (2*merSize - i + 1 + posnWidth)) >> 23;  // bits -> MB
+
+ // If we know we're looking for mismatches, we compute the amount
+ // of work needed per lookup, and use that, instead of strict
+ // memory sizing, to decide the table size.
+ //
+ if (maxMismatch > 0)
+ effort = setUpMismatchMatcher(maxMismatch, approxMers);
+
+ // If our memory size is smaller than allowed, AND it's the
+ // smallest, or the work is smaller, save the table size.
+ //
+ if ((memory < maxMemory) &&
+ ((memory < minm) ||
+ (effort < minw))) {
+ mini = i;
+ minm = memory;
+ minw = effort;
+ }
+
+ if (beVerbose) {
+ fprintf(stderr, "tblBits="uint64FMTW(2)" shifts="uint32FMTW(02)","uint32FMTW(02)" -- size %8.3fGB -- work %8.3f%s\n",
+ i, _shift1, _shift2, memory / 1024.0, effort, (mini == i) ? " ***" : "");
+ }
+ }
+
+ _tableSizeInBits = mini;  // stays 0 when no candidate fit in maxMemory
+ }
+
+
+ if (_tableSizeInBits == 0) {
+ fprintf(stderr, "ERROR: No positionDB parameters within allowed memory limit.\n");
+ exit(1);
+ }
+
+
+ if (beVerbose) {
+ uint32 s1 = 2*merSize-_tableSizeInBits;
+ fprintf(stderr, "tblBits="uint32FMT" s1="uint32FMT" s2="uint32FMT" -- merSize="uint32FMT" bits + posnWidth="uint64FMT" bits (est "uint64FMT" mers) FINAL\n",
+ _tableSizeInBits, s1, s1/2, merSize, posnWidth, approxMers);
+ }
+
+
+ // The search loop left these set for the last candidate; redo them for the chosen size.
+ _merSizeInBases = merSize;
+ _merSizeInBits = 2 * _merSizeInBases;
+ _merSkipInBases = merSkip;
+ _tableSizeInEntries = uint64ONE << _tableSizeInBits;
+ _hashWidth = uint32ZERO;
+ _hashMask = uint64MASK(_tableSizeInBits);
+ _chckWidth = _merSizeInBits - _tableSizeInBits;
+ _posnWidth = uint64ZERO;
+ _sizeWidth = 0;
+
+ if (maxCount == 0)  // zero means "no upper limit"
+ maxCount = ~uint32ZERO;
+
+ if (counts)  // a size field is stored only when mer counts are supplied
+ _sizeWidth = (maxCount < ~uint32ZERO) ? logBaseTwo64(maxCount+1) : 32;
+
+ _shift1 = _merSizeInBits - _tableSizeInBits;
+ _shift2 = _shift1 / 2;
+ _mask1 = uint64MASK(_tableSizeInBits);
+ _mask2 = uint64MASK(_shift1);
+
+#if 0
+ fprintf(stderr, "merSizeInBits "uint32FMT"\n", _merSizeInBits);
+ fprintf(stderr, "hashWidth "uint32FMT"\n", _hashWidth);
+ fprintf(stderr, "chckWidth "uint32FMT"\n", _chckWidth);
+ fprintf(stderr, "shift1 "uint32FMT"\n", _shift1);
+ fprintf(stderr, "shift2 "uint32FMT"\n", _shift2);
+#endif
+
+ if (maxMismatch > 0)  // rebuild the error masks for the table size actually chosen
+ setUpMismatchMatcher(maxMismatch, approxMers);
+
+ build(MS, mask, only, counts, minCount, maxCount, beVerbose);
+}
+
+
+
+void
+positionDB::build(merStream *MS,
+ existDB *mask,
+ existDB *only,
+ merylStreamReader *counts,
+ uint32 minCount,
+ uint32 maxCount,
+ bool beVerbose) {
+
+ _bucketSizes = 0L;
+ _countingBuckets = 0L;
+ _hashTable_BP = 0L;
+ _hashTable_FW = 0L;
+ _buckets = 0L;
+ _positions = 0L;
+
+ _wCnt = 0;
+ _wFin = 0;
+
+ // For get/setDecodedValues().
+ uint64 lensC[4] = {~uint64ZERO, ~uint64ZERO, ~uint64ZERO, ~uint64ZERO};
+ uint64 lensF[4] = {~uint64ZERO, ~uint64ZERO, ~uint64ZERO, ~uint64ZERO};
+ uint64 vals[4] = {0};
+ uint64 nval = (_sizeWidth == 0) ? 3 : 4;
+
+ _numberOfMers = uint64ZERO;
+ _numberOfPositions = uint64ZERO;
+ _numberOfDistinct = uint64ZERO;
+ _numberOfUnique = uint64ZERO;
+ _numberOfEntries = uint64ZERO;
+ _maximumEntries = uint64ZERO;
+
+ // We assume later that these are already allocated.
+ _sortedMax = 16384;
+ _sortedChck = new uint64 [_sortedMax];
+ _sortedPosn = new uint64 [_sortedMax];
+
+ if (MS == 0L) {
+ fprintf(stderr, "positionDB()-- ERROR: No merStream? Nothing to build a table with!\n");
+ exit(1);
+ }
+
+ MS->rewind();
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 1) Count bucket sizes
+ //
+
+ // We'll later want to reuse the _bucketSizes space for storing the
+ // hash table. To make it somewhat safe, we allocate the space as
+ // uint64, then cast it to be uint32.
+ //
+ // bktAllocIsJunk tells us if we should release this memory (if we
+ // need to allocate separate space for the hash table). We'd need
+ // to do this if the hashWidth is more than 32 bits, but we won't
+ // know that for a little bit.
+ //
+ // The _bucketSizes is offset by one from bktAlloc so that we don't
+ // overwrite _bucketSizes when we are constructing hash table.
+ //
+ uint64 *bktAlloc;
+ try {
+ bktAlloc = new uint64 [_tableSizeInEntries / 2 + 4];
+ } catch (std::bad_alloc) {
+ fprintf(stderr, "positionDB()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+ fprintf(stderr, "positionDB()-- bktAlloc = new uint64 ["uint64FMT"]\n", _tableSizeInEntries / 2 + 4);
+ exit(1);
+ }
+ bool bktAllocIsJunk = false;
+
+ bzero(bktAlloc, sizeof(uint64) * (_tableSizeInEntries / 2 + 4));
+
+ // Why +2? We try to reuse the bktAlloc space for the hash table,
+ // which is constructed from the bucketSizes. The hashTable is
+ // built from the bucketSizes. It definitely needs to be +1, and
+ // so we use +2 just in case the human is being stupid again.
+ //
+ _bucketSizes = (uint32 *)(bktAlloc + 2);
+
+#ifdef ERROR_CHECK_COUNTING
+ fprintf(stdout, "ERROR_CHECK_COUNTING is defined.\n");
+ uint32 *_errbucketSizes = new uint32 [_tableSizeInEntries + 2];
+ for (uint64 i=0; i<_tableSizeInEntries + 2; i++)
+ _errbucketSizes[i] = uint32ZERO;
+#endif
+
+ if (beVerbose)
+ fprintf(stderr, " Allocated bucket size counting space with total size "uint64FMT" KB\n", _tableSizeInEntries >> 8);
+
+
+ speedCounter *C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+ // Two choices here
+ //
+ // 1) No masking or onlying is done. Stream the mers and just
+ // count the positions. This is the original behavior.
+ //
+ // 2) Masking or onlying is done. Open the output stream file,
+ // stream the mers by, checking for mask/only of both
+ // forward and reverse mers. If either is found, push
+ // the (forward) mer and position onto the stream.
+ // close the output stream.
+ //
+ // Save the mer if it doesn't exist in the mask (both f and r),
+ // or does exist in the only (either f or r), add it.
+ //
+ // The input databases for mask and only are (currently) made
+ // using canonical mers. We halve the number of exists() by
+ // also using canonical mers here.
+ //
+
+ MS->rewind();
+
+ while (MS->nextMer(_merSkipInBases)) {
+ _bucketSizes[ HASH(MS->theFMer()) ]++;
+
+#ifdef ERROR_CHECK_COUNTING
+ _errbucketSizes[ HASH(MS->theFMer()) ]++;
+#endif
+
+ _numberOfMers++;
+ _numberOfPositions = MS->thePositionInStream();
+ assert((_numberOfPositions >> 60) == 0);
+ C->tick();
+ }
+
+
+ delete C;
+ C = 0L;
+
+ if (beVerbose)
+ fprintf(stderr, " Found "uint64FMT" mers (max position = "uint64FMT")\n", _numberOfMers, _numberOfPositions);
+
+ // This caught a nasty bug in merStream rewind(), and it's pretty
+ // cheap, so I left it in. Search for the other DEBUGnumPositions.
+ //
+ uint64 DEBUGnumPositions = _numberOfPositions + 1;
+
+ // This is _numberOfMers+1 because we need to store the first
+ // position after the last mer. That is, if there are two mers, we
+ // will store that the first mer is at position 0, the second mer
+ // is at position 1, and the end of the second mer is at position
+ // 2.
+ //
+ // In reality, it should be the number of distinct mers, not the
+ // total number of mers, but we don't know that yet. And so
+ // occasionally we'll make things too big and waste a bit of
+ // memory.
+ //
+ _hashWidth = logBaseTwo64(_numberOfMers+1);
+ _posnWidth = logBaseTwo64(_numberOfPositions+1);
+
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ //
+ // 2) Allocate buckets and make bucketSizes be a pointer into them
+ //
+ _wCnt = _chckWidth + _posnWidth + 1 + _sizeWidth;
+
+ lensC[0] = _chckWidth;
+ lensC[1] = _posnWidth;
+ lensC[2] = 1;
+ lensC[3] = _sizeWidth;
+
+ uint64 bucketsSpace = (_numberOfMers+1) * _wCnt / 64 + 1;
+ uint32 endPosition = 0;
+
+ if (beVerbose)
+ fprintf(stderr, " Allocated "uint64FMT"KB for buckets ("uint64FMT" 64-bit words)\n", bucketsSpace >> 7, bucketsSpace);
+ try {
+ _countingBuckets = new uint64 [bucketsSpace];
+ } catch (std::bad_alloc) {
+ fprintf(stderr, "positionDB()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+ fprintf(stderr, "positionDB()-- _countingBuckets = new uint64 ["uint64FMT"]\n", bucketsSpace);
+ exit(1);
+ }
+
+ for (uint64 i=0; i<bucketsSpace; i++)
+ _countingBuckets[i] = ~uint64ZERO;
+
+ for (uint64 i=0; i<_tableSizeInEntries; i++) {
+ endPosition += _bucketSizes[i];
+ _bucketSizes[i] = endPosition;
+ }
+ _bucketSizes[_tableSizeInEntries] = endPosition;
+
+#ifdef ERROR_CHECK_COUNTING
+ if (endPosition != _numberOfMers)
+ fprintf(stdout, "ERROR_CHECK_COUNTING: BUCKETSIZE COUNTING PROBLEM -- endPos="uint32FMT" != numMers="uint64FMT"\n",
+ endPosition, _numberOfMers);
+#endif
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 3) Build list of mers with positions
+ //
+ if (beVerbose)
+ fprintf(stderr, " Building lists with positions.\n");
+
+ C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+#ifdef ERROR_CHECK_COUNTING_ENCODING
+ fprintf(stdout, "ERROR_CHECK_COUNTING_ENCODING is defined!\n");
+#endif
+
+
+ MS->rewind();
+
+ while (MS->nextMer(_merSkipInBases)) {
+ uint64 h = HASH(MS->theFMer());
+
+#ifdef ERROR_CHECK_COUNTING
+ if (_bucketSizes[h] == 0) {
+ char str[33];
+ fprintf(stderr, "positionDB()-- ERROR_CHECK_COUNTING: Bucket "uint64FMT" ran out of things! '%s'\n", h, MS->theFMer().merToString(str));
+ fprintf(stderr, "positionDB()-- ERROR_CHECK_COUNTING: Stream is at "uint64FMT"\n", MS->thePositionInStream());
+ }
+#endif
+
+ _bucketSizes[h]--;
+
+#ifdef ERROR_CHECK_COUNTING
+ _errbucketSizes[h]--;
+#endif
+
+
+#ifdef ERROR_CHECK_EMPTY_BUCKETS
+ // Check that everything is empty. Empty is defined as set to all 1's.
+ getDecodedValues(_countingBuckets, (uint64)_bucketSizes[h] * (uint64)_wCnt, nval, lensC, vals);
+
+ if (((~vals[0]) & uint64MASK(lensC[0])) ||
+ ((~vals[1]) & uint64MASK(lensC[1])) ||
+ ((~vals[2]) & uint64MASK(lensC[2])) ||
+ ((lensC[3] > 0) && ((~vals[3]) & uint64MASK(lensC[3]))))
+ fprintf(stdout, "ERROR_CHECK_EMPTY_BUCKETS: countingBucket not empty! pos=%lu 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n",
+ _bucketSizes[h] * _wCnt,
+ (~vals[0]) & uint64MASK(lensC[0]),
+ (~vals[1]) & uint64MASK(lensC[1]),
+ (~vals[2]) & uint64MASK(lensC[2]),
+ (~vals[3]) & uint64MASK(lensC[3]));
+#endif
+
+ vals[0] = CHECK(MS->theFMer());
+ vals[1] = MS->thePositionInStream();
+ vals[2] = 0;
+ vals[3] = 0;
+
+ setDecodedValues(_countingBuckets, (uint64)_bucketSizes[h] * (uint64)_wCnt, nval, lensC, vals);
+
+#ifdef ERROR_CHECK_COUNTING_ENCODING
+ getDecodedValues(_countingBuckets, (uint64)_bucketSizes[h] * (uint64)_wCnt, nval, lensC, vals);
+
+ if (vals[0] != CHECK(MS->theFMer()))
+ fprintf(stdout, "ERROR_CHECK_COUNTING_ENCODING error: CHCK corrupted! Wanted "uint64HEX" got "uint64HEX"\n",
+ CHECK(MS->theFMer()), vals[0]);
+ if (vals[1] != MS->thePositionInStream())
+ fprintf(stdout, "ERROR_CHECK_COUNTING_ENCODING error: POSN corrupted! Wanted "uint64HEX" got "uint64HEX"\n",
+ MS->thePositionInStream(), vals[1]);
+ if (vals[2] != 0)
+ fprintf(stdout, "ERROR_CHECK_COUNTING_ENCODING error: UNIQ corrupted.\n");
+ if (vals[3] != 0)
+ fprintf(stdout, "ERROR_CHECK_COUNTING_ENCODING error: SIZE corrupted.\n");
+#endif
+
+ C->tick();
+ }
+
+
+ delete C;
+ C = 0L;
+
+#ifdef ERROR_CHECK_COUNTING
+ for (uint64 i=0; i<_tableSizeInEntries; i++)
+ if (_errbucketSizes[i] != 0)
+ fprintf(stdout, "ERROR_CHECK_COUNTING: Bucket "uint32FMT" wasn't filled fully? "uint32FMT" left over.\n", i, _errbucketSizes[i]);
+
+ delete [] _errbucketSizes;
+ _errbucketSizes = 0L;
+#endif
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 4) Sort each bucket -- count:
+ // 1) number of distinct mers
+ // 2) number of unique mers
+ // 3) number of entries in position table ( sum mercount+1 for all mercounts > 1)
+ // also need to repack the sorted things
+ //
+ if (beVerbose)
+ fprintf(stderr, " Sorting and repacking buckets ("uint64FMT" buckets).\n", _tableSizeInEntries);
+
+ C = new speedCounter(" %7.2f Mbuckets -- %5.2f Mbuckets/second\r", 1000000.0, 0x1ffffff, beVerbose);
+ for (uint64 i=0; i<_tableSizeInEntries; i++) {
+ sortAndRepackBucket(i);
+ C->tick();
+ }
+ delete C;
+ C = 0L;
+
+ if (beVerbose)
+ fprintf(stderr,
+ " Found "uint64FMTW(12)" total mers\n"
+ " Found "uint64FMTW(12)" distinct mers\n"
+ " Found "uint64FMTW(12)" unique mers\n"
+ " Need "uint64FMT" non-unique position list entries ("uint64FMT" maximum count)\n",
+ _numberOfMers, _numberOfDistinct, _numberOfUnique, _numberOfEntries, _maximumEntries);
+
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Compute the size of the final bucket position entry. It's
+ // either a position into the sequence, or a pointer into a list of
+ // positions. In rare cases, the pointer is larger than the
+ // sequence position, and we need to do extra work.
+ //
+ // The width of position pointers (in buckets) is the max of
+ // _posnWidth (a pointer to the sequence position) and
+ // _pptrWidth (a pointer to an entry in the positions table).
+ //
+ _pptrWidth = logBaseTwo64(_numberOfEntries+1);
+ if (_pptrWidth < _posnWidth)
+ _pptrWidth = _posnWidth;
+
+ _wFin = _chckWidth + _pptrWidth + 1 + _sizeWidth;
+
+ lensF[0] = _chckWidth;
+ lensF[1] = _pptrWidth;
+ lensF[2] = 1;
+ lensF[3] = _sizeWidth;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 5) Allocate: real hash table, buckets and position table.
+ //
+
+ // XXXX how do we count the number of buckets/positions we never
+ // use because they are masked out??
+ //
+ // If we are just thresholding (ignore things with count > 100)
+ // it's easy, a simple loop over something.
+ //
+ // If we have an exist/only db....are they in the same order? Can
+ // we loop over both at the same time and count that way? That'd
+ // be cool! Mersize is the same, why can the table size be the
+ // same too -- OK, if the existDB has a small number of mers in it,
+ // then we don't need a large table.
+
+ uint64 hs = _tableSizeInEntries * _hashWidth / 64 + 1;
+ uint64 bs = _numberOfDistinct * _wFin / 64 + 1;
+ uint64 ps = _numberOfEntries * _posnWidth / 64 + 1;
+
+ if (_hashWidth <= 32) {
+ if (beVerbose)
+ fprintf(stderr, " Reusing bucket counting space for hash table.\n");
+
+#ifdef UNCOMPRESS_HASH_TABLE
+ _hashTable_BP = 0L;
+ _hashTable_FW = (uint32 *)bktAlloc;
+#else
+ _hashTable_BP = bktAlloc;
+ _hashTable_FW = 0L;
+#endif
+
+ bktAllocIsJunk = false;
+ } else {
+
+ // Can't use the full-width hash table, since the data size is >
+ // 32 bits -- we'd need to allocate 64-bit ints for it, and
+ // that'll likely be too big...and we'd need to have
+ // _hashTable_FW64 or something.
+
+ if (beVerbose)
+ fprintf(stderr, " Allocated "uint64FMTW(10)"KB for hash table ("uint64FMT" 64-bit words)\n", hs >> 7, hs);
+ try {
+ _hashTable_BP = new uint64 [hs];
+ _hashTable_FW = 0L;
+ } catch (std::bad_alloc) {
+ fprintf(stderr, "positionDB()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+ fprintf(stderr, "positionDB()-- _hashTable_BP = new uint64 ["uint64FMT"]\n", hs);
+ exit(1);
+ }
+ bktAllocIsJunk = true;
+ }
+
+
+ // If we have enough space to reuse the counting space, reuse it.
+ // Else, allocate more space.
+ //
+ // We need to ensure that there are enough bits and that the size
+ // of a bucket didn't increase. If the bucket size did increase,
+ // and we see more unique buckets than total mers (up to some
+ // point) we overwrite data.
+ //
+ // Recall that bucketSpace ~= numberOfMers * wCnt
+ //
+ if ((bs < bucketsSpace) && (_wFin <= _wCnt)) {
+ if (beVerbose)
+ fprintf(stderr, " Reusing bucket space; Have: "uint64FMT" Need: "uint64FMT" (64-bit words)\n", bucketsSpace, bs);
+
+ _buckets = _countingBuckets;
+
+ bs = bucketsSpace; // for output at the end
+ } else {
+ if (beVerbose)
+ fprintf(stderr, " Allocated "uint64FMTW(10)"KB for buckets ("uint64FMT" 64-bit words)\n", bs >> 7, bs);
+ try {
+ _buckets = new uint64 [bs];
+ } catch (std::bad_alloc) {
+ fprintf(stderr, "positionDB()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+ fprintf(stderr, "positionDB()-- _buckets = new uint64 ["uint64FMT"]\n", bs);
+ exit(1);
+ }
+ }
+
+ if (beVerbose)
+ fprintf(stderr, " Allocated "uint64FMTW(10)"KB for positions ("uint64FMT" 64-bit words)\n", ps >> 7, ps);
+ try {
+ _positions = new uint64 [ps];
+ } catch (std::bad_alloc) {
+ fprintf(stderr, "positionDB()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+ fprintf(stderr, "positionDB()-- _positions = new uint64 ["uint64FMT"\n", ps);
+ exit(1);
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // 6) Transfer from the sorted buckets to the hash table.
+ //
+ if (beVerbose)
+ fprintf(stderr, " Transferring to final structure ("uint64FMT" buckets).\n", _tableSizeInEntries);
+
+ uint64 bucketStartPosition = 0;
+
+ // Current positions and bit positions in the buckets and position list.
+ //
+ uint64 currentBbit = uint64ZERO; // Bit position into bucket
+ uint64 currentPbit = uint64ZERO; // Bit position into positions
+ uint64 currentPpos = uint64ZERO; // Value position into positions
+
+#ifdef TEST_NASTY_BUGS
+ // Save the position array pointer of each bucket for debugging.
+ //
+ uint64 currentBpos = uint64ZERO; // Value position into bucket
+ uint32 *posPtrCheck = new uint32 [65826038];
+#endif
+
+ // We also take this opportunity to reset some statistics that are
+ // wrong.
+ //
+ _numberOfMers = 0;
+ _numberOfPositions = 0;
+ _numberOfDistinct = 0;
+ _numberOfUnique = 0;
+ _numberOfEntries = 0;
+ _maximumEntries = 0;
+
+ C = new speedCounter(" %7.2f Mbuckets -- %5.2f Mbuckets/second\r", 1000000.0, 0x1ffffff, beVerbose);
+
+ // We need b outside the loop!
+ //
+ uint64 b;
+ for (b=0; b<_tableSizeInEntries; b++) {
+ C->tick();
+
+ // Set the start of the bucket -- we took pains to ensure that
+ // we don't overwrite _bucketSizes[b], if we are reusing that
+ // space for the hash table.
+ //
+ if (_hashTable_BP)
+ setDecodedValue(_hashTable_BP, (uint64)b * (uint64)_hashWidth, _hashWidth, bucketStartPosition);
+ else
+ _hashTable_FW[b] = bucketStartPosition;
+
+ // Get the number of mers in the counting bucket. The error
+ // checking and sizing of _sortedChck and _sortedPosn was already
+ // done in the sort.
+ //
+ uint64 st = _bucketSizes[b];
+ uint64 ed = _bucketSizes[b+1];
+ uint32 le = ed - st;
+
+ // Unpack the check values
+ //
+ for (uint64 i=st, J=st * _wCnt; i<ed; i++, J += _wCnt) {
+ getDecodedValues(_countingBuckets, J, 2, lensC, vals);
+ _sortedChck[i-st] = vals[0];
+ _sortedPosn[i-st] = vals[1];
+ }
+
+
+ // Walk through the counting bucket, adding things to the real
+ // bucket as we see them. Mers with more than one position are
+ // inserted into the bucket, and the positions inserted into the
+ // position list.
+
+ // start and end locations of the mer. For mers with only
+ // one occurrence (unique mers), stM+1 == edM.
+ //
+ uint32 stM = uint32ZERO;
+ uint32 edM = uint32ZERO;
+
+ while (stM < le) {
+
+ // Move to the next mer.
+ //
+ edM++;
+
+ // Keep moving while the two mers are the same.
+ //
+ while ((edM < le) && (_sortedChck[stM] == _sortedChck[edM]))
+ edM++;
+
+ // edM is now the mer after the last. Write all mers from stM
+ // up to edM to the final structure. If there is one mer, put
+ // it in the bucket. If not, put a pointer to the position
+ // array there.
+
+ // We're in bucket b, looking at mer _sortedChck[stM]. Ask the
+ // only/mask if that exists, if so do/do not include the mer.
+ //
+ bool useMer = true;
+
+ if (edM - stM < minCount)
+ useMer = false;
+
+ if (edM - stM > maxCount)
+ useMer = false;
+
+ if ((useMer == true) && (mask || only)) {
+
+ // MER_REMOVAL_DURING_XFER. Great. The existDB has
+ // (usually) the canonical mer. We have the forward mer.
+ // Well, no, we have the forward mers' hash and check. So,
+ // we reconstruct the mer, reverse complement it, and then
+ // throw the mer out if either the forward or reverse exists
+ // (or doesn't exist).
+
+ uint64 m = REBUILD(b, _sortedChck[stM]);
+ uint64 r;
+
+ if (mask) {
+ if (mask->isCanonical()) {
+ r = reverseComplementMer(_merSizeInBases, m);
+ if (r < m)
+ m = r;
+ }
+ if (mask->exists(m))
+ useMer = false;
+ }
+
+ if (only) {
+ if (only->isCanonical()) {
+ r = reverseComplementMer(_merSizeInBases, m);
+ if (r < m)
+ m = r;
+ }
+ if (only->exists(m) == false)
+ useMer = false;
+ }
+ }
+
+ if (useMer) {
+ _numberOfMers += edM - stM;
+ _numberOfPositions += edM - stM;
+ _numberOfDistinct++;
+
+ if (stM+1 == edM) {
+ _numberOfUnique++;
+
+#ifdef TEST_NASTY_BUGS
+ posPtrCheck[currentBpos++] = _sortedPosn[stM];
+#endif
+
+ vals[0] = _sortedChck[stM];
+ vals[1] = _sortedPosn[stM];
+ vals[2] = 1;
+ vals[3] = 0;
+
+ currentBbit = setDecodedValues(_buckets, currentBbit, nval, lensF, vals);
+ bucketStartPosition++;
+ } else {
+ _numberOfEntries += edM - stM;
+ if (_maximumEntries < edM - stM)
+ _maximumEntries = edM - stM;
+
+#ifdef TEST_NASTY_BUGS
+ posPtrCheck[currentBpos++] = currentPpos;
+#endif
+
+ vals[0] = _sortedChck[stM];
+ vals[1] = currentPpos;
+ vals[2] = 0;
+ vals[3] = 0;
+
+ currentBbit = setDecodedValues(_buckets, currentBbit, nval, lensF, vals);
+ bucketStartPosition++;
+
+ // Store the positions. Store the number of positions
+ // here, then store all positions.
+ //
+ // The positions are in the proper place in _sortedPosn,
+ // and setDecodedValue masks out the extra crap, so no
+ // temporary needed. Probably should be done with
+ // setDecodedValues, but then we need another array telling
+ // the sizes of each piece.
+ //
+ setDecodedValue(_positions, currentPbit, _posnWidth, edM - stM);
+ currentPbit += _posnWidth;
+ currentPpos++;
+
+ for (; stM < edM; stM++) {
+ if (_sortedPosn[stM] >= DEBUGnumPositions) {
+ fprintf(stderr, "positionDB()-- ERROR: Got position "uint64FMT", but only "uint64FMT" available!\n",
+ _sortedPosn[stM], DEBUGnumPositions);
+ abort();
+ }
+ setDecodedValue(_positions, currentPbit, _posnWidth, _sortedPosn[stM]);
+ currentPbit += _posnWidth;
+ currentPpos++;
+ }
+ }
+ } // useMer
+
+ // All done with this mer.
+ //
+ stM = edM;
+ } // while (stM < le)
+ } // for each bucket
+
+ // Set the end of the last bucket
+ //
+ if (_hashTable_BP)
+ setDecodedValue(_hashTable_BP, b * _hashWidth, _hashWidth, bucketStartPosition);
+ else
+ _hashTable_FW[b] = bucketStartPosition;
+
+ delete C;
+
+ // Clear out the end of the arrays -- this is only so that we can
+ // checksum the result.
+ //
+ if (_hashTable_BP) {
+ b = b * _hashWidth + _hashWidth;
+ setDecodedValue(_hashTable_BP, b, 64 - (b % 64), uint64ZERO);
+ }
+ setDecodedValue(_buckets, currentBbit, 64 - (currentBbit % 64), uint64ZERO);
+ setDecodedValue(_positions, currentPbit, 64 - (currentPbit % 64), uint64ZERO);
+
+
+ if (beVerbose) {
+ fprintf(stderr, " Avail: Bucket "uint64FMTW(12)" Position "uint64FMTW(12)" (64-bit words)\n", bs, ps);
+ fprintf(stderr, " Avail: Bucket "uint64FMTW(12)" Position "uint64FMTW(12)" (entries)\n", _numberOfDistinct, _numberOfEntries);
+ fprintf(stderr, " Used: Bucket "uint64FMTW(12)" Position "uint64FMTW(12)" (64-bit words)\n", currentBbit / 64, currentPbit / 64);
+ }
+
+ // Reset the sizes to what we actually found. If we then
+ // dump/reload, we shrink our footprint.
+ //
+ _numberOfDistinct = currentBbit / _wFin;
+ _numberOfEntries = currentPbit / _posnWidth;
+
+ if (beVerbose) {
+ fprintf(stderr, " Used: Bucket "uint64FMTW(12)" Position "uint64FMTW(12)" (entries)\n", _numberOfDistinct, _numberOfEntries);
+ fprintf(stderr,
+ " Found "uint64FMTW(12)" total mers\n"
+ " Found "uint64FMTW(12)" distinct mers\n"
+ " Found "uint64FMTW(12)" unique mers\n"
+ " Need "uint64FMT" non-unique position list entries ("uint64FMT" maximum count)\n",
+ _numberOfMers, _numberOfDistinct, _numberOfUnique, _numberOfEntries, _maximumEntries);
+ }
+
+
+ // If we removed mers, there is a small chance that our hash table
+ // is too big -- we might have removed enough mers to make the
+ // width smaller. If so, rebuild the hash table.
+ //
+ // Also, hooray, we finally know the number of distinct mers, so we
+ // can make this nice and tight
+ //
+ if (_hashTable_BP) {
+ uint32 newHashWidth = 1;
+ while ((_numberOfDistinct+1) > (uint64ONE << newHashWidth))
+ newHashWidth++;
+
+ if (newHashWidth != _hashWidth) {
+ uint64 npos = 0;
+ uint64 opos = 0;
+
+ if (beVerbose)
+ fprintf(stderr, " Rebuilding the hash table, from "uint32FMT" bits wide to "uint32FMT" bits wide.\n",
+ _hashWidth, newHashWidth);
+
+ for (uint64 z=0; z<_tableSizeInEntries+1; z++) {
+ setDecodedValue(_hashTable_BP,
+ npos,
+ newHashWidth,
+ getDecodedValue(_hashTable_BP, opos, _hashWidth));
+ npos += newHashWidth;
+ opos += _hashWidth;
+ }
+
+ // Clear the end again.
+ setDecodedValue(_hashTable_BP, npos, 64 - (npos % 64), uint64ZERO);
+ }
+
+ _hashWidth = newHashWidth;
+ }
+
+
+ // If supplied, add in any counts. The meryl table is, sadly, in
+ // the wrong order, and we must hash and search.
+ //
+ // Meryl _should_ be storing only forward mers, but we have no way
+ // of checking.
+ //
+ // After all counts are loaded, check if we can compress the counts
+ // space any. Check if the largestMerylCount is much smaller than
+ // the space it is stored in. If so, we can compress the table.
+ //
+ uint64 largestMerylCount = 0;
+ uint64 countsLoaded = 0;
+
+ if (counts) {
+ if (beVerbose)
+ fprintf(stderr, " Loading "uint64FMT" mercounts.\n", counts->numberOfDistinctMers());
+
+ C = new speedCounter(" %7.2f Mmercounts -- %5.2f Mmercounts/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+ while (counts->nextMer()) {
+ kMer k = counts->theFMer();
+ uint64 c = counts->theCount();
+ uint64 f = setCount(k, c);
+ k.reverseComplement();
+ uint64 r = setCount(k, c);
+
+ if (f + r > 0) {
+ countsLoaded++;
+ if (largestMerylCount < c)
+ largestMerylCount = c;
+ }
+
+ C->tick();
+ }
+
+ delete C;
+
+ if (beVerbose)
+ fprintf(stderr, " Loaded "uint64FMT" mercounts; largest is "uint64FMT".\n", countsLoaded, largestMerylCount);
+
+ if (logBaseTwo64(largestMerylCount + 1) < _sizeWidth) {
+ if (beVerbose)
+ fprintf(stderr, " Compress sizes from "uint32FMT" bits to "uint32FMT" bits.\n",
+ _sizeWidth,
+ (uint32)logBaseTwo64(largestMerylCount + 1));
+
+ uint64 oSiz[4] = { _chckWidth, _pptrWidth, 1, _sizeWidth };
+ uint64 nSiz[4] = { _chckWidth, _pptrWidth, 1, logBaseTwo64(largestMerylCount + 1) };
+ uint64 tVal[4] = { 0, 0, 0, 0 };
+
+ uint64 oP = 0, oS = oSiz[0] + oSiz[1] + oSiz[2] + oSiz[3];
+ uint64 nP = 0, nS = nSiz[0] + nSiz[1] + nSiz[2] + nSiz[3];
+
+ assert(nS < oS);
+
+ C = new speedCounter(" %7.2f Mmercounts -- %5.2f Mmercounts/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+ for (uint64 bu=0; bu<_numberOfDistinct; bu++) {
+ getDecodedValues(_buckets, oP, 4, oSiz, tVal);
+ setDecodedValues(_buckets, nP, 4, nSiz, tVal);
+
+ oP += oS;
+ nP += nS;
+
+ C->tick();
+ }
+
+ delete C;
+
+ _sizeWidth = nSiz[3];
+ _wFin = _chckWidth + _pptrWidth + 1 + _sizeWidth;
+ }
+ }
+
+
+#ifdef TEST_NASTY_BUGS
+ // Unpack the bucket positions and check. Report the first one
+ // that is broken.
+ //
+ for(uint64 bb=0; bb<currentBpos; bb++)
+ if (posPtrCheck[bb] != getDecodedValue(_buckets, bb * _wFin + _chckWidth, _pptrWidth))
+ fprintf(stderr, "Bucket %lu (at bitpos %lu) failed position check (wanted %lu got %lu)\n",
+ bb,
+ bb * _wFin,
+ posPtrCheck[bb],
+ getDecodedValue(_buckets, bb * _wFin + _chckWidth, _pptrWidth));
+ delete [] posPtrCheck;
+#endif
+
+
+#ifdef MER_REMOVAL_TEST
+#warning MER_REMOVAL_TEST was not updated to deal with canonical mers
+ if (beVerbose)
+ fprintf(stderr, "positionDB()-- TESTING MER REMOVAL\n");
+
+ MS->rewind();
+ if (mask) {
+ C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+ uint32 extraMer = 0;
+ while (MS->nextMer(_merSkipInBases)) {
+ uint64 mer = MS->theFMer();
+ if (mask->exists(mer) && exists(mer))
+ extraMer++;
+ C->tick();
+ }
+ delete C;
+ fprintf(stderr, "positionDB()-- mask: "uint32FMT" mers extra!\n", extraMer);
+ } else if (only) {
+ C = new speedCounter(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+ uint32 missingMer = 0;
+ while (MS->nextMer(_merSkipInBases)) {
+ uint64 mer = MS->theFMer();
+ if (only->exists(mer) && !exists(mer))
+ missingMer++;
+ C->tick();
+ }
+ delete C;
+ fprintf(stderr, "positionDB()-- only: "uint32FMT" mers missing!\n", missingMer);
+ }
+#endif
+
+ // Free the counting buckets if we aren't using the space for
+ // something else.
+ //
+ if (_buckets != _countingBuckets)
+ delete [] _countingBuckets;
+
+ // In theory, we could move these to be immediately after the data
+ // is useless.
+ //
+ _bucketSizes = 0L;
+ _countingBuckets = 0L;
+
+ delete [] _sortedChck;
+ delete [] _sortedPosn;
+
+ _sortedMax = 0;
+ _sortedChck = 0L;
+ _sortedPosn = 0L;
+
+ if (bktAllocIsJunk)
+ delete [] bktAlloc;
+}
+
+positionDB::~positionDB() {
+  delete [] _hashedErrors;  // array used by the mismatch matcher
+  delete [] _positions;
+  delete [] _buckets;
+  delete [] _hashTable_FW;  // build() leaves one of the two hash table
+  delete [] _hashTable_BP;  // pointers null; delete [] of null does nothing
+}
diff --git a/libkmer/positionDB.H b/libkmer/positionDB.H
new file mode 100644
index 0000000..6fce300
--- /dev/null
+++ b/libkmer/positionDB.H
@@ -0,0 +1,241 @@
+#ifndef POSITIONDB_H
+#define POSITIONDB_H
+
+#include <stdio.h>
+#include "bio++.H"
+#include "merStream.H"
+
+// The two existDB inputs can be either forward or canonical. If
+// canonical, we are smart enough to search exist/only with the
+// canonical mer.
+
+// Returns position in posn, resizing it if needed.  Space is
+// allocated if none supplied.  The following is valid:
+//
+//   uint64 *posn = 0L;
+//   uint64 posnMax = 0, posnLen = 0;
+//   uint64 count = 0;
+//   if (getExact(somemer, posn, posnMax, posnLen, count)) {
+//     do something with the positions
+//   }
+//
+// existsExact() returns T/F if mer exists or not
+// countExact() returns the number of times that mer is present
+
+// Define this to use an uncompressed hash table when the width is 32
+// bits or less. Doing so is A LOT faster in mismatch lookups, but
+// does use more memory.
+#undef UNCOMPRESS_HASH_TABLE
+
+// Define this to leave out references to getTime(), speedCounter()
+// and make the positionDB build very quietly.
+#undef SILENTPOSITIONDB
+
+// Define these to enable some debugging methods
+#undef DEBUGPOSDB
+#undef DEBUGREBUILD
+
+class existDB;
+class merylStreamReader;
+
+class positionDB { // Hash-indexed table of mers and the sequence positions at which each one occurs.
+public:
+ positionDB(char const *filename, // presumably restores a table written by saveState() -- confirm in positionDB.C
+ uint32 merSize,
+ uint32 merSkip,
+ uint32 maxMismatch,
+ bool loadData=true);
+
+ positionDB(merStream *MS, // construct from a stream of mers; the filter arguments are presumably forwarded to build()
+ uint32 merSize,
+ uint32 merSkip,
+ existDB *mask,
+ existDB *only,
+ merylStreamReader *counts,
+ uint32 minCount,
+ uint32 maxCount,
+ uint32 maxMismatch,
+ uint32 maxMemory,
+ bool beVerbose);
+
+ ~positionDB();
+
+private:
+ void build(merStream *MS,
+ existDB *mask, // if given, mers found in mask are left out
+ existDB *only, // if given, mers not found in only are left out
+ merylStreamReader *counts, // if given, its counts are stored with setCount()
+ uint32 minCount, // mers seen fewer times than this are left out
+ uint32 maxCount, // mers seen more times than this are left out
+ bool beVerbose); // report progress on stderr
+
+private:
+ void reallocateSpace(uint64*& posn,
+ uint64& posnMax,
+ uint64& posnLen,
+ uint64 len);
+
+ void loadPositions(uint64 v,
+ uint64*& posn,
+ uint64& posnMax,
+ uint64& posnLen,
+ uint64& count);
+
+public:
+ bool getExact(uint64 mer,
+ uint64*& posn,
+ uint64& posnMax,
+ uint64& posnLen,
+ uint64& count);
+ bool existsExact(uint64 mer);
+ uint64 countExact(uint64 mer);
+
+public:
+ void filter(uint64 lo, uint64 hi);
+
+private:
+ double setUpMismatchMatcher(uint32 nErrorsAllowed, uint64 approxMers);
+public:
+ bool getUpToNMismatches(uint64 mer,
+ uint32 maxMismatches,
+ uint64*& posn,
+ uint64& posnMax,
+ uint64& posnLen);
+private:
+ uint64 setCount(uint64 mer, uint64 count);
+
+ // Save or load a built table
+ //
+public:
+ void saveState(char const *filename);
+ bool loadState(char const *filename, bool beNoisy=false, bool loadData=true);
+
+ void printState(FILE *stream);
+
+ // Only really useful for debugging. Don't use.
+ //
+ void dump(char *name);
+ // Returns true when REBUILD(HASH(m), CHECK(m)) reproduces m.  NOTE(review): DEBUGREBUILD is
+ // defined unconditionally just inside the body, so the #else branch is never compiled.
+ bool checkREBUILD(uint64 m) {
+#define DEBUGREBUILD
+#ifdef DEBUGREBUILD
+ uint64 h = HASH(m);
+ uint64 c = CHECK(m);
+ uint64 r = REBUILD(h, c);
+ if (r != m) { // round trip failed: dump the shifts and every intermediate value
+ fprintf(stderr, "shift1 = "uint32FMT"\n", _shift1);
+ fprintf(stderr, "shift2 = "uint32FMT"\n", _shift2);
+ fprintf(stderr, "M = "uint64HEX"\n", m);
+ fprintf(stderr, "H = "uint64HEX"\n", h);
+ fprintf(stderr, "C = "uint64HEX"\n", c);
+ fprintf(stderr, "R = "uint64HEX"\n", r);
+ return(false);
+ }
+ return(true);
+#else
+ return(REBUILD(HASH(m), CHECK(m)) == m);
+#endif
+ };
+
+private:
+ // Hash of a mer: k XORed with two right-shifted copies of itself, then masked with _mask1.
+ uint64 HASH(uint64 k) {
+ return(((k >> _shift1) ^ (k >> _shift2) ^ k) & _mask1);
+ };
+ // Check value of a mer: the bits of k selected by _mask2.
+ uint64 CHECK(uint64 k) {
+ return(k & _mask2);
+ };
+ // Inverse of HASH/CHECK: recovers the mer from its hash h and its check c.
+ uint64 REBUILD(uint64 h, uint64 c) {
+ // Decode a HASH and a CHECK to get back the mer. You'd better
+ // bloody PRAY you don't break this (test/test-rebuild.C). It
+ // was a headache++ to write.
+
+ uint64 sha = _shift1 - _shift2; // distance between the two hash shifts; also the step of the loop below
+ uint64 msk = uint64MASK(sha); // presumably a mask of the low sha bits -- uint64MASK is defined elsewhere
+
+ // The check is exactly the mer....just not all there.
+ uint64 mer = c;
+
+ uint64 shf = sha - (_tableSizeInBits % 2);
+ uint64 shg = 0;
+ uint64 shh = _shift1;
+
+ // Unrolling this is troublesome - we still need the tests,
+ // bizarre merSize, tblSize combinations use lots of iterations
+ // (when the merSize and tblSize are about the same, the CHECK is
+ // small, and so we need to do lots of iterations).
+
+ //fprintf(stderr, "shf="uint64FMTW(2)" shg="uint64FMTW(2)" shh="uint64FMTW(2)" mer="uint64HEX"\n", shf, shg, shh, mer);
+
+ do {
+ mer |= (((h >> shg) ^ (mer >> shg) ^ (mer >> shf)) & msk) << shh;
+ //fprintf(stderr, "shf="uint64FMTW(2)" shg="uint64FMTW(2)" shh="uint64FMTW(2)" mer="uint64HEX"\n", shf, shg, shh, mer);
+
+ shf += sha;
+ shg += sha;
+ shh += sha;
+ } while ((shf < _merSizeInBits) && (shh < 64)); // the second test keeps the left shift by shh below 64 bits
+
+ mer &= uint64MASK(_merSizeInBits);
+
+ return(mer);
+ };
+
+ void sortAndRepackBucket(uint64 b);
+
+ uint32 *_bucketSizes; // per-bucket start offsets while build() runs; build() nulls it when done
+ uint64 *_countingBuckets; // packed (check, position) entries used during build()
+ uint64 *_hashTable_BP; // Bit packed
+ uint32 *_hashTable_FW; // Full width
+ uint64 *_buckets; // final packed buckets, _wFin bits per entry
+
+ uint64 *_positions; // packed position lists, _posnWidth bits per value
+
+ uint32 _merSizeInBases;
+ uint32 _merSizeInBits;
+
+ uint32 _merSkipInBases;
+
+ uint64 _tableSizeInEntries;
+ uint32 _tableSizeInBits;
+
+ uint32 _hashWidth; // Hash bits
+ uint32 _chckWidth; // Check bits
+ uint32 _posnWidth; // Positions in the sequence
+ uint32 _pptrWidth; // Pointers to positions
+ uint32 _sizeWidth; // Extra number in the table
+
+ uint64 _hashMask;
+
+ uint32 _wCnt; // bits per entry in _countingBuckets
+ uint32 _wFin; // bits per entry in _buckets: _chckWidth + _pptrWidth + 1 + _sizeWidth
+
+ uint32 _shift1;
+ uint32 _shift2;
+ uint64 _mask1;
+ uint64 _mask2;
+
+ uint64 _numberOfMers;
+ uint64 _numberOfPositions;
+ uint64 _numberOfDistinct;
+ uint64 _numberOfUnique;
+ uint64 _numberOfEntries;
+ uint64 _maximumEntries;
+
+ // For sorting the mers
+ //
+ uint32 _sortedMax;
+ uint64 *_sortedChck;
+ uint64 *_sortedPosn;
+
+ // For the mismatch matcher
+ uint32 _nErrorsAllowed;
+ uint32 _hashedErrorsLen;
+ uint32 _hashedErrorsMax;
+ uint64 *_hashedErrors;
+};
+
+#endif // POSITIONDB_H
diff --git a/libkmer/test/Makefile b/libkmer/test/Makefile
new file mode 100644
index 0000000..d548a32
--- /dev/null
+++ b/libkmer/test/Makefile
@@ -0,0 +1,115 @@
+include ../../Make.compilers
+
+# Bigger tblsize makes existDB much faster, but uses more memory (not
+# much, really). 23 is nice.
+
+all: test-maskonly-passed position-passed
+	@echo "existDB has expensive tests. They take:"
+	@echo " 17 minutes on 1.8GHz Quadxeon (with KMER=1)"
+	@echo " 60 minutes on 2.8GHz P4 (with KMER=1)"
+	@echo " 120 minutes on 2.0GHz G5 (with KMER=8)"
+	@echo "If you really want to run them, do 'make exist-passed'."
+
+# ../../meryl/meryl -M equal 1 -s xp -o xp1
+# ../../meryl/meryl -Dt -n 1 -s xp1 > xp.uni.fasta
+
+# Dead code; not part of 'all'.  Cleanup uses rm -f so a missing junk* cannot fail the rule.
+test-mertable:
+	$(CXX) $(CXXFLAGS_COMPILE) -c -o test-mertable.o test-mertable.C $(INCLUDE)
+	$(CXX) $(CXXLDFLAGS) -o test-mertable test-mertable.o $(LIBS)
+	../../leaff/leaff -G 1000 5000 8000 > xp.fasta
+	./test-mertable xp.fasta
+	echo test-mertable PASSED
+	rm -f xp* junk*
+
+test-maskonly-passed:
+	$(CXX) $(CXXFLAGS_COMPILE) -c -o test-maskonly.o test-maskonly.C $(INCLUDE)
+	$(CXX) $(CXXLDFLAGS) -o test-maskonly test-maskonly.o $(LIBS)
+	../../leaff/leaff -G 1000 5000 8000 > xp.fasta
+	../../meryl/meryl -B -f -m 14 -s xp.fasta -o xp
+	../../meryl/meryl -Dt -n 2 -s xp > xp.dup.fasta
+	./test-maskonly xp.fasta xp.dup.fasta xp.dup.fasta
+	echo test-maskonly-passed PASSED
+	rm -f xp* junk*
+	touch test-maskonly-passed
+
+test-rebuild: test-rebuild.C ../positionDB.H
+	$(CXX) $(CXXFLAGS_COMPILE) -c -o test-rebuild.o test-rebuild.C $(INCLUDE)
+	$(CXX) $(CXXLDFLAGS) -o test-rebuild test-rebuild.o $(LIBS)
+	./test-rebuild
+	@echo test-rebuild-passed PASSED
+	rm -f xp* junk*
+
+position-passed: position-passed1 position-passed2 exist-fast-passed
+	touch position-passed
+xp.fasta: ../../leaff/leaff
+	#../../leaff/leaff -G 1000 5000 8000 > xp.fasta
+	../../leaff/leaff -G 1 500 800 > xp.fasta
+yp.fasta: ../../leaff/leaff
+	#../../leaff/leaff -G 100 500 1000 > yp.fasta
+	../../leaff/leaff -G 1 50 100 > yp.fasta
+position-passed1: testerp xp.fasta
+	./testerp -test1 xp.fasta
+	touch position-passed1
+position-passed2: testerp xp.fasta yp.fasta
+	./testerp -test2 xp.fasta yp.fasta
+	touch position-passed2
+exist-fast-passed: ../existDB xe.fasta
+	../existDB -mersize 14 -tblsize 21 -testfiles xe.fasta junk
+	../existDB -mersize 14 -tblsize 21 -testexistence xe.fasta
+	rm -f xe.mcdat xe.mcidx
+	../../meryl/meryl -B -f -m 14 -s xe.fasta -o xe
+	../existDB -mersize 14 -tblsize 21 -testexhaustive xe.fasta xe
+	touch exist-fast-passed
+
+
+exist-passed: exist-passed1 exist-passed2 exist-passed3
+	touch exist-passed
+xe.fasta: ../../leaff/leaff
+	#../../leaff/leaff -G 1000 5000 8000 > xe.fasta
+	../../leaff/leaff -G 1 500 800 > xe.fasta
+exist-passed1: ../existDB xe.fasta
+	../existDB -mersize 17 -tblsize 23 -testfiles xe.fasta junk
+	rm -f junk*
+	touch exist-passed1
+exist-passed2: ../existDB xe.fasta
+	../existDB -mersize 17 -tblsize 23 -testexistence xe.fasta
+	rm -f junk*
+	touch exist-passed2
+exist-passed3: ../existDB xe.fasta ../../meryl/meryl
+	rm -f xe.mcdat xe.mcidx
+	../../meryl/meryl -B -f -m 17 -s xe.fasta -o xe
+	../existDB -mersize 17 -tblsize 23 -testexhaustive xe.fasta xe
+	rm -f junk*
+	touch exist-passed3
+
+
+INCLUDE = -I../../libbio -I../../libseq -I../../libutil -I../../libmeryl -I..
+LIBS = -L../../libbio -L../../libseq -L../../libutil -L../../libmeryl -L.. -lkmer -lmeryl -lbio -lutil
+DBGOPT = -DERROR_CHECK_COUNTING -DERROR_CHECK_COUNTING_ENCODING -DERROR_CHECK_EMPTY_BUCKETS
+
+testerp: ../positionDB.C ../positionDB.H ../positionDB-access.C ../positionDB-dump.C ../positionDB-sort.C ../positionDB-file.C
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o driverp.o ../driver-posDB.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o positionDB.o ../positionDB.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o positionDB-access.o ../positionDB-access.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o positionDB-dump.o ../positionDB-dump.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o positionDB-sort.o ../positionDB-sort.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o positionDB-file.o ../positionDB-file.C $(INCLUDE)
+	$(CXX) $(CXXLDFLAGS) -o testerp driverp.o positionDB.o positionDB-access.o positionDB-dump.o positionDB-sort.o positionDB-file.o $(LIBS) -lm
+
+# XXX: There isn't any reason we need to build testere, we could
+# just use ../existDB (as it did before!)
+testere: ../existDB.C ../existDB-create-from-fasta.C ../existDB-create-from-meryl.C ../existDB-state.C
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o drivere.o ../driver-existDB.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o existDB.o ../existDB.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o existDB-create-from-fasta.o ../existDB-create-from-fasta.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o existDB-create-from-meryl.o ../existDB-create-from-meryl.C $(INCLUDE)
+	$(CXX) $(DBGOPT) $(CXXFLAGS_COMPILE) -c -o existDB-state.o ../existDB-state.C $(INCLUDE)
+	$(CXX) $(CXXLDFLAGS) -o testere drivere.o existDB.o existDB-create-from-fasta.o existDB-create-from-meryl.o existDB-state.o $(LIBS) -lm
+
+clean:
+	rm -f *passed*
+	rm -f testerp *.o xp.fasta* yp.fasta*
+	rm -f testere junk* xe.fasta* xe.mcidx xe.mcdat xe.merStream
+	rm -f test-maskonly xp.dup.fasta xp.mcidx xp.mcdat
+	rm -f test-rebuild
diff --git a/libkmer/test/test-maskonly.C b/libkmer/test/test-maskonly.C
new file mode 100644
index 0000000..0acfa54
--- /dev/null
+++ b/libkmer/test/test-maskonly.C
@@ -0,0 +1,110 @@
+#include "bio++.H"
+#include "existDB.H"
+#include "positionDB.H"
+
+// Tests a positionDB when using an existDB for masking.
+//
+// existDB can be either include or exclude
+// positionDB can use include, exclude or threshold
+//
+
+#define MERSIZE 14
+
+int
+main(int argc, char **argv) {
+ existDB *include;
+ existDB *exclude;
+ positionDB *full;
+ positionDB *incl;
+ positionDB *excl;
+ positionDB *thrs;
+ // Expect exactly three file names: the sequence, the mask mers and the include mers.
+ if (argc != 4) {
+ fprintf(stderr, "usage: %s seq.fasta mask.fasta incl.fasta\n", argv[0]);
+ exit(1);
+ }
+
+ char *seqName = argv[1];
+ char *mskName = argv[2];
+ char *incName = argv[3];
+
+ fprintf(stderr, "BUILDING EXCLUDE\n");
+ exclude = new existDB(mskName, MERSIZE, existDBnoFlags, uint32ZERO, ~uint32ZERO);
+
+ fprintf(stderr, "BUILDING INCLUDE\n");
+ include = new existDB(incName, MERSIZE, existDBnoFlags, uint32ZERO, ~uint32ZERO);
+
+ seqStream *F = new seqStream(seqName, true);
+ merStream *T = new merStream(new kMerBuilder(MERSIZE), F);
+ // Build each variant from the same merStream, save it under a junk-* name, then free it.
+ fprintf(stderr, "BUILDING FULL\n");
+ full = new positionDB(T, MERSIZE, 0, 0L, 0L, 0L, 0, 0, 0, 0, true); // no mask, no only, no counts
+ full->saveState("junk-full");
+ delete full;
+
+ fprintf(stderr, "BUILDING INCL\n");
+ incl = new positionDB(T, MERSIZE, 0, 0L, include, 0L, 0, 0, 0, 0, true); // only = include
+ incl->saveState("junk-incl");
+ delete incl;
+
+ fprintf(stderr, "BUILDING EXCL\n");
+ excl = new positionDB(T, MERSIZE, 0, exclude, 0L, 0L, 0, 0, 0, 0, true); // mask = exclude
+ excl->saveState("junk-excl");
+ delete excl;
+
+ fprintf(stderr, "BUILDING THRS\n");
+ thrs = new positionDB(T, MERSIZE, 0, 0L, 0L, 0L, 1, 0, 0, 0, true); // minCount = 1
+ thrs->saveState("junk-thrs");
+ delete thrs;
+ // Reload all four tables from the saved state.  NOTE(review): these four, and include/exclude, are never deleted.
+ full = new positionDB("junk-full", MERSIZE, 0, 0);
+ incl = new positionDB("junk-incl", MERSIZE, 0, 0);
+ excl = new positionDB("junk-excl", MERSIZE, 0, 0);
+ thrs = new positionDB("junk-thrs", MERSIZE, 0, 0);
+
+ char themer[1000];
+ uint32 mernum = 0;
+
+ uint32 err = 0;
+
+ // Check everything looks ok.  NOTE(review): thrs is never consulted in this loop.
+ T->rewind();
+ while (T->nextMer()) {
+ // Every mer of the sequence must be in the unfiltered table.
+ if (!full->existsExact(T->theFMer())) {
+ fprintf(stderr, "Didn't find mer "uint32FMT" %s in full.\n", mernum, T->theFMer().merToString(themer));
+ err++;
+ }
+ // incl must contain a mer exactly when the include set does.
+ if (include->exists(T->theFMer())) {
+ if (!incl->existsExact(T->theFMer())) {
+ fprintf(stderr, "Didn't find mer "uint32FMT" %s in incl.\n", mernum, T->theFMer().merToString(themer));
+ err++;
+ }
+ } else {
+ if (incl->existsExact(T->theFMer())) {
+ fprintf(stderr, "Found extra mer "uint32FMT" %s in incl.\n", mernum, T->theFMer().merToString(themer));
+ err++;
+ }
+ }
+ // excl must contain a mer exactly when the exclude set does not.
+ if (exclude->exists(T->theFMer())) {
+ if (excl->existsExact(T->theFMer())) {
+ fprintf(stderr, "Found extra mer "uint32FMT" %s in excl.\n", mernum, T->theFMer().merToString(themer));
+ err++;
+ }
+ } else {
+ if (!excl->existsExact(T->theFMer())) {
+ fprintf(stderr, "Didn't find mer "uint32FMT" %s in excl.\n", mernum, T->theFMer().merToString(themer));
+ err++;
+ }
+ }
+
+ mernum++;
+ }
+
+ delete T;
+ delete F;
+
+ exit(err > 0); // exit status 1 if any check failed, 0 otherwise
+}
diff --git a/libkmer/test/test-mertable.C b/libkmer/test/test-mertable.C
new file mode 100644
index 0000000..2d40603
--- /dev/null
+++ b/libkmer/test/test-mertable.C
@@ -0,0 +1,15 @@
+#include "bio++.H"
+#include "merTable.H"
+
+
+int
+main(int argc, char **argv) {
+ merTable X;
+ // NOTE(review): argv[1] is used below without checking argc, and CS is never deleted.
+ chainedSequence *CS = new chainedSequence();
+ CS->setSource(argv[1]);
+ CS->finish();
+ // Build the table from the chained sequence; 8 is presumably the mer size -- confirm in merTable.H.
+ X.build(CS, 8);
+}
+
diff --git a/libkmer/test/test-rebuild.C b/libkmer/test/test-rebuild.C
new file mode 100644
index 0000000..7321dbf
--- /dev/null
+++ b/libkmer/test/test-rebuild.C
@@ -0,0 +1,50 @@
+#include "bio++.H"
+#include "existDB.H"
+#include "positionDB.H"
+
+// Tests positionDB::checkREBUILD() with pseudo-random mers.
+//
+// For each mer size from 8 to 32, a positionDB is built from a short
+// fixed sequence and probed with masked random 64-bit values.
+//
+
+int
+main(int argc, char **argv) {
+  // For every mer size from 8 through 32, build a positionDB over a
+  // fixed 40-base sequence, then push (uint64ONE << 25) pseudo-random
+  // mers through checkREBUILD().  Any 'false' result fails the test.
+  //
+  uint64  numProbes = uint64ONE << 25;
+
+  for (uint32 merSize=8; merSize<33; merSize++) {
+    fprintf(stderr, "Testing "uint64FMT" Mmers at merSize "uint32FMT".\n", numProbes, merSize);
+
+    kMerBuilder *builder = new kMerBuilder(merSize);
+    merStream   *stream  = new merStream(builder, "acgcgactcgagctacgagcgatcacgacgactacgagca", 40);
+    positionDB  *posDB   = new positionDB(stream, merSize, 0, 0L, 0L, 0L, 0, 0, false, true);
+
+    // mtInit() gets the same constant every time (presumably the seed).
+    mt_s   *rng     = mtInit(3492);
+    uint64  merMask = uint64MASK(2*merSize);
+
+    uint64  passed = 0;
+    uint64  failed = 0;
+
+    for (uint64 probe=0; probe<numProbes; probe++) {
+      if (posDB->checkREBUILD(mtRandom64(rng) & merMask))
+        passed++;
+      else
+        failed++;
+    }
+
+    if (failed > 0) {
+      fprintf(stderr, "PASS: "uint64FMT" FAIL: "uint64FMT"\n", passed, failed);
+      exit(1);
+    }
+
+    free(rng);
+    delete posDB;
+    delete stream;
+  }
+  exit(0);
+}
diff --git a/libmeryl/Make.include b/libmeryl/Make.include
new file mode 100644
index 0000000..fb6b89b
--- /dev/null
+++ b/libmeryl/Make.include
@@ -0,0 +1,15 @@
+# -*- makefile -*-
+# Paths of the sibling library directories, resolved relative to this directory.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+# Per-directory lists: C++ sources, headers, libraries and files to clean (presumably consumed by the shared make rules).
+$/.CXX_SRCS := $/libmeryl.C
+$/.CXX_INCS := $/libmeryl.H
+$/.CXX_LIBS := $/libmeryl.a
+$/.CLEAN := $/*.o
+# The library depends on the single object file.
+$/libmeryl.a : $/libmeryl.o
+# Add the sibling directories to the include path for this directory's .d and .o targets.
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+
diff --git a/libmeryl/libmeryl.C b/libmeryl/libmeryl.C
new file mode 100644
index 0000000..8fd7cb4
--- /dev/null
+++ b/libmeryl/libmeryl.C
@@ -0,0 +1,490 @@
+#include "libmeryl.H"
+
+#define LIBMERYL_HISTOGRAM_MAX 1048576
+
+// File magic numbers, 16 bytes each.  Declared const: string literals must not be bound to a writable char *.
+static const char *ImagicV = "merylStreamIv03\n";  // index file, finished
+static const char *ImagicX = "merylStreamIvXX\n";  // index file, placeholder while still being written
+static const char *DmagicV = "merylStreamDv03\n";  // data file, finished
+static const char *DmagicX = "merylStreamDvXX\n";  // data file, placeholder
+static const char *PmagicV = "merylStreamPv03\n";  // position file, finished
+static const char *PmagicX = "merylStreamPvXX\n";  // position file, placeholder
+
+merylStreamReader::merylStreamReader(const char *fn, uint32 ms) {
+  // Opens the meryl database with prefix 'fn' -- fn.mcidx, fn.mcdat and, when it exists,
+  // fn.mcpos -- checks the magic numbers and loads the index header.  A non-zero 'ms' must
+  // equal the stored mer size.  Every failure is reported on stderr followed by exit(1).
+  //
+  if (fn == 0L) {
+    fprintf(stderr, "ERROR - no counted database file specified.\n");
+    exit(1);
+  }
+
+  // Open the files
+  //
+  char *inpath = new char [strlen(fn) + 8];
+
+  sprintf(inpath, "%s.mcidx", fn);
+  _IDX = new bitPackedFile(inpath);
+  sprintf(inpath, "%s.mcdat", fn);
+  _DAT = new bitPackedFile(inpath);
+  sprintf(inpath, "%s.mcpos", fn);
+  if (fileExists(inpath))
+    _POS = new bitPackedFile(inpath);
+  else
+    _POS = 0L;
+
+  delete [] inpath;
+
+  // Verify that they are what they should be, and read in the header.  Each buffer holds the
+  // 16 magic bytes plus a NUL so that atoi() on the version digits below cannot run past it.
+  //
+  char Imagic[17] = {0};
+  char Dmagic[17] = {0};
+  char Pmagic[17] = {0};
+  bool fail       = false;
+
+  for (uint32 i=0; i<16; i++) {
+    Imagic[i] = _IDX->getBits(8);
+    Dmagic[i] = _DAT->getBits(8);
+    if (_POS)
+      Pmagic[i] = _POS->getBits(8);
+  }
+  if (strncmp(Imagic, ImagicX, 16) == 0) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: %s.mcidx is an INCOMPLETE merylStream index file!\n", fn);
+    fail = true;
+  }
+  if (strncmp(Imagic, ImagicX, 13) != 0) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: %s.mcidx is not a merylStream index file!\n", fn);
+    fail = true;
+  }
+  if (strncmp(Dmagic, DmagicX, 16) == 0) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: %s.mcdat is an INCOMPLETE merylStream data file!\n", fn);
+    fail = true;
+  }
+  if (strncmp(Dmagic, DmagicX, 13) != 0) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: %s.mcdat is not a merylStream data file!\n", fn);
+    fail = true;
+  }
+  if ((Imagic[13] != Dmagic[13]) ||
+      (Imagic[14] != Dmagic[14])) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: %s.mcidx and %s.mcdat are different versions!\n", fn, fn);
+    fail = true;
+  }
+#warning not checking pmagic
+
+  if (fail)
+    exit(1);
+
+  _idxIsPacked    = _IDX->getBits(32);
+  _datIsPacked    = _IDX->getBits(32);
+  _posIsPacked    = _IDX->getBits(32);
+
+  _merSizeInBits  = _IDX->getBits(32) << 1;   // stored in bases, two bits per base
+  _merCompression = _IDX->getBits(32);
+  _prefixSize     = _IDX->getBits(32);
+  _merDataSize    = _merSizeInBits - _prefixSize;
+
+  _numUnique      = _IDX->getBits(64);
+  _numDistinct    = _IDX->getBits(64);
+  _numTotal       = _IDX->getBits(64);
+
+  _histogramHuge     = 0;
+  _histogramLen      = 0;
+  _histogramMaxValue = 0;
+  _histogram         = 0L;
+
+  uint32 version = atoi(Imagic + 13);   // the two version digits of the index magic
+  if (version > 1) {                    // only versions after 1 store a histogram
+    _histogramHuge     = _IDX->getBits(64);
+    _histogramLen      = _IDX->getBits(64);
+    _histogramMaxValue = _IDX->getBits(64);
+    _histogram         = new uint64 [_histogramLen];
+    for (uint32 i=0; i<_histogramLen; i++)
+      _histogram[i] = _IDX->getBits(64);
+  }
+
+  _thisBucket     = uint64ZERO;
+  _thisBucketSize = getIDXnumber();
+  _numBuckets     = uint64ONE << _prefixSize;
+
+  _thisMer.setMerSize(_merSizeInBits >> 1);
+  _thisMer.clear();
+  _thisMerCount   = uint64ZERO;
+
+  _thisMerPositionsMax = 0;
+  _thisMerPositions    = 0L;
+
+  _validMer = true;
+
+#ifdef SHOW_VARIABLES
+  fprintf(stderr, "_merSizeInBits = "uint32FMT"\n", _merSizeInBits);
+  fprintf(stderr, "_merCompression = "uint32FMT"\n", _merCompression);
+  fprintf(stderr, "_prefixSize = "uint32FMT"\n", _prefixSize);
+  fprintf(stderr, "_merDataSize = "uint32FMT"\n", _merDataSize);
+  fprintf(stderr, "_numUnique = "uint64FMT"\n", _numUnique);
+  fprintf(stderr, "_numDistinct = "uint64FMT"\n", _numDistinct);
+  fprintf(stderr, "_numTotal = "uint64FMT"\n", _numTotal);
+  fprintf(stderr, "_thisBucket = "uint64FMT"\n", _thisBucket);
+  fprintf(stderr, "_thisBucketSize = "uint64FMT"\n", _thisBucketSize);
+  fprintf(stderr, "_thisMerCount = "uint64FMT"\n", _thisMerCount);
+#endif
+
+  if ((ms > 0) && (_merSizeInBits >> 1 != ms)) {
+    fprintf(stderr, "merylStreamReader()-- ERROR: User requested mersize "uint32FMT" but '%s' is mersize "uint32FMT"\n",
+            ms, fn, _merSizeInBits >> 1);
+    exit(1);
+  }
+}
+
+
+merylStreamReader::~merylStreamReader() {  // Frees the two heap buffers, then the three bitPackedFile objects.
+  delete [] _histogram;         // 0L when the index carried no histogram
+  delete [] _thisMerPositions;  // 0L until nextMer() first allocates it
+  delete _POS;                  // 0L when there is no .mcpos file
+  delete _DAT;
+  delete _IDX;
+}
+
+
+
+bool
+merylStreamReader::nextMer(void) {
+ // Loads the next mer, its count and (if a position file is open) its positions.  Returns false, and clears _validMer, once all buckets are consumed.
+ // Use a while here, so that we skip buckets that are empty
+ //
+ while ((_thisBucketSize == 0) && (_thisBucket < _numBuckets)) {
+ _thisBucketSize = getIDXnumber(); // size of the next bucket, read from the index file
+ _thisBucket++;
+ }
+
+ if (_thisBucket >= _numBuckets)
+ return(_validMer = false);
+
+ // Before you get rid of the clear() -- if, say, the list of mers
+ // is sorted and we can shift the mer to make space for the new
+ // stuff -- make sure that nobody is calling reverseComplement()!
+ //
+ _thisMer.clear();
+ _thisMer.readFromBitPackedFile(_DAT, _merDataSize); // the data file stores only the non-prefix bits
+ _thisMer.setBits(_merDataSize, _prefixSize, _thisBucket); // the bucket number supplies the prefix bits
+
+ _thisMerCount = getDATnumber();
+
+ _thisBucketSize--;
+
+ if (_POS) {
+ if (_thisMerPositionsMax < _thisMerCount) { // buffer too small: reallocate with 1024 entries of slack
+ delete [] _thisMerPositions;
+ _thisMerPositionsMax = _thisMerCount + 1024; // NOTE(review): uint64 count stored into a uint32 capacity -- confirm counts stay below 2^32
+ _thisMerPositions = new uint32 [_thisMerPositionsMax];
+ }
+ for (uint32 i=0; i<_thisMerCount; i++) { // one 32-bit position per occurrence
+ _thisMerPositions[i] = _POS->getBits(32);
+ }
+ }
+
+ return(true);
+}
+
+
+
+
+
+
+merylStreamWriter::merylStreamWriter(const char *fn,
+ uint32 merSize,
+ uint32 merComp,
+ uint32 prefixSize,
+ bool positionsEnabled) {
+ // Opens fn.mcidx and fn.mcdat (plus fn.mcpos when positionsEnabled) and writes placeholder headers; the destructor finalizes them.
+ char *outpath = new char [strlen(fn) + 17];
+
+ sprintf(outpath, "%s.mcidx", fn);
+ _IDX = new bitPackedFile(outpath, 0, true);
+
+ sprintf(outpath, "%s.mcdat", fn);
+ _DAT = new bitPackedFile(outpath, 0, true);
+
+ if (positionsEnabled) {
+ sprintf(outpath, "%s.mcpos", fn);
+ _POS = new bitPackedFile(outpath, 0, true);
+ } else {
+ _POS = 0L;
+ }
+
+ delete [] outpath;
+
+ // Save really important stuff
+
+ // unpacked --> write 0.42M mers/sec on 8 threads, merge 3.3M mers/sec
+ // packed --> write 0.77M mers/sec on 8 threads, merge 3.9M mers/sec
+ //
+ // This sucks.
+ //
+ _idxIsPacked = 1;
+ _datIsPacked = 1;
+ _posIsPacked = 0;
+
+ _merSizeInBits = merSize * 2; // two bits per base
+ _merCompression = merComp;
+ _prefixSize = prefixSize;
+ _merDataSize = _merSizeInBits - _prefixSize;
+
+ _thisBucket = uint64ZERO;
+ _thisBucketSize = uint64ZERO;
+ _numBuckets = uint64ONE << _prefixSize;
+
+ _numUnique = uint64ZERO;
+ _numDistinct = uint64ZERO;
+ _numTotal = uint64ZERO;
+
+ _thisMerIsBits = false; // neither addMer() flavour has been used yet
+ _thisMerIskMer = false;
+
+ _thisMer.setMerSize(_merSizeInBits >> 1);
+ _thisMer.clear();
+
+ _thisMerPre = uint64ZERO;
+ _thisMerMer = uint64ZERO;
+
+ _thisMerPreSize = prefixSize;
+ _thisMerMerSize = 2 * merSize - prefixSize;
+
+ _thisMerCount = uint64ZERO; // zero means "no pending mer"; see writeMer()
+
+ for (uint32 i=0; i<16; i++)
+ _IDX->putBits(ImagicX[i], 8); // placeholder magic; the destructor replaces it with ImagicV
+
+ _IDX->putBits(_idxIsPacked, 32);
+ _IDX->putBits(_datIsPacked, 32);
+ _IDX->putBits(_posIsPacked, 32);
+
+ _IDX->putBits(_merSizeInBits >> 1, 32); // the header stores the mer size in bases
+ _IDX->putBits(_merCompression, 32);
+ _IDX->putBits(_prefixSize, 32);
+ _IDX->putBits(_numUnique, 64);
+ _IDX->putBits(_numDistinct, 64);
+ _IDX->putBits(_numTotal, 64);
+
+ _histogramHuge = 0;
+ _histogramLen = LIBMERYL_HISTOGRAM_MAX;
+ _histogramMaxValue = 0;
+ _histogram = new uint64 [_histogramLen];
+
+ for (uint32 i=0; i<_histogramLen; i++)
+ _histogram[i] = 0;
+ // An all-zero histogram is written now; the destructor writes the same number of fields again with the real values.
+ _IDX->putBits(_histogramHuge, 64);
+ _IDX->putBits(_histogramLen, 64);
+ _IDX->putBits(_histogramMaxValue, 64);
+ for (uint32 i=0; i<_histogramLen; i++)
+ _IDX->putBits(_histogram[i], 64);
+
+ for (uint32 i=0; i<16; i++)
+ _DAT->putBits(DmagicX[i], 8);
+
+ if (_POS)
+ for (uint32 i=0; i<16; i++)
+ _POS->putBits(PmagicX[i], 8);
+}
+
+
+merylStreamWriter::~merylStreamWriter() {
+  // Writes the pending mer and the remaining bucket sizes, then replaces the placeholder headers with the final ones.
+  writeMer();
+
+  // Finish writing the buckets.
+  //
+  while (_thisBucket < _numBuckets + 2) {
+    setIDXnumber(_thisBucketSize);
+    _thisBucketSize = 0;
+    _thisBucket++;
+  }
+
+  // Seek back to the start and rewrite the magic numbers.  The position file must be
+  // rewound as well, otherwise its final magic is appended after the positions.
+  _IDX->seek(0);
+  _DAT->seek(0);
+  if (_POS) _POS->seek(0);
+  for (uint32 i=0; i<16; i++)
+    _IDX->putBits(ImagicV[i], 8);
+
+  _IDX->putBits(_idxIsPacked, 32);
+  _IDX->putBits(_datIsPacked, 32);
+  _IDX->putBits(_posIsPacked, 32);
+
+  _IDX->putBits(_merSizeInBits >> 1, 32);
+  _IDX->putBits(_merCompression, 32);
+  _IDX->putBits(_prefixSize, 32);
+  _IDX->putBits(_numUnique, 64);
+  _IDX->putBits(_numDistinct, 64);
+  _IDX->putBits(_numTotal, 64);
+
+  _IDX->putBits(_histogramHuge, 64);
+  _IDX->putBits(_histogramLen, 64);
+  _IDX->putBits(_histogramMaxValue, 64);
+  for (uint32 i=0; i<_histogramLen; i++)
+    _IDX->putBits(_histogram[i], 64);
+  delete _IDX;
+
+  delete [] _histogram;
+
+  for (uint32 i=0; i<16; i++)
+    _DAT->putBits(DmagicV[i], 8);
+  delete _DAT;
+
+  if (_POS) {
+    for (uint32 i=0; i<16; i++)
+      _POS->putBits(PmagicV[i], 8);
+    delete _POS;
+  }
+}
+
+
+void
+merylStreamWriter::writeMer(void) {
+  // Commits the pending mer (_thisMer or _thisMerMer, with _thisMerCount)
+  // to the data file and folds it into the running statistics.  A pending
+  // count of zero means there is nothing to write.
+  //
+  if (_thisMerCount == 0)
+    return;
+
+  // Totals.
+  //
+  _numDistinct++;
+  _numTotal += _thisMerCount;
+
+  // Histogram: counts too large for the table are lumped into
+  // _histogramHuge; the largest count seen is tracked separately.
+  //
+  if (_thisMerCount >= LIBMERYL_HISTOGRAM_MAX)
+    _histogramHuge++;
+  else
+    _histogram[_thisMerCount]++;
+
+  if (_thisMerCount > _histogramMaxValue)
+    _histogramMaxValue = _thisMerCount;
+
+  // The two addMer() flavours are never both active on one writer.
+  //
+  assert((_thisMerIsBits == false) || (_thisMerIskMer == false));
+
+  // Mer data first -- raw bits or a kMer, depending on which addMer()
+  // is in use -- then its count.
+  //
+  if (_thisMerIsBits)
+    _DAT->putBits(_thisMerMer, _thisMerMerSize);
+  else
+    _thisMer.writeToBitPackedFile(_DAT, _merDataSize);
+
+  setDATnumber(_thisMerCount);
+  _thisBucketSize++;
+
+  if (_thisMerCount == 1)
+    _numUnique++;
+}
+
+
+
+void
+merylStreamWriter::addMer(kMer &mer, uint32 count, uint32 *positions) {
+ uint64 val; // bucket (prefix) of the incoming mer
+ // Adds 'count' occurrences of 'mer'.  Mers must arrive in non-decreasing order; repeats of the pending mer are accumulated.
+ if (_thisMerIskMer == false) {
+ _thisMerIskMer = true;
+ assert(_thisMerIsBits == false); // the kMer and bits flavours of addMer() cannot be mixed
+ }
+
+ // Fail if we see a smaller mer than last time.
+ //
+ if (mer < _thisMer) {
+ char str[1024];
+ fprintf(stderr, "merylStreamWriter::addMer()-- ERROR: your mer stream isn't sorted increasingly!\n");
+ fprintf(stderr, "merylStreamWriter::addMer()-- last: %s\n", _thisMer.merToString(str));
+ fprintf(stderr, "merylStreamWriter::addMer()-- this: %s\n", mer.merToString(str));
+ exit(1);
+ }
+
+ // If there was a position given, write it.
+ //
+ if (positions && _POS)
+ for (uint32 i=0; i<count; i++)
+ _POS->putBits(positions[i], 32); // positions go out immediately, before the mer itself is written
+
+ // If the new mer is the same as the last one just increase the
+ // count.
+ //
+ if (mer == _thisMer) {
+ _thisMerCount += count;
+ return;
+ }
+
+ // Write thisMer to disk. If the count is zero, we don't write
+ // anything. The count is zero for the first mer (all A) unless we
+ // add that mer, and if the silly user gives us a mer with zero
+ // count.
+ //
+ writeMer();
+
+ // If the new mer is in a different bucket from the last mer, write
+ // out some bucket counts. We need a while loop (opposed to just
+ // writing one bucket) because we aren't guaranteed that the mers
+ // are in adjacent buckets.
+ //
+ val = mer.startOfMer(_prefixSize);
+
+ while (_thisBucket < val) {
+ setIDXnumber(_thisBucketSize);
+ _thisBucketSize = 0;
+ _thisBucket++;
+ }
+
+ // Remember the new mer for the next time
+ //
+ _thisMer = mer;
+ _thisMerCount = count;
+}
+
+
+
+void
+merylStreamWriter::addMer(uint64 prefix, uint32 prefixBits,
+                          uint64 mer,    uint32 merBits,
+                          uint32 count,
+                          uint32 *positions) {
+  // Bit-level variant: the mer arrives split into bucket prefix and remaining bits.  NOTE(review): 'positions' is accepted but never written.
+  if (_thisMerIsBits == false) {
+    _thisMerIsBits = true;
+    assert(_thisMerIskMer == false);
+  }
+
+  assert(prefixBits == _prefixSize);
+  assert(prefixBits == _thisMerPreSize);
+  assert(merBits == _thisMerMerSize);
+  assert(prefixBits + merBits == _merSizeInBits);
+
+  if ((prefix < _thisMerPre) ||   // Same policy as the kMer overload: report and exit, also in NDEBUG builds.
+      ((prefix == _thisMerPre) && (mer < _thisMerMer)))
+    fprintf(stderr, "merylStreamWriter::addMer()-- ERROR: your mer stream isn't sorted increasingly!\n"), exit(1);
+
+  // A repeat of the pending mer only raises its count.
+  if ((prefix == _thisMerPre) &&
+      (mer == _thisMerMer)) {
+    _thisMerCount += count;
+    return;
+  }
+
+  writeMer();
+
+  while (_thisBucket < prefix) {   // close every bucket before the new mer's bucket
+    setIDXnumber(_thisBucketSize);
+    _thisBucketSize = 0;
+    _thisBucket++;
+  }
+
+  _thisMerPre   = prefix;
+  _thisMerMer   = mer;
+  _thisMerCount = count;
+}
diff --git a/libmeryl/libmeryl.H b/libmeryl/libmeryl.H
new file mode 100644
index 0000000..6c8cda8
--- /dev/null
+++ b/libmeryl/libmeryl.H
@@ -0,0 +1,185 @@
+#ifndef LIBMERYL_H
+#define LIBMERYL_H
+
+#include "bio++.H"
+
+// A merStream reader/writer for meryl mercount data.
+//
+// merSize is used to check that the meryl file is the correct size.
+// If it isn't the code fails.
+//
+// The reader returns mers in lexicographic order. No random access.
+// The writer assumes that mers come in sorted increasingly.
+//
+// numUnique the total number of mers with count of one
+// numDistinct the total number of distinct mers in this file
+// numTotal the total number of mers in this file
+
+
+class merylStreamReader { // Sequential reader for a meryl database (index, data and optional position files).
+public:
+ merylStreamReader(const char *fn, uint32 ms=0); // ms != 0 requires the stored mer size to equal ms
+ ~merylStreamReader();
+ // The mer and count loaded by the most recent nextMer().
+ kMer &theFMer(void) { return(_thisMer); };
+ uint64 theCount(void) { return(_thisMerCount); };
+ // Positions are available only when a position file was found; getPosition() returns ~uint32ZERO otherwise or when i is out of range.
+ bool hasPositions(void) { return(_POS != 0L); };
+ uint32 *thePositions(void) { return(_thisMerPositions); };
+ uint32 getPosition(uint32 i) { return(((_POS) && (i < _thisMerCount)) ? _thisMerPositions[i] : ~uint32ZERO); };
+
+ uint32 merSize(void) { return(_merSizeInBits >> 1); }; // in bases
+ uint32 merCompression(void) { return(_merCompression); };
+
+ uint32 prefixSize(void) { return(_prefixSize); };
+
+ uint64 numberOfUniqueMers(void) { return(_numUnique); };
+ uint64 numberOfDistinctMers(void) { return(_numDistinct); };
+ uint64 numberOfTotalMers(void) { return(_numTotal); };
+ // histogram(i) returns ~uint64ZERO when i is beyond the stored histogram.
+ uint64 histogram(uint32 i) { return((i < _histogramLen) ? _histogram[i] : ~uint64ZERO); };
+ uint64 histogramLength(void) { return(_histogramLen); };
+ uint64 histogramHuge(void) { return(_histogramHuge); };
+ uint64 histogramMaximumCount(void) { return(_histogramMaxValue); };
+
+ bool nextMer(void);
+ bool validMer(void) { return(_validMer); };
+private:
+ bitPackedFile *_IDX;
+ bitPackedFile *_DAT;
+ bitPackedFile *_POS;
+ // Reads one bucket size from the index file, in packed or fixed 32-bit form.
+ uint64 getIDXnumber(void) {
+ uint64 n = 1;
+
+ if (_idxIsPacked)
+ n = _IDX->getNumber();
+ else
+ n = _IDX->getBits(32);
+
+ return(n);
+ };
+ uint64 getDATnumber(void) { // Reads one mer count from the data file.
+ uint64 n = 1;
+
+ if (_datIsPacked) {
+ if (_DAT->getBits(1)) // packed form: a 0 flag bit means count 1, a 1 flag bit is followed by count-2
+ n = _DAT->getNumber() + 2;
+ } else {
+ n = _DAT->getBits(32);
+ }
+
+ return(n);
+ };
+
+ // Why not bool? Seems like the bitPackedFile is incompatible
+ // with bools.
+ uint32 _idxIsPacked;
+ uint32 _datIsPacked;
+ uint32 _posIsPacked;
+
+ uint32 _merSizeInBits;
+ uint32 _merCompression;
+ uint32 _prefixSize; // bits of each mer that select its bucket
+ uint32 _merDataSize; // bits of each mer stored in the data file: _merSizeInBits - _prefixSize
+ uint64 _thisBucket;
+ uint64 _thisBucketSize; // mers not yet read from the current bucket
+ uint64 _numBuckets;
+
+ kMer _thisMer;
+ uint64 _thisMerCount;
+
+ uint32 _thisMerPositionsMax; // allocated length of _thisMerPositions
+ uint32 *_thisMerPositions;
+
+ uint64 _numUnique;
+ uint64 _numDistinct;
+ uint64 _numTotal;
+
+ uint64 _histogramHuge; // number that are bigger than Len
+ uint64 _histogramLen; // number of entries in the histo
+ uint64 _histogramMaxValue; // highest count ever seen
+ uint64 *_histogram;
+
+ bool _validMer; // false once nextMer() has run off the end
+};
+
+
+class merylStreamWriter { // Writes a meryl database from mers supplied in increasing order; the files are finalized by the destructor.
+public:
+ merylStreamWriter(const char *filePrefix,
+ uint32 merSize, // In bases
+ uint32 merComp, // A length, bases
+ uint32 prefixSize, // In bits
+ bool positionsEnabled);
+ ~merylStreamWriter();
+ // Two ways to add a mer; a single writer must use only one of them (enforced by asserts in the implementation).
+ void addMer(kMer &mer, uint32 count=1, uint32 *positions=0L);
+ void addMer(uint64 prefix, uint32 prefixBits,
+ uint64 mer, uint32 merBits,
+ uint32 count=1,
+ uint32 *positions=0L);
+
+private:
+ void writeMer(void); // flushes the pending mer, if its count is non-zero
+ // Writes one bucket size to the index file, in packed or fixed 32-bit form.
+ void setIDXnumber(uint64 n) {
+ if (_idxIsPacked)
+ _IDX->putNumber(n);
+ else
+ _IDX->putBits(n, 32);
+ };
+ void setDATnumber(uint64 n) { // Writes one mer count to the data file.
+ if (_datIsPacked) {
+ if (n == 1) { // packed form: a single 0 bit encodes a count of 1
+ _DAT->putBits(uint64ZERO, 1);
+ } else { // otherwise a 1 bit followed by n-2
+ _DAT->putBits(uint64ONE, 1);
+ _DAT->putNumber(n-2);
+ }
+ } else {
+ _DAT->putBits(n, 32);
+ }
+ };
+
+
+ bitPackedFile *_IDX;
+ bitPackedFile *_DAT;
+ bitPackedFile *_POS; // 0L unless positions were enabled
+
+ uint32 _idxIsPacked;
+ uint32 _datIsPacked;
+ uint32 _posIsPacked;
+
+ uint32 _merSizeInBits;
+ uint32 _merCompression;
+ uint32 _prefixSize;
+ uint32 _merDataSize;
+ uint64 _thisBucket;
+ uint64 _thisBucketSize; // mers written so far into the current bucket
+ uint64 _numBuckets;
+
+ uint64 _numUnique;
+ uint64 _numDistinct;
+ uint64 _numTotal;
+
+ uint64 _histogramHuge; // number that are bigger than Len
+ uint64 _histogramLen; // number of entries in the histo
+ uint64 _histogramMaxValue; // highest count ever seen
+ uint64 *_histogram;
+
+ bool _thisMerIsBits; // set by the first addMer(prefix, ..., mer, ...) call
+ bool _thisMerIskMer; // set by the first addMer(kMer&, ...) call
+
+ kMer _thisMer; // pending mer when the kMer flavour is in use
+
+ uint64 _thisMerPre; // pending mer (prefix and remaining bits) when the bits flavour is in use
+ uint64 _thisMerMer;
+
+ uint32 _thisMerPreSize;
+ uint32 _thisMerMerSize;
+
+ uint64 _thisMerCount; // occurrences accumulated for the pending mer
+};
+
+#endif // LIBMERYL_H
diff --git a/libseq/Make.include b/libseq/Make.include
new file mode 100644
index 0000000..3b87034
--- /dev/null
+++ b/libseq/Make.include
@@ -0,0 +1,34 @@
+# -*- makefile -*-
+# Paths of the sibling library directories, resolved relative to this directory.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+# Headers and sources that make up the library.
+src := $/seqFile.H \
+ $/fastaFile.H $/fastaFile.C \
+ $/fastaStdin.H $/fastaStdin.C \
+ $/fastqFile.H $/fastqFile.C \
+ $/fastqStdin.H $/fastqStdin.C \
+ $/seqStore.H $/seqStore.C \
+ $/sffFile.H $/sffFile.C \
+ $/seqFactory.H $/seqFactory.C \
+ $/seqCache.H $/seqCache.C \
+ $/seqStream.H $/seqStream.C \
+ $/merStream.H $/merStream.C
+# Library sources plus the three test programs.
+$/.CXX_SRCS :=$(filter %.C,${src}) $/test-seqCache.C $/test-seqStream.C $/test-merStream.C
+$/.CXX_INCS :=$(filter %.H,${src})
+$/.CXX_EXES :=$/test-seqCache $/test-seqStream $/test-merStream
+$/.CXX_LIBS :=$/libseq.a
+
+$/.CLEAN := $/*.o
+# NOTE(review): $/.C_SRCS is not set in this file, and the .CXX_SRCS list above also contains the test sources -- confirm both are intended.
+$/libseq.a : ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o}
+# Each test program links against libseq, libbio and libutil.
+$/test-seqCache : $/test-seqCache.o ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/test-seqStream : $/test-seqStream.o ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/test-merStream : $/test-merStream.o ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+# Add the library directories to the include path for this directory's .d and .o targets.
+$(eval $/%.d $/%.o: CFLAGS += -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/})
+$(eval $/%.d $/%.o: CXXFLAGS += -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/})
+
diff --git a/libseq/fastaFile.C b/libseq/fastaFile.C
new file mode 100644
index 0000000..a55bf38
--- /dev/null
+++ b/libseq/fastaFile.C
@@ -0,0 +1,585 @@
+#include "fastaFile.H"
+#include "alphabet.h"
+
+
+#undef DEBUG
+#undef DEBUGINDEX
+
+// Says 'kmerFastaFileIdx'
+#define FASTA_MAGICNUMBER1 0x7473614672656d6bULL
+#define FASTA_MAGICNUMBER2 0x786449656c694661ULL
+
+
+fastaFile::fastaFile(const char *filename) {  // Opens 'filename', loading or building its index first.
+  clear();
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaFile::fastaFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+  if ((filename == 0L) || (strlen(filename) >= FILENAME_MAX))  // clear() treats _filename as FILENAME_MAX bytes
+    fprintf(stderr, "fastaFile::fastaFile()-- ERROR: filename is missing or too long.\n"), exit(1);
+  strcpy(_filename, filename);
+  constructIndex();
+
+  _rb = new readBuffer(_filename);
+
+  _numberOfSequences = _header._numberOfSequences;
+}
+
+
+
+fastaFile::fastaFile() { // Creates an empty object with no file attached.
+ clear();
+}
+
+
+
+fastaFile::~fastaFile() {  // Frees the index tables and the read buffer; all three are 0L on an empty object.
+  delete [] _names;
+  delete [] _index;
+  delete _rb;
+}
+
+
+
+
+
+seqFile *
+fastaFile::openFile(const char *filename) {
+  struct stat  st;
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaFile::openFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  // Returns a new fastaFile when 'filename' names a regular file that is empty or whose
+  // first non-whitespace character is '>'; returns 0L otherwise.  No name at all, or "-",
+  // means stdin, which is not opened here (and a NULL name must never reach stat()).
+  //
+  if ((filename == 0L) || ((filename[0] == '-') && (filename[1] == 0)))
+    return(0L);
+
+  // Judge stat() by its return value, and the file type with S_ISREG(): masking st_mode
+  // with S_IFREG also lets through other file types whose type code contains that bit.
+  //
+  if (stat(filename, &st) != 0)
+    return(0L);
+  if (!S_ISREG(st.st_mode))
+    return(0L);
+
+  // Use of a readBuffer here is a bit heavyweight, but it's safe and
+  // easy.  Opening a fastaFile isn't, after all, lightweight anyway.
+  //
+  fastaFile  *f = 0L;
+  readBuffer *r = new readBuffer(filename);
+  char        x = r->read();
+
+  while ((r->eof() == false) && (whitespaceSymbol[x] == true))
+    x = r->read();
+
+  // If we get a fasta record separator assume it's a fasta file.  If
+  // it's eof, the file is empty, and we might as well return this
+  // fasta file and let the client deal with the lack of sequence.
+  //
+  if ((x == '>') || (r->eof() == true))
+    f = new fastaFile(filename);
+
+  delete r;
+
+  return(f);
+}
+
+
+
+uint32
+fastaFile::find(const char *sequencename) {
+  // Linear search of the name table for an exact match on 'sequencename'.
+  // Returns the index of the first sequence with that name, or ~uint32ZERO
+  // when there is none.
+  //
+  // _names holds one NUL-terminated name per sequence, back to back, in
+  // sequence order.  If this proves far too slow, separate the names with
+  // 0xff instead and strstr() the whole table, counting 0xff's to recover
+  // the index.  Similar code is used for seqStore::find().
+  //
+  const char *name = _names;
+  uint32      iid  = 0;
+
+  while (iid < _header._numberOfSequences) {
+    if (strcmp(sequencename, name) == 0)
+      return(iid);
+    name += strlen(name) + 1;   // step over this name and its NUL
+    iid++;
+  }
+  return(~uint32ZERO);
+}
+
+
+
+uint32
+fastaFile::getSequenceLength(uint32 iid) {  // Indexed length of sequence 'iid'; 0 when 'iid' is out of range.
+#ifdef DEBUG
+  fprintf(stderr, "fastaFile::getSequenceLength()-- "uint32FMT"\n", iid);
+#endif
+  if (iid >= _numberOfSequences)
+    return(0);
+  return(_index[iid]._seqLength);
+}
+
+
+
+bool
+fastaFile::getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax) {
+ // Reads defline (h) and bases (s) of sequence 'iid', growing either buffer as needed.  Returns false for a bad iid or if EOF is hit before the record.
+#ifdef DEBUG
+ fprintf(stderr, "fastaFile::getSequence(full)-- "uint32FMT"\n", iid);
+#endif
+
+ // Assume there is no index. Without being horribly complicated
+ // (as in the previous versions of this codebase) all we'd get from
+ // having an index around is the length of the sequence.
+ //
+ // Previous versions used to use the index to tell if the sequence
+ // was squeezed (and so a direct copy to the output), if it was
+ // fixed width (mostly direct copies) or unknown. Now we just
+ // assume it's unknown and go byte by byte. If speed is a concern,
+ // use the seqFile instead.
+
+ if (iid >= _header._numberOfSequences) {
+ fprintf(stderr, "fastaFile::getSequence(full)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+ iid, _header._numberOfSequences);
+ return(false);
+ }
+ // A capacity of zero means the caller passed no buffer; allocate a default one.
+ if (sMax == 0) {
+ sMax = 2048;
+ s = new char [sMax];
+ }
+
+ if (hMax == 0) {
+ hMax = 2048;
+ h = new char [hMax];
+ }
+ // Pre-size the sequence buffer from the indexed length; the old contents are not needed.
+ if ((_index) && (sMax < _index[iid]._seqLength)) {
+ sMax = _index[iid]._seqLength;
+ delete [] s;
+ s = new char [sMax];
+ }
+
+ hLen = 0;
+ sLen = 0;
+
+#ifdef DEBUG
+ fprintf(stderr, "fastaFile::getSequence(full)-- seek to iid="uint32FMT" at pos="uint32FMT"\n",
+ iid, _index[iid]._seqPosition);
+#endif
+ _rb->seek(_index[iid]._seqPosition);
+
+ char x = _rb->read();
+
+ // Skip whitespace at the start of the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // We should be at a '>' character now. Fail if not.
+ if (_rb->eof())
+ return(false);
+ if (x != '>')
+ fprintf(stderr, "fastaFile::getSequence(full)-- ERROR1: In %s, expected '>' at beginning of defline, got '%c' instead.\n",
+ _filename, x), exit(1);
+
+ // Skip the '>' in the defline
+ x = _rb->read();
+
+ // Skip whitespace between the '>' and the defline
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true) && (x != '\r') && (x != '\n'))
+ x = _rb->read();
+
+ // Copy the defline, until the first newline.
+ while ((_rb->eof() == false) && (x != '\r') && (x != '\n')) {
+ h[hLen++] = x;
+ if (hLen >= hMax) { // grow as soon as the buffer is full, so the terminating NUL always fits
+ hMax += 2048;
+ char *H = new char [hMax];
+ memcpy(H, h, hLen);
+ delete [] h;
+ h = H;
+ }
+ x = _rb->read();
+ }
+ h[hLen] = 0;
+
+ // Skip whitespace between the defline and the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // Copy the sequence, until EOF or the next '>'.
+ while ((_rb->eof() == false) && (_rb->peek() != '>')) { // x is the current character; peek() looks at the one after it
+ if (whitespaceSymbol[x] == false) {
+ s[sLen++] = x;
+ if (sLen >= sMax) { // full: double the buffer, capping the capacity at 4G - 1
+ if (sMax == 4294967295) // 4G - 1
+ fprintf(stderr, "fastaFile::getSequence()-- ERROR: sequence is too long; must be less than 4 Gbp.\n"), exit(1);
+ if (sMax >= 2147483648) // 2G
+ sMax = 4294967295;
+ else
+ sMax *= 2;
+ char *S = new char [sMax];
+ memcpy(S, s, sLen);
+ delete [] s;
+ s = S;
+ }
+ }
+ x = _rb->read();
+ }
+ s[sLen] = 0;
+
+ _nextID++;
+
+ return(true);
+}
+
+
+// slow
+bool
+fastaFile::getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s) {
+ // Copies bases [bgn, end) of sequence 'iid' into 's' and NUL-terminates it.  Returns true only if 'end' was reached.
+ if (iid >= _header._numberOfSequences) {
+ fprintf(stderr, "fastaFile::getSequence(part)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+ iid, _header._numberOfSequences);
+ return(false);
+ }
+
+#ifdef DEBUG
+ fprintf(stderr, "fastaFile::getSequence(part)-- "uint32FMT"\n", iid);
+#endif
+
+ // It is impossible to be efficient here; see the big comment in
+ // the other getSequence() above.
+ //
+ // We can't even guess where to start scanning the sequence; we
+ // just don't have any information about how much whitespace is in
+ // the sequence.
+
+ _rb->seek(_index[iid]._seqPosition);
+
+ uint32 pos = 0; // number of non-whitespace sequence characters consumed so far
+ char x = _rb->read();
+
+ // Skip whitespace at the start of the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // We should be at a '>' character now. Fail if not.
+ if (_rb->eof())
+ return(false);
+ if (x != '>')
+ fprintf(stderr, "fastaFile::getSequence(part)-- ERROR2: In %s, expected '>' at beginning of defline, got '%c' instead.\n",
+ _filename, x), exit(1);
+
+ // Skip the defline.
+ while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+ x = _rb->read();
+
+ // Skip whitespace between the defline and the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // Skip sequence up until bgn.
+ while ((_rb->eof() == false) && (pos < bgn)) {
+ if (whitespaceSymbol[x] == false)
+ pos++;
+
+ x = _rb->read();
+ }
+
+ // Copy sequence
+ while ((_rb->eof() == false) && (pos < end)) { // NOTE(review): nothing here stops at the next '>' -- confirm callers keep 'end' within the sequence
+ if (whitespaceSymbol[x] == false)
+ s[pos++ - bgn] = x;
+
+ x = _rb->read();
+ }
+ s[pos - bgn] = 0;
+
+ // Fail if we didn't copy enough stuff.
+ return((pos == end) ? true : false);
+}
+
+
+
+
+void
+fastaFile::clear(void) {  // Puts every member into its empty state; called first by both constructors.
+  memset(_filename, 0, FILENAME_MAX);
+  memset(_typename, 0, FILENAME_MAX);
+  strcpy(_typename, "FastA");
+
+  _numberOfSequences = 0;
+  _rb     = 0L;
+  memset(&_header, 0, sizeof(fastaFileHeader));
+  _index  = 0L;
+  _names  = 0L;
+  _nextID = 0;
+  _gs_iid = 0;  // These two were previously left uninitialized; nothing in this file reads them.
+  _gs_pos = 0;
+}
+
+
+
+void
+fastaFile::loadIndex(char *indexname) {
+  // Loads a saved index from 'indexname' into _header, _index and _names.  On any problem --
+  // missing file, bad magic, size mismatch, short read -- _index stays 0L, which makes
+  // constructIndex() rebuild the index from the fasta file.
+  struct stat  fastastat;
+
+  if (fileExists(indexname) == false)
+    return;
+
+  errno = 0;
+  if (stat(_filename, &fastastat)) {
+    fprintf(stderr, "fastaFile::constructIndex()-- stat of file '%s' failed: %s\n",
+            _filename, strerror(errno));
+    return;
+  }
+
+  FILE *I = fopen(indexname, "r");
+  if (I == 0L) {   // test the stream itself; errno is only meaningful after a failure
+    fprintf(stderr, "fastaFile::constructIndex()-- open of file '%s' failed: %s\n",
+            indexname, strerror(errno));
+    return;
+  }
+  // Reject the file if the header is short or if EITHER magic word is wrong.
+  if ((fread(&_header, sizeof(fastaFileHeader), 1, I) != 1) ||
+      (_header._magic[0] != FASTA_MAGICNUMBER1) ||
+      (_header._magic[1] != FASTA_MAGICNUMBER2)) {
+    fprintf(stderr, "fastaFile::constructIndex()-- magic mismatch.\n");
+    fclose(I);
+    return;
+  }
+#if 0
+  (_header._fastaModificationTime != (uint64)fastastat.st_mtime)
+  (_header._fastaCreationTime != (uint64)fastastat.st_ctime)
+#endif
+  if (_header._fastaFileSize != (uint64)fastastat.st_size) {
+    fprintf(stderr, "fastaFile::constructIndex()-- stat mismatch.\n");
+    fclose(I);
+    return;
+  }
+
+  _index = new fastaFileIndex [_header._numberOfSequences];
+  _names = new char           [_header._namesLength];
+  if ((fread(_index, sizeof(fastaFileIndex), _header._numberOfSequences, I) != _header._numberOfSequences) ||
+      (fread(_names, sizeof(char),           _header._namesLength,       I) != _header._namesLength)) {
+    delete [] _index;  _index = 0L;   // truncated index file: discard what was read
+    delete [] _names;  _names = 0L;
+  }
+#ifdef DEBUG
+  fprintf(stderr, "fastaFile::constructIndex()-- '%s' LOADED\n", _filename);
+#endif
+  fclose(I);
+}
+
+
+void
+fastaFile::constructIndex(void) {
+  // Fills _header, _index and _names: from a saved index file if a valid one exists, otherwise by scanning the fasta file.
+  if (_index)
+    return;
+
+  // If the filename ends in '.fasta' then append a 'idx',
+  // otherwise, append '.fastaidx'.
+
+  char indexname[FILENAME_MAX];
+
+  strcpy(indexname, _filename);
+  uint32 l = strlen(_filename);
+  if ((l > 5) && (strcmp(_filename + l - 6, ".fasta") == 0))
+    strcat(indexname, "idx");
+  else
+    strcat(indexname, ".fastaidx");
+
+  // If the index exists, suck it in and return.
+
+  loadIndex(indexname);
+
+  if (_index)
+    return;
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaFile::constructIndex()-- '%s' BUILDING\n", _filename);
+#endif
+
+  // Allocate some space for the index structures.
+
+  uint32 indexMax = 64 * 1024 * 1024 / sizeof(fastaFileIndex);
+  uint32 indexLen = 0;
+
+  _index = new fastaFileIndex [indexMax];
+
+  uint32 namesMax = 32 * 1024 * 1024;
+  uint32 namesLen = 0;
+
+  _names = new char [namesMax];
+
+  // Some local storage
+
+  uint64 seqStart;
+  uint32 seqLen;
+  uint32 seqLenMax = ~uint32ZERO;
+  uint32 namePos;
+
+  readBuffer ib(_filename);
+  char x = ib.read();
+
+#ifdef DEBUGINDEX
+  fprintf(stderr, "readBuffer '%s' eof=%d x=%c %d\n", _filename, ib.eof(), x, x);
+#endif
+
+  // Build it.
+
+  // Skip whitespace at the start of the sequence.
+  while ((ib.eof() == false) && (whitespaceSymbol[x] == true)) {
+#ifdef DEBUGINDEX
+    fprintf(stderr, "skip '%c' %d\n", x, x);
+#endif
+    x = ib.read();
+  }
+
+  while (ib.eof() == false) {
+#ifdef DEBUGINDEX
+    fprintf(stderr, "index\n");
+#endif
+
+    // We should be at a '>' character now. Fail if not.
+    if (x != '>')
+      fprintf(stderr, "fastaFile::constructIndex()-- ERROR3: In %s, expected '>' at beginning of defline, got '%c' instead.\n",
+              _filename, x), exit(1);
+
+    // Save info - ib's position is correctly at the first letter in
+    // the defline (which might be whitespace), but the reader
+    // expects our position to be at the '>' -- hence the -1.
+    seqStart = ib.tell() - 1;
+    seqLen = 0;
+    namePos = namesLen;
+
+    // Read that first letter
+    x = ib.read();
+
+    // Copy the name to the names
+    while ((ib.eof() == false) && (whitespaceSymbol[x] == false)) {
+      if (namesLen + 1 >= namesMax) {
+        namesMax += 32 * 1024 * 1024;
+        char *nt = new char [namesMax];
+        memcpy(nt, _names, namesLen);
+        delete [] _names;
+        _names = nt;
+      }
+
+      _names[namesLen++] = x;
+#ifdef DEBUGINDEX
+      fprintf(stderr, "name += %c\n", x);
+#endif
+      x = ib.read();
+    }
+
+    if (namesLen + 1 >= namesMax) {
+      namesMax += 32 * 1024 * 1024;
+      char *nt = new char [namesMax];
+      memcpy(nt, _names, namesLen);
+      delete [] _names;
+      _names = nt;
+    }
+    _names[namesLen++] = 0;
+
+    // Skip the rest of the defline
+    while ((ib.eof() == false) && (x != '\r') && (x != '\n')) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "skip let %c\n", x);
+#endif
+      x = ib.read();
+    }
+
+    // Skip whitespace between the defline and the sequence.
+    while ((ib.eof() == false) && (whitespaceSymbol[x] == true)) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "skip num %d\n", x);
+#endif
+      x = ib.read();
+    }
+
+#ifdef DEBUGINDEX
+    fprintf(stderr, "x=%c peek=%c\n", x, ib.peek());
+#endif
+
+    // Count sequence length
+    while ((ib.eof() == false) && (ib.peek() != '>')) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "seqlen %s %c\n", (whitespaceSymbol[x] == false) ? "save" : "skip", x);
+#endif
+      if (whitespaceSymbol[x] == false)
+        seqLen++;
+      if (seqLen >= seqLenMax)
+        fprintf(stderr, "fastaFile::constructIndex()-- ERROR: In %s, sequence '%s' is too long. Maximum length is %u bases.\n",
+                _filename, _names + namePos, seqLenMax), exit(1);
+      x = ib.read();
+    }
+
+    // Save to the index.
+
+    if (indexLen >= indexMax) {
+      indexMax *= 2;
+      fastaFileIndex *et = new fastaFileIndex[indexMax];
+      memcpy(et, _index, sizeof(fastaFileIndex) * indexLen);
+      delete [] _index;
+      _index = et;
+    }
+
+    _index[indexLen]._seqPosition = seqStart;
+    _index[indexLen]._seqLength = seqLen;
+
+#ifdef DEBUG
+    fprintf(stderr, "INDEX iid="uint32FMT" len="uint32FMT" pos="uint64FMT"\n",
+            indexLen, seqLen, seqStart);
+#endif
+
+    indexLen++;
+
+    // Load the '>' for the next iteration.
+    x = ib.read();
+  }
+
+  // Fill out the index meta data
+
+  struct stat fastastat;
+  errno = 0;
+  if (stat(_filename, &fastastat))
+    fprintf(stderr, "fastaFile::constructIndex()-- stat() of file '%s' failed: %s\n",
+            _filename, strerror(errno)), exit(1);
+
+  _header._magic[0] = FASTA_MAGICNUMBER1;
+  _header._magic[1] = FASTA_MAGICNUMBER2;
+  _header._numberOfSequences = indexLen;
+  _header._namesLength = namesLen;
+  _header._fastaFileSize = fastastat.st_size;
+  _header._fastaModificationTime = fastastat.st_mtime;
+  _header._fastaCreationTime = fastastat.st_ctime;
+
+  // Dump the index, if possible.  Success is judged by the stream pointer: errno may be
+  // non-zero after a successful fopen(), which would skip the save and leak the stream.
+  //
+  FILE *I = fopen(indexname, "w");
+  if (I == 0L)
+    return;
+
+  fwrite(&_header, sizeof(fastaFileHeader), 1, I);
+  fwrite( _index, sizeof(fastaFileIndex), _header._numberOfSequences, I);
+  fwrite( _names, sizeof(char), _header._namesLength, I);
+
+  fclose(I);
+}
diff --git a/libseq/fastaFile.H b/libseq/fastaFile.H
new file mode 100644
index 0000000..775853d
--- /dev/null
+++ b/libseq/fastaFile.H
@@ -0,0 +1,66 @@
+#ifndef FASTAFILE_H
+#define FASTAFILE_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+// Header record of a FASTA index file. The index builder writes this struct
+// verbatim with fwrite(), so member order and sizes define the file layout.
+struct fastaFileHeader {
+ uint64 _magic[2]; // Filled with FASTA_MAGICNUMBER1 / FASTA_MAGICNUMBER2 when the index is built
+ uint32 _numberOfSequences; // Number of sequences in the file
+ uint32 _namesLength; // Bytes in the names
+ uint64 _fastaFileSize; // st_size - size of file in bytes
+ uint64 _fastaModificationTime; // st_mtime - time of last data modification
+ uint64 _fastaCreationTime; // st_ctime - time of last file status change (not creation, despite the name)
+};
+
+
+// One index record per sequence. The array of these records is written
+// verbatim with fwrite() directly after the header.
+// NOTE(review): sizeof() presumably includes tail padding after _seqLength,
+// which would then be part of the on-disk format -- confirm before changing.
+struct fastaFileIndex {
+ uint64 _seqPosition; // Position of the sequence in the file
+ uint32 _seqLength; // Length of the sequence (no whitespace counted)
+};
+
+
+// seqFile implementation for FASTA files. It keeps a per-sequence index
+// (file position and length) plus the sequence names.
+//
+// Constructors and openFile() are protected; seqFactory is a friend.
+class fastaFile : public seqFile {
+protected:
+ fastaFile(const char *filename);
+ fastaFile();
+
+public:
+ ~fastaFile();
+
+protected:
+ seqFile *openFile(const char *filename);
+
+public:
+ uint32 find(const char *sequencename);
+
+ uint32 getSequenceLength(uint32 iid);
+ // Whole-sequence form: h/s are caller-owned buffers passed by reference
+ // together with their length and capacity.
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+ // Sub-range form: copies into the caller-supplied buffer s.
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+ void loadIndex(char *indexname);
+ void constructIndex(void);
+
+ readBuffer *_rb;
+
+ fastaFileHeader _header; // Index meta data; also the on-disk index header
+ fastaFileIndex *_index; // One entry per sequence
+ char *_names; // Sequence names; _header._namesLength bytes
+
+ uint32 _nextID; // Next sequence in the read buffer
+
+ // NOTE(review): the use of these two members is not visible in this part
+ // of the patch.
+ uint32 _gs_iid;
+ uint32 _gs_pos;
+
+ friend class seqFactory;
+};
+
+
+#endif // FASTAFILE_H
diff --git a/libseq/fastaStdin.C b/libseq/fastaStdin.C
new file mode 100644
index 0000000..f3b3534
--- /dev/null
+++ b/libseq/fastaStdin.C
@@ -0,0 +1,265 @@
+#include "fastaStdin.H"
+#include "alphabet.h"
+
+// Build a streaming FASTA reader.
+//
+//   filename == NULL -- read standard input; the source is named "(stdin)".
+//   otherwise        -- 'filename' is a shell command line (openFile() passes
+//                       a decompressor invocation). It is started with
+//                       popen() and its standard output is read.
+//
+// Exits with an error message if the command cannot be started.
+//
+fastaStdin::fastaStdin(const char *filename) {
+  clear();
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaStdin::fastaStdin()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  if (filename == 0L) {
+    strcpy(_filename, "(stdin)");
+    _rb = new readBuffer("-");
+    return;
+  }
+
+  // popen() returns NULL when the pipe or the child process cannot be
+  // created; a NULL stream must never be handed on to readBuffer.
+  errno = 0;
+  _pipe = popen(filename, "r");
+  if (_pipe == 0L)
+    fprintf(stderr, "fastaStdin::fastaStdin()-- ERROR: failed to run '%s': %s\n",
+            filename, strerror(errno)), exit(1);
+
+  _rb = new readBuffer(_pipe);
+}
+
+
+// Default constructor: every member is reset by clear(); no input is attached.
+fastaStdin::fastaStdin()
+{
+  clear();
+}
+
+
+
+// Release the cached defline and sequence buffers and the read buffer.
+// NOTE(review): _pipe, when opened with popen(), is not pclose()d here --
+// confirm whether readBuffer takes ownership of the stream.
+fastaStdin::~fastaStdin()
+{
+  delete [] _sequence;
+  delete [] _header;
+  delete    _rb;
+}
+
+
+
+// Decide whether this streaming reader should handle 'filename'.
+//
+// Returns a new fastaStdin when
+//   - filename is NULL and stdin is not a terminal, or filename is "-"
+//     (standard input is read directly), or
+//   - filename ends in ".gz", ".bz2" or ".xz" (the file is read through
+//     "gzip -dc", "bzip2 -dc" or "xz -dc").
+// Returns NULL in every other case.
+//
+seqFile *
+fastaStdin::openFile(const char *filename) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaStdin::openFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  if (((filename == 0L) && (isatty(fileno(stdin)) == 0)) ||
+      ((filename != 0L) && (filename[0] == '-') && (filename[1] == 0)))
+    return(new fastaStdin(0L));
+
+  if (filename == 0L)
+    return(0L);
+
+  uint32       fl   = strlen(filename);
+  const char  *tool = 0L;
+
+  // Only look at a suffix the name is long enough to contain. Without the
+  // length tests, fl-3 and fl-4 wrap around for short names and index far
+  // outside the string.
+  if      ((fl >= 3) && (strcmp(filename + fl - 3, ".gz") == 0))
+    tool = "gzip -dc";
+  else if ((fl >= 4) && (strcmp(filename + fl - 4, ".bz2") == 0))
+    tool = "bzip2 -dc";
+  else if ((fl >= 3) && (strcmp(filename + fl - 3, ".xz") == 0))
+    tool = "xz -dc";
+
+  if (tool == 0L)
+    return(0L);
+
+  // NOTE(review): the name is pasted into a shell command unquoted, so names
+  // containing spaces or shell metacharacters will not work as intended.
+  char cmd[32 + fl];
+  sprintf(cmd, "%s %s", tool, filename);
+
+  return(new fastaStdin(cmd));
+}
+
+
+
+// Number of sequences known so far: those already loaded, plus one more
+// when peek() on the read buffer returns something other than zero.
+uint32
+fastaStdin::getNumberOfSequences(void) {
+  bool pending = (_rb->peek() != 0);
+
+  return((pending) ? (_nextIID + 1) : (_nextIID));
+}
+
+
+// Lookup by name needs random access, which a stream does not offer.
+// Reports the misuse, asserts, and returns the all-ones "not found" value.
+uint32
+fastaStdin::find(const char *sequencename)
+{
+  fprintf(stderr, "fastaStdin::find()-- ERROR! Used for random access.\n");
+  assert(0);
+
+  return(~uint32ZERO);
+}
+
+
+
+// Length of sequence 'iid'. Only the next unread sequence (it is loaded
+// first) or the most recently loaded one may be asked for; anything else is
+// reported and asserted on. Returns 0 when no further sequence can be loaded.
+uint32
+fastaStdin::getSequenceLength(uint32 iid) {
+  bool needLoad = (iid == _nextIID);
+
+  if ((needLoad) &&
+      (loadNextSequence(_header, _headerLen, _headerMax, _sequence, _sequenceLen, _sequenceMax) == false))
+    return(0);
+
+  // After a load, _nextIID is one past the sequence currently cached.
+  if (_nextIID != iid + 1) {
+    fprintf(stderr, "fastaStdin::getSequenceLength()-- ERROR! Used for random access. Requested iid=%u, at iid=%u\n",
+            iid, _nextIID);
+    assert(0);
+  }
+
+  return(strlen(_sequence));
+}
+
+
+
+// Copy sequence 'iid' (defline into h, bases into s) to the caller.
+//
+// Only the next unread sequence (it is loaded first) or the most recently
+// loaded one may be requested; anything else is reported and asserted on.
+//
+//   h, hLen, hMax -- caller's defline buffer, its length and capacity.
+//   s, sLen, sMax -- caller's sequence buffer, its length and capacity.
+//
+// A buffer is reallocated when it is missing or when its capacity is
+// smaller than the cached copy's capacity. Returns false when no further
+// sequence can be loaded, true otherwise.
+//
+bool
+fastaStdin::getSequence(uint32 iid,
+                        char *&h, uint32 &hLen, uint32 &hMax,
+                        char *&s, uint32 &sLen, uint32 &sMax) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastaStdin::getSequence(full)-- "uint32FMT"\n", iid);
+#endif
+
+  if (iid == _nextIID)
+    if (loadNextSequence(_header, _headerLen, _headerMax, _sequence, _sequenceLen, _sequenceMax) == false)
+      return(false);
+
+  if (iid + 1 != _nextIID) {
+    fprintf(stderr, "fastaStdin::getSequence(full)-- ERROR! Used for random access. Requested iid=%u, at iid=%u\n",
+            iid, _nextIID);
+    assert(0);
+  }
+
+  // Compare capacities, not lengths. Testing hLen/sLen made the buffers
+  // look too small on every call, because a length is always below the
+  // capacity, so both were freed and reallocated for each sequence.
+  if ((h == 0L) || (hMax < _headerMax)) {
+    delete [] h;
+    hMax = _headerMax;
+    h    = new char [hMax];
+  }
+
+  if ((s == 0L) || (sMax < _sequenceMax)) {
+    delete [] s;
+    sMax = _sequenceMax;
+    s    = new char [sMax];
+  }
+
+  // The +1 carries the terminating NUL along.
+  memcpy(h, _header, _headerLen + 1);
+  hLen = _headerLen;
+
+  memcpy(s, _sequence, _sequenceLen + 1);
+  sLen = _sequenceLen;
+
+  return(true);
+}
+
+
+
+// Sub-range retrieval needs random access, which a stream does not offer.
+// Reports the misuse, asserts, and returns false.
+bool
+fastaStdin::getSequence(uint32 iid,
+                        uint32 bgn, uint32 end, char *s)
+{
+#ifdef DEBUG
+  fprintf(stderr, "fastaStdin::getSequence(part)-- "uint32FMT"\n", iid);
+#endif
+
+  fprintf(stderr, "fastaStdin::getSequence(part)-- ERROR! Used for random access.\n");
+  assert(0);
+
+  return(false);
+}
+
+
+
+// Put every member into its "nothing attached" state. Pointers are only
+// reset, never freed, so this is meant for freshly constructed objects.
+void
+fastaStdin::clear(void) {
+
+  // Names: zero both buffers, then label the type.
+  memset(_filename, 0, FILENAME_MAX);
+  memset(_typename, 0, FILENAME_MAX);
+  strcpy(_typename, "FastAstream");
+
+  // A stream has no random access and no known sequence count.
+  _randomAccessSupported = false;
+  _numberOfSequences     = ~uint32ZERO;
+
+  // Input source.
+  _rb      = 0L;
+  _pipe    = 0L;
+  _nextIID = 0;
+
+  // Cached defline.
+  _header    = 0L;
+  _headerLen = 0;
+  _headerMax = 0;
+
+  // Cached sequence.
+  _sequence    = 0L;
+  _sequenceLen = 0;
+  _sequenceMax = 0;
+}
+
+
+
+// Read the next FASTA record from _rb into h (the defline, without the '>')
+// and s (the sequence, whitespace removed). Both buffers are allocated on
+// first use, grown as needed, and NUL terminated; hLen and sLen receive the
+// lengths.
+//
+// Returns false when the input ends before another record starts. Otherwise
+// increments _nextIID and returns true. Exits the program when the first
+// non-whitespace character is not '>'.
+//
+bool
+fastaStdin::loadNextSequence(char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax) {
+
+ // First use: give both buffers an initial capacity.
+ if (hMax == 0) {
+ hMax = 2048;
+ h = new char [hMax];
+ }
+
+ if (sMax == 0) {
+ sMax = 2048;
+ s = new char [sMax];
+ }
+
+ hLen = 0;
+ sLen = 0;
+
+ // x always holds the most recently read character.
+ // NOTE(review): x is a plain char used as an array index below; if char is
+ // signed and the input has bytes >= 0x80 the index is negative -- confirm
+ // how whitespaceSymbol is declared.
+ char x = _rb->read();
+
+ // Skip whitespace at the start of the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // We should be at a '>' character now. Fail if not.
+ if (_rb->eof() == true)
+ return(false);
+ if (x != '>')
+ fprintf(stderr, "fastaStdin::loadNextSequence(part)-- ERROR: In %s, expected '>' at beginning of defline, got '%c' instead.\n",
+ _filename, x), exit(1);
+
+ // Skip the '>' in the defline
+ x = _rb->read();
+
+ // Skip whitespace between the '>' and the defline
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true) && (x != '\r') && (x != '\n'))
+ x = _rb->read();
+
+ // Copy the defline, until the first newline.
+ while ((_rb->eof() == false) && (x != '\r') && (x != '\n')) {
+ h[hLen++] = x;
+ // Grow before the buffer is full so the terminating NUL always fits.
+ if (hLen >= hMax) {
+ //fprintf(stderr, "realloc header\n");
+ hMax += 2048;
+ char *H = new char [hMax];
+ memcpy(H, h, hLen);
+ delete [] h;
+ h = H;
+ }
+ x = _rb->read();
+ }
+ h[hLen] = 0;
+
+ // Skip whitespace between the defline and the sequence.
+ while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+ x = _rb->read();
+
+ // Copy the sequence, until EOF or the next '>'.
+ // The loop tests the NEXT character with peek(), so the '>' itself stays
+ // unread for the following call. The character held in x when the loop
+ // stops is not stored (presumably the line terminator before the '>').
+ while ((_rb->eof() == false) && (_rb->peek() != '>')) {
+ if (whitespaceSymbol[x] == false) {
+ s[sLen++] = x;
+ // Grow before the buffer is full so the terminating NUL always fits.
+ if (sLen >= sMax) {
+ //fprintf(stderr, "realloc sequence\n");
+ sMax *= 2;
+ char *S = new char [sMax];
+ memcpy(S, s, sLen);
+ delete [] s;
+ s = S;
+ }
+ }
+ x = _rb->read();
+ }
+ s[sLen] = 0;
+
+ _nextIID++;
+
+ return(true);
+}
diff --git a/libseq/fastaStdin.H b/libseq/fastaStdin.H
new file mode 100644
index 0000000..22cb110
--- /dev/null
+++ b/libseq/fastaStdin.H
@@ -0,0 +1,56 @@
+#ifndef FASTASTDIN_H
+#define FASTASTDIN_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+
+// seqFile implementation that streams FASTA records from standard input or
+// from the output of a decompression command. Sequences can only be read
+// in order; find() and the sub-range getSequence() report an error.
+//
+// Constructors and openFile() are protected; seqFactory is a friend.
+class fastaStdin : public seqFile {
+protected:
+ // filename == NULL reads stdin; otherwise filename is a command for popen().
+ fastaStdin(const char *filename);
+ fastaStdin();
+
+public:
+ ~fastaStdin();
+
+protected:
+ seqFile *openFile(const char *filename);
+
+public:
+ // Sequences loaded so far, plus one if more input appears to be pending.
+ uint32 getNumberOfSequences(void);
+
+public:
+ uint32 find(const char *sequencename);
+
+ uint32 getSequenceLength(uint32 iid);
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+ bool loadNextSequence(char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+
+ readBuffer *_rb;
+ uint32 _nextIID; // Incremented after each record loaded; the cached record is _nextIID - 1
+
+ FILE *_pipe; // popen() stream when reading through a command, else NULL
+
+ // Most recently loaded defline: buffer, length, capacity.
+ char *_header;
+ uint32 _headerLen;
+ uint32 _headerMax;
+
+ // Most recently loaded sequence: buffer, length, capacity.
+ char *_sequence;
+ uint32 _sequenceLen;
+ uint32 _sequenceMax;
+
+ friend class seqFactory;
+};
+
+
+#endif // FASTASTDIN_H
diff --git a/libseq/fastqFile.C b/libseq/fastqFile.C
new file mode 100644
index 0000000..8042582
--- /dev/null
+++ b/libseq/fastqFile.C
@@ -0,0 +1,593 @@
+#include "fastqFile.H"
+#include "alphabet.h"
+
+
+#undef DEBUG
+#undef DEBUGINDEX
+
+// The two words below spell 'kmerFastaFileIdx' ('Fasta', not 'Fastq').
+#define FASTQ_MAGICNUMBER1 0x7473614672656d6bULL
+#define FASTQ_MAGICNUMBER2 0x786449656c694661ULL
+
+
+// Open the FASTQ file 'filename': remember its name, load or build the
+// sequence index, then attach a read buffer for later sequence retrieval.
+fastqFile::fastqFile(const char *filename)
+{
+  clear();
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::fastqFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  strcpy(_filename, filename);
+
+  // The index supplies the header, and with it the sequence count.
+  constructIndex();
+  _numberOfSequences = _header._numberOfSequences;
+
+  _rb = new readBuffer(_filename);
+}
+
+
+
+// Default constructor: every member is reset by clear(); no file is attached.
+fastqFile::fastqFile()
+{
+  clear();
+}
+
+
+
+// Release the name table, the index and the read buffer.
+fastqFile::~fastqFile()
+{
+  delete [] _names;
+  delete [] _index;
+  delete    _rb;
+}
+
+
+
+
+
+// Decide whether 'filename' is a FASTQ file this class should handle.
+//
+// Returns a new fastqFile when filename names a regular file that is either
+// empty or has '@' as its first non-whitespace character. Returns NULL when
+// no name is given, when the name is "-", when stat() fails, when the file
+// is not a regular file, and when it starts with anything else.
+//
+seqFile *
+fastqFile::openFile(const char *filename) {
+  struct stat st;
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::openFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  // No name, or "-", means standard input, which this class does not read.
+  // Rejecting every NULL name here also keeps stat() below from being
+  // called with a null pointer.
+  if ((filename == 0L) ||
+      ((filename[0] == '-') && (filename[1] == 0)))
+    return(0L);
+
+  // Use stat()'s return value; errno is only meaningful after a failure.
+  // S_ISREG tests the whole file-type field, not just one bit of it.
+  if (stat(filename, &st) != 0)
+    return(0L);
+  if (S_ISREG(st.st_mode) == 0)
+    return(0L);
+
+  // Otherwise, open and see if we can get the first sequence. We
+  // assume it's fastq if we find a '@' denoting a defline the first
+  // thing in the file.
+  //
+  // Use of a readBuffer here is a bit heavyweight, but it's safe and
+  // easy. Opening a fastqFile isn't, after all, lightweight anyway.
+  //
+  fastqFile   *f = 0L;
+  readBuffer  *r = new readBuffer(filename);
+  char         x = r->read();
+
+  while ((r->eof() == false) && (whitespaceSymbol[x] == true))
+    x = r->read();
+
+  // If we get a fastq record separator assume it's a fastq file. If
+  // it's eof, the file is empty, and we might as well return this
+  // fastq file and let the client deal with the lack of sequence.
+  //
+  if ((x == '@') || (r->eof() == true))
+    f = new fastqFile(filename);
+
+  delete r;
+
+  return(f);
+}
+
+
+
+// Return the index of the sequence whose name equals 'sequencename', or the
+// all-ones value when there is none.
+//
+// _names holds the names back to back, each NUL terminated, in sequence
+// order, so a linear walk visits them by iid.
+//
+// If this proves far too slow, rewrite the _names string to separate IDs
+// with 0xff, then use strstr on the whole thing. Similar code is used for
+// seqStore::find().
+//
+uint32
+fastqFile::find(const char *sequencename) {
+  const char *name = _names;
+
+  for (uint32 iid=0; iid < _header._numberOfSequences; iid++) {
+    if (strcmp(sequencename, name) == 0)
+      return(iid);
+
+    // Step over this name and its terminator.
+    name += strlen(name) + 1;
+  }
+
+  return(~uint32ZERO);
+}
+
+
+
+// Length in bases of sequence 'iid' as recorded in the index; 0 when iid is
+// out of range.
+uint32
+fastqFile::getSequenceLength(uint32 iid) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::getSequenceLength()-- "uint32FMT"\n", iid);
+#endif
+
+  if (iid >= _numberOfSequences)
+    return(0);
+
+  return(_index[iid]._seqLength);
+}
+
+
+
+// Load sequence 'iid' in full.
+//
+//   h, hLen, hMax -- defline buffer (without the '@'), its length, capacity.
+//   s, sLen, sMax -- sequence buffer (whitespace removed), length, capacity.
+//
+// Buffers are allocated when their capacity is 0 and grown as needed; both
+// results are NUL terminated. The quality lines are read past, not returned.
+//
+// Returns false when iid is out of range or the input ends before the
+// record starts. Exits the program when the record does not begin with '@'
+// or when the sequence cannot fit in a 32-bit length.
+//
+bool
+fastqFile::getSequence(uint32 iid,
+                       char *&h, uint32 &hLen, uint32 &hMax,
+                       char *&s, uint32 &sLen, uint32 &sMax) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::getSequence(full)-- "uint32FMT"\n", iid);
+#endif
+
+  if (iid >= _header._numberOfSequences) {
+    fprintf(stderr, "fastqFile::getSequence(full)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+            iid, _header._numberOfSequences);
+    return(false);
+  }
+
+  if (sMax == 0) {
+    sMax = 2048;
+    s    = new char [sMax];
+  }
+
+  if (hMax == 0) {
+    hMax = 2048;
+    h    = new char [hMax];
+  }
+
+  // Pre-size the sequence buffer from the index; the copy loop below still
+  // grows it when needed.
+  if ((_index) && (sMax < _index[iid]._seqLength)) {
+    sMax = _index[iid]._seqLength;
+    delete [] s;
+    s = new char [sMax];
+  }
+
+  hLen = 0;
+  sLen = 0;
+
+  // _seqPosition is 64 bits wide, so it is printed with uint64FMT.
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::getSequence(full)-- seek to iid="uint32FMT" at pos="uint64FMT"\n",
+          iid, _index[iid]._seqPosition);
+#endif
+  _rb->seek(_index[iid]._seqPosition);
+
+  char x = _rb->read();
+
+  // Skip whitespace at the start of the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // We should be at a '@' character now. Fail if not.
+  if (_rb->eof())
+    return(false);
+  if (x != '@')
+    fprintf(stderr, "fastqFile::getSequence(full)-- ERROR1: In %s, expected '@' at beginning of defline, got '%c' instead.\n",
+            _filename, x), exit(1);
+
+  // Skip the '@' in the defline
+  x = _rb->read();
+
+  // Skip whitespace between the '@' and the defline
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+
+  // Copy the defline, until the first newline. The buffer is grown before
+  // it is full so the terminating NUL always fits.
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n')) {
+    h[hLen++] = x;
+    if (hLen >= hMax) {
+      hMax += 2048;
+      char *H = new char [hMax];
+      memcpy(H, h, hLen);
+      delete [] h;
+      h = H;
+    }
+    x = _rb->read();
+  }
+  h[hLen] = 0;
+
+  // Skip whitespace between the defline and the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // Copy the sequence, until EOF or the start of the QV bases.
+  while ((_rb->eof() == false) && (x != '+')) {
+    if (whitespaceSymbol[x] == false) {
+      s[sLen++] = x;
+      if (sLen >= sMax) {
+        // Double the capacity, clamping at the largest 32-bit value.
+        if (sMax == 4294967295) // 4G - 1
+          fprintf(stderr, "fastqFile::getSequence()-- ERROR: sequence is too long; must be less than 4 Gbp.\n"), exit(1);
+        if (sMax >= 2147483648) // 2G
+          sMax = 4294967295;
+        else
+          sMax *= 2;
+        char *S = new char [sMax];
+        memcpy(S, s, sLen);
+        delete [] s;
+        s = S;
+      }
+    }
+    x = _rb->read();
+  }
+  s[sLen] = 0;
+
+  // Skip the rest of the QV id line and then the entire QV line.
+
+  assert((_rb->eof() == true) || (x == '+'));
+
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+  x = _rb->read();
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+
+  _nextID++;
+
+  return(true);
+}
+
+
+// slow
+// Copy bases [bgn, end) of sequence 'iid' into s and NUL terminate it.
+// s must have room for end - bgn + 1 characters.
+//
+// Returns true only when exactly end - bgn bases were copied. Returns false
+// (with s holding whatever was copied, NUL terminated) when iid is out of
+// range, when the record cannot be read, or when the sequence ends before
+// 'end'. Exits the program when the record does not begin with '@'.
+//
+bool
+fastqFile::getSequence(uint32 iid,
+                       uint32 bgn, uint32 end, char *s) {
+
+  if (iid >= _header._numberOfSequences) {
+    fprintf(stderr, "fastqFile::getSequence(part)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+            iid, _header._numberOfSequences);
+    return(false);
+  }
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::getSequence(part)-- "uint32FMT"\n", iid);
+#endif
+
+  // Unlike the fasta version of this, we know that all the sequence is on one line. However, we
+  // expect fastq sequences to be small, and we still do the same processing -- character by character.
+
+  _rb->seek(_index[iid]._seqPosition);
+
+  uint32 pos = 0;
+  char   x   = _rb->read();
+
+  // Skip whitespace at the start of the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // We should be at a '@' character now. Fail if not.
+  if (_rb->eof())
+    return(false);
+  if (x != '@')
+    fprintf(stderr, "fastqFile::getSequence(part)-- ERROR2: In %s, expected '@' at beginning of defline, got '%c' instead.\n",
+            _filename, x), exit(1);
+
+  // Skip the defline.
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+
+  // Skip whitespace between the defline and the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // Skip sequence up until bgn. The '+' that starts the quality header ends
+  // the sequence, exactly as in the full-sequence reader.
+  while ((_rb->eof() == false) && (x != '+') && (pos < bgn)) {
+    if (whitespaceSymbol[x] == false)
+      pos++;
+
+    x = _rb->read();
+  }
+
+  // The sequence ended before bgn. Nothing can be copied, and pos - bgn
+  // would wrap around and index far outside s.
+  if (pos < bgn) {
+    s[0] = 0;
+    return(false);
+  }
+
+  // Copy sequence, again stopping at the quality header so quality values
+  // are never returned as bases.
+  while ((_rb->eof() == false) && (x != '+') && (pos < end)) {
+    if (whitespaceSymbol[x] == false)
+      s[pos++ - bgn] = x;
+
+    x = _rb->read();
+  }
+  s[pos - bgn] = 0;
+
+  // Fail if we didn't copy enough stuff.
+  return(pos == end);
+}
+
+
+
+
+// Put the object into its "no file attached" state. Pointers are only
+// reset, never freed, so this is meant for freshly constructed objects.
+void
+fastqFile::clear(void) {
+
+  // Names: zero both buffers, then label the type.
+  memset(_filename, 0, FILENAME_MAX);
+  memset(_typename, 0, FILENAME_MAX);
+  strcpy(_typename, "Fastq");
+
+  // Index state.
+  memset(&_header, 0, sizeof(fastqFileHeader));
+  _index             = 0L;
+  _names             = 0L;
+  _numberOfSequences = 0;
+
+  // Reader state.
+  _rb     = 0L;
+  _nextID = 0;
+}
+
+
+
+// Try to load a previously written index from 'indexname'.
+//
+// On success _header, _index and _names are filled in. On any failure
+// (no index file, unreadable file, wrong magic number, size or time stamps
+// that no longer match the sequence file, truncated index) _index and
+// _names are left NULL, which is how the caller detects that the index must
+// be rebuilt.
+//
+void
+fastqFile::loadIndex(char *indexname) {
+  struct stat fastqstat;
+
+  if (fileExists(indexname) == false)
+    return;
+
+  errno = 0;
+  if (stat(_filename, &fastqstat)) {
+    fprintf(stderr, "fastqFile::constructIndex()-- stat of file '%s' failed: %s\n",
+            _filename, strerror(errno));
+    return;
+  }
+
+  // Test the returned stream, not errno; errno is only meaningful after a
+  // call has reported failure.
+  errno = 0;
+  FILE *I = fopen(indexname, "r");
+  if (I == 0L) {
+    fprintf(stderr, "fastqFile::constructIndex()-- open of file '%s' failed: %s\n",
+            indexname, strerror(errno));
+    return;
+  }
+
+  if (fread(&_header, sizeof(fastqFileHeader), 1, I) != 1) {
+    fprintf(stderr, "fastqFile::loadIndex()-- short read of header from '%s'.\n", indexname);
+    fclose(I);
+    return;
+  }
+
+  // Either word being wrong makes this not one of our index files.
+  if ((_header._magic[0] != FASTQ_MAGICNUMBER1) ||
+      (_header._magic[1] != FASTQ_MAGICNUMBER2)) {
+    fprintf(stderr, "fastqFile::constructIndex()-- magic mismatch.\n");
+    fclose(I);
+    return;
+  }
+
+  // The index is stale if the sequence file changed since it was written.
+  if ((_header._fastqFileSize         != (uint64)fastqstat.st_size) ||
+      (_header._fastqModificationTime != (uint64)fastqstat.st_mtime) ||
+      (_header._fastqCreationTime     != (uint64)fastqstat.st_ctime)) {
+    fprintf(stderr, "fastqFile::constructIndex()-- stat mismatch.\n");
+    fclose(I);
+    return;
+  }
+
+  _index = new fastqFileIndex [_header._numberOfSequences];
+  _names = new char           [_header._namesLength];
+
+  size_t indexRead = fread(_index, sizeof(fastqFileIndex), _header._numberOfSequences, I);
+  size_t namesRead = fread(_names, sizeof(char),           _header._namesLength,       I);
+
+  fclose(I);
+
+  // A truncated index must not be used; drop it so the caller rebuilds.
+  if ((indexRead != _header._numberOfSequences) ||
+      (namesRead != _header._namesLength)) {
+    fprintf(stderr, "fastqFile::loadIndex()-- index '%s' is truncated.\n", indexname);
+    delete [] _index;  _index = 0L;
+    delete [] _names;  _names = 0L;
+    return;
+  }
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::constructIndex()-- '%s' LOADED\n", _filename);
+#endif
+}
+
+
+// Make sure _header, _index and _names describe _filename.
+//
+// Does nothing when an index is already in memory. Otherwise it first tries
+// to load the index file that sits next to the sequence file (name + "idx"
+// when the name ends in ".fastq", name + ".fastqidx" otherwise). If that
+// fails it scans the whole sequence file, recording for every record the
+// position of its '@', the number of bases, and the first word of its
+// defline, and finally tries to save the result.
+//
+// Saving is best effort: when the index file cannot be created the function
+// returns silently, and a file that could not be written completely is
+// removed again. Exits the program on malformed input, on a sequence that
+// is too long, or when the sequence file cannot be stat()ed.
+//
+void
+fastqFile::constructIndex(void) {
+
+  if (_index)
+    return;
+
+  // If the filename ends in '.fastq' then append a 'idx',
+  // otherwise, append '.fastqidx'.
+
+  char indexname[FILENAME_MAX];
+
+  strcpy(indexname, _filename);
+  uint32 l = strlen(_filename);
+  if ((l > 5) && (strcmp(_filename + l - 6, ".fastq") == 0))
+    strcat(indexname, "idx");
+  else
+    strcat(indexname, ".fastqidx");
+
+  // If the index exists, suck it in and return.
+
+  loadIndex(indexname);
+
+  if (_index)
+    return;
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqFile::constructIndex()-- '%s' BUILDING\n", _filename);
+#endif
+
+  // Allocate some space for the index structures.
+
+  uint32 indexMax = 64 * 1024 * 1024 / sizeof(fastqFileIndex);
+  uint32 indexLen = 0;
+
+  _index = new fastqFileIndex [indexMax];
+
+  uint32 namesMax = 32 * 1024 * 1024;
+  uint32 namesLen = 0;
+
+  _names = new char [namesMax];
+
+  // Some local storage
+
+  uint64 seqStart;
+  uint32 seqLen;
+  uint32 seqLenMax = ~uint32ZERO;
+  uint32 namePos;
+
+  readBuffer ib(_filename);
+  char       x = ib.read();
+
+#ifdef DEBUGINDEX
+  fprintf(stderr, "readBuffer '%s' eof=%d x=%c %d\n", _filename, ib.eof(), x, x);
+#endif
+
+  // Build it.
+
+  // Skip whitespace at the start of the sequence.
+  while ((ib.eof() == false) && (whitespaceSymbol[x] == true)) {
+#ifdef DEBUGINDEX
+    fprintf(stderr, "skip '%c' %d\n", x, x);
+#endif
+    x = ib.read();
+  }
+
+  while (ib.eof() == false) {
+#ifdef DEBUGINDEX
+    fprintf(stderr, "index\n");
+#endif
+
+    // We should be at a '@' character now. Fail if not.
+    if (x != '@')
+      fprintf(stderr, "fastqFile::constructIndex()-- ERROR3: In %s, expected '@' at beginning of defline, got '%c' instead.\n",
+              _filename, x), exit(1);
+
+    // Save info - ib's position is correctly at the first letter in
+    // the defline (which might be whitespace), but the reader
+    // expects our position to be at the '@' -- hence the -1.
+    seqStart = ib.tell() - 1;
+    seqLen   = 0;
+    namePos  = namesLen;
+
+    // Read that first letter
+    x = ib.read();
+
+    // Copy the name to the names
+    while ((ib.eof() == false) && (whitespaceSymbol[x] == false)) {
+      if (namesLen + 1 >= namesMax) {
+        namesMax += 32 * 1024 * 1024;
+        char *nt = new char [namesMax];
+        memcpy(nt, _names, namesLen);
+        delete [] _names;
+        _names = nt;
+      }
+
+      _names[namesLen++] = x;
+#ifdef DEBUGINDEX
+      fprintf(stderr, "name += %c\n", x);
+#endif
+      x = ib.read();
+    }
+
+    // Make room for, and store, the name's terminating NUL.
+    if (namesLen + 1 >= namesMax) {
+      namesMax += 32 * 1024 * 1024;
+      char *nt = new char [namesMax];
+      memcpy(nt, _names, namesLen);
+      delete [] _names;
+      _names = nt;
+    }
+    _names[namesLen++] = 0;
+
+    // Skip the rest of the defline
+    while ((ib.eof() == false) && (x != '\r') && (x != '\n')) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "skip let %c\n", x);
+#endif
+      x = ib.read();
+    }
+
+    // Skip whitespace between the defline and the sequence.
+    while ((ib.eof() == false) && (whitespaceSymbol[x] == true)) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "skip num %d\n", x);
+#endif
+      x = ib.read();
+    }
+
+#ifdef DEBUGINDEX
+    fprintf(stderr, "x=%c peek=%c\n", x, ib.peek());
+#endif
+
+    // Count sequence length
+    while ((ib.eof() == false) && (x != '+')) {
+#ifdef DEBUGINDEX
+      fprintf(stderr, "seqlen %s %c\n", (whitespaceSymbol[x] == false) ? "save" : "skip", x);
+#endif
+      if (whitespaceSymbol[x] == false)
+        seqLen++;
+      if (seqLen >= seqLenMax)
+        fprintf(stderr, "fastqFile::constructIndex()-- ERROR: In %s, sequence '%s' is too long. Maximum length is %u bases.\n",
+                _filename, _names + namePos, seqLenMax), exit(1);
+      x = ib.read();
+    }
+
+    // Save to the index.
+
+    if (indexLen >= indexMax) {
+      fprintf(stderr, "REALLOC len="uint32FMT" from "uint32FMT" to "uint32FMT"\n", indexLen, indexMax, indexMax * 2);
+      indexMax *= 2;
+      fastqFileIndex *et = new fastqFileIndex[indexMax];
+      memcpy(et, _index, sizeof(fastqFileIndex) * indexLen);
+      delete [] _index;
+      _index = et;
+    }
+
+    _index[indexLen]._seqPosition = seqStart;
+    _index[indexLen]._seqLength   = seqLen;
+
+    indexLen++;
+
+    // Skip the rest of the QV def line, then the entire QV line, then load the '@' for the next sequence.
+
+    assert((ib.eof() == true) || (x == '+'));
+
+    while ((ib.eof() == false) && (x != '\r') && (x != '\n'))
+      x = ib.read();
+    x = ib.read();
+    while ((ib.eof() == false) && (x != '\r') && (x != '\n'))
+      x = ib.read();
+    while ((ib.eof() == false) && (x != '@'))
+      x = ib.read();
+  }
+
+  // Fill out the index meta data
+
+  struct stat fastqstat;
+  errno = 0;
+  if (stat(_filename, &fastqstat))
+    fprintf(stderr, "fastqFile::constructIndex()-- stat() of file '%s' failed: %s\n",
+            _filename, strerror(errno)), exit(1);
+
+  _header._magic[0]              = FASTQ_MAGICNUMBER1;
+  _header._magic[1]              = FASTQ_MAGICNUMBER2;
+  _header._numberOfSequences     = indexLen;
+  _header._namesLength           = namesLen;
+  _header._fastqFileSize         = fastqstat.st_size;
+  _header._fastqModificationTime = fastqstat.st_mtime;
+  _header._fastqCreationTime     = fastqstat.st_ctime;
+
+  // Dump the index, if possible.
+  //
+  // Test the returned stream rather than errno: errno is only meaningful
+  // after a failure, and relying on it could skip the write while leaving
+  // an open stream behind.
+
+  FILE *I = fopen(indexname, "w");
+  if (I == 0L)
+    return;
+
+  bool written = true;
+
+  if (fwrite(&_header, sizeof(fastqFileHeader), 1, I) != 1)
+    written = false;
+  if (fwrite( _index, sizeof(fastqFileIndex), _header._numberOfSequences, I) != _header._numberOfSequences)
+    written = false;
+  if (fwrite( _names, sizeof(char), _header._namesLength, I) != _header._namesLength)
+    written = false;
+
+  if (fclose(I) != 0)
+    written = false;
+
+  // A partial index would be read back as garbage; remove it. The index
+  // in memory is complete and stays in use either way.
+  if (written == false)
+    remove(indexname);
+}
diff --git a/libseq/fastqFile.H b/libseq/fastqFile.H
new file mode 100644
index 0000000..10da2ac
--- /dev/null
+++ b/libseq/fastqFile.H
@@ -0,0 +1,66 @@
+#ifndef FASTQFILE_H
+#define FASTQFILE_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+// Header record of a FASTQ index file. fastqFile reads and writes this
+// struct verbatim with fread()/fwrite(), so member order and sizes define
+// the file layout. The size and time fields are compared against stat() of
+// the sequence file to detect a stale index.
+struct fastqFileHeader {
+ uint64 _magic[2]; // FASTQ_MAGICNUMBER1 / FASTQ_MAGICNUMBER2
+ uint32 _numberOfSequences; // Number of sequences in the file
+ uint32 _namesLength; // Bytes in the names
+ uint64 _fastqFileSize; // st_size - size of file in bytes
+ uint64 _fastqModificationTime; // st_mtime - time of last data modification
+ uint64 _fastqCreationTime; // st_ctime - time of last file status change (not creation, despite the name)
+};
+
+
+// One index record per sequence. The array of these records is stored
+// verbatim in the index file directly after the header.
+// NOTE(review): sizeof() presumably includes tail padding after _seqLength,
+// which would then be part of the on-disk format -- confirm before changing.
+struct fastqFileIndex {
+ uint64 _seqPosition; // Position of the sequence in the file (offset of the record's '@')
+ uint32 _seqLength; // Length of the sequence (no whitespace counted)
+};
+
+
+// seqFile implementation for FASTQ files on disk. A per-sequence index
+// (file position, length, name) is loaded from an index file or built by
+// scanning the sequence file, and sequences are then fetched by seeking.
+// Quality values are skipped, never returned.
+//
+// Constructors and openFile() are protected; seqFactory is a friend.
+class fastqFile : public seqFile {
+protected:
+ fastqFile(const char *filename);
+ fastqFile();
+
+public:
+ ~fastqFile();
+
+protected:
+ // Returns a new fastqFile if filename looks like FASTQ, else NULL.
+ seqFile *openFile(const char *filename);
+
+public:
+ // Index of the sequence with this name, or all ones when not found.
+ uint32 find(const char *sequencename);
+
+ uint32 getSequenceLength(uint32 iid);
+ // Whole-sequence form: h/s are caller-owned buffers that may be reallocated.
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+ // Sub-range form: copies bases [bgn, end) into the caller-supplied s.
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+ void loadIndex(char *indexname);
+ void constructIndex(void);
+
+ readBuffer *_rb;
+
+ fastqFileHeader _header; // Index meta data; also the on-disk index header
+ fastqFileIndex *_index; // One entry per sequence
+ char *_names; // Sequence names, each NUL terminated, back to back
+
+ uint32 _nextID; // Next sequence in the read buffer
+
+ // Not referenced anywhere in fastqFile.C.
+ uint32 _gs_iid;
+ uint32 _gs_pos;
+
+ friend class seqFactory;
+};
+
+
+#endif // FASTQFILE_H
diff --git a/libseq/fastqStdin.C b/libseq/fastqStdin.C
new file mode 100644
index 0000000..0ef37f1
--- /dev/null
+++ b/libseq/fastqStdin.C
@@ -0,0 +1,276 @@
+#include "fastqStdin.H"
+#include "alphabet.h"
+
+// Build a streaming FASTQ reader.
+//
+//   filename == NULL -- read standard input; the source is named "(stdin)".
+//   otherwise        -- 'filename' is a shell command line (openFile() passes
+//                       a decompressor invocation). It is started with
+//                       popen() and its standard output is read.
+//
+// Exits with an error message if the command cannot be started.
+//
+fastqStdin::fastqStdin(const char *filename) {
+  clear();
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqStdin::fastqStdin()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  if (filename == 0L) {
+    strcpy(_filename, "(stdin)");
+    _rb = new readBuffer("-");
+    return;
+  }
+
+  // popen() returns NULL when the pipe or the child process cannot be
+  // created; a NULL stream must never be handed on to readBuffer.
+  errno = 0;
+  _pipe = popen(filename, "r");
+  if (_pipe == 0L)
+    fprintf(stderr, "fastqStdin::fastqStdin()-- ERROR: failed to run '%s': %s\n",
+            filename, strerror(errno)), exit(1);
+
+  _rb = new readBuffer(_pipe);
+}
+
+
+// Default constructor: every member is reset by clear(); no input is attached.
+fastqStdin::fastqStdin()
+{
+  clear();
+}
+
+
+
+// Release the cached defline and sequence buffers and the read buffer.
+// NOTE(review): _pipe, when opened with popen(), is not pclose()d here --
+// confirm whether readBuffer takes ownership of the stream.
+fastqStdin::~fastqStdin()
+{
+  delete [] _sequence;
+  delete [] _header;
+  delete    _rb;
+}
+
+
+
+// Decide whether this streaming reader should handle 'filename'.
+//
+// Returns a new fastqStdin when
+//   - filename is NULL and stdin is not a terminal, or filename is "-"
+//     (standard input is read directly), or
+//   - filename ends in ".gz", ".bz2" or ".xz" (the file is read through
+//     "gzip -dc", "bzip2 -dc" or "xz -dc").
+// Returns NULL in every other case.
+//
+seqFile *
+fastqStdin::openFile(const char *filename) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqStdin::openFile()-- '%s'\n", (filename) ? filename : "NULLPOINTER");
+#endif
+
+  if (((filename == 0L) && (isatty(fileno(stdin)) == 0)) ||
+      ((filename != 0L) && (filename[0] == '-') && (filename[1] == 0)))
+    return(new fastqStdin(0L));
+
+  if (filename == 0L)
+    return(0L);
+
+  uint32       fl   = strlen(filename);
+  const char  *tool = 0L;
+
+  // Only look at a suffix the name is long enough to contain. Without the
+  // length tests, fl-3 and fl-4 wrap around for short names and index far
+  // outside the string.
+  if      ((fl >= 3) && (strcmp(filename + fl - 3, ".gz") == 0))
+    tool = "gzip -dc";
+  else if ((fl >= 4) && (strcmp(filename + fl - 4, ".bz2") == 0))
+    tool = "bzip2 -dc";
+  else if ((fl >= 3) && (strcmp(filename + fl - 3, ".xz") == 0))
+    tool = "xz -dc";
+
+  if (tool == 0L)
+    return(0L);
+
+  // NOTE(review): the name is pasted into a shell command unquoted, so names
+  // containing spaces or shell metacharacters will not work as intended.
+  char cmd[32 + fl];
+  sprintf(cmd, "%s %s", tool, filename);
+
+  return(new fastqStdin(cmd));
+}
+
+
+
+// Number of sequences known so far: those already loaded, plus one more
+// when peek() on the read buffer returns something other than zero.
+uint32
+fastqStdin::getNumberOfSequences(void) {
+  bool pending = (_rb->peek() != 0);
+
+  return((pending) ? (_nextIID + 1) : (_nextIID));
+}
+
+
+// Lookup by name needs random access, which a stream does not offer.
+// Reports the misuse, asserts, and returns the all-ones "not found" value.
+uint32
+fastqStdin::find(const char *sequencename)
+{
+  fprintf(stderr, "fastqStdin::find()-- ERROR! Used for random access.\n");
+  assert(0);
+
+  return(~uint32ZERO);
+}
+
+
+
+// Length of sequence 'iid'. Only the next unread sequence (it is loaded
+// first) or the most recently loaded one may be asked for; anything else is
+// reported and asserted on. Returns 0 when no further sequence can be loaded.
+uint32
+fastqStdin::getSequenceLength(uint32 iid) {
+  bool needLoad = (iid == _nextIID);
+
+  if ((needLoad) &&
+      (loadNextSequence(_header, _headerLen, _headerMax, _sequence, _sequenceLen, _sequenceMax) == false))
+    return(0);
+
+  // After a load, _nextIID is one past the sequence currently cached.
+  if (_nextIID != iid + 1) {
+    fprintf(stderr, "fastqStdin::getSequence()-- ERROR! Used for random access. Requested iid=%u, at iid=%u\n",
+            iid, _nextIID);
+    assert(0);
+  }
+
+  return(strlen(_sequence));
+}
+
+
+
+// Copy sequence 'iid' (defline into h, bases into s) to the caller.
+//
+// Only the next unread sequence (it is loaded first) or the most recently
+// loaded one may be requested; anything else is reported and asserted on.
+//
+//   h, hLen, hMax -- caller's defline buffer, its length and capacity.
+//   s, sLen, sMax -- caller's sequence buffer, its length and capacity.
+//
+// A buffer is reallocated when it is missing or when its capacity is
+// smaller than the cached copy's capacity. Returns false when no further
+// sequence can be loaded, true otherwise.
+//
+bool
+fastqStdin::getSequence(uint32 iid,
+                        char *&h, uint32 &hLen, uint32 &hMax,
+                        char *&s, uint32 &sLen, uint32 &sMax) {
+
+#ifdef DEBUG
+  fprintf(stderr, "fastqStdin::getSequence(full)-- "uint32FMT"\n", iid);
+#endif
+
+  if (iid == _nextIID)
+    if (loadNextSequence(_header, _headerLen, _headerMax, _sequence, _sequenceLen, _sequenceMax) == false)
+      return(false);
+
+  if (iid + 1 != _nextIID) {
+    fprintf(stderr, "fastqStdin::getSequence(full)-- ERROR! Used for random access. Requested iid=%u, at iid=%u\n",
+            iid, _nextIID);
+    assert(0);
+  }
+
+  // Compare capacities, not lengths. Testing hLen/sLen made the buffers
+  // look too small on every call, because a length is always below the
+  // capacity, so both were freed and reallocated for each sequence.
+  if ((h == 0L) || (hMax < _headerMax)) {
+    delete [] h;
+    hMax = _headerMax;
+    h    = new char [hMax];
+  }
+
+  if ((s == 0L) || (sMax < _sequenceMax)) {
+    delete [] s;
+    sMax = _sequenceMax;
+    s    = new char [sMax];
+  }
+
+  // The +1 carries the terminating NUL along.
+  memcpy(h, _header, _headerLen + 1);
+  hLen = _headerLen;
+
+  memcpy(s, _sequence, _sequenceLen + 1);
+  sLen = _sequenceLen;
+
+  return(true);
+}
+
+
+
+// Sub-range retrieval needs random access, which a stream does not offer.
+// Reports the misuse, asserts, and returns false.
+bool
+fastqStdin::getSequence(uint32 iid,
+                        uint32 bgn, uint32 end, char *s)
+{
+#ifdef DEBUG
+  fprintf(stderr, "fastqStdin::getSequence(part)-- "uint32FMT"\n", iid);
+#endif
+
+  fprintf(stderr, "fastqStdin::getSequence(part)-- ERROR! Used for random access.\n");
+  assert(0);
+
+  return(false);
+}
+
+
+
+// Put every member into its "nothing attached" state. Pointers are only
+// reset, never freed, so this is meant for freshly constructed objects.
+void
+fastqStdin::clear(void) {
+  memset(_filename, 0, FILENAME_MAX);
+  memset(_typename, 0, FILENAME_MAX);
+
+  _randomAccessSupported = false;
+
+  strcpy(_typename, "FastQstream");
+
+  // The number of sequences in a stream is not known in advance.
+  _numberOfSequences = ~uint32ZERO;
+
+  _rb      = 0L;
+  _nextIID = 0;
+  _pipe    = 0L;
+
+  _header    = 0L;
+  _headerLen = 0;
+  _headerMax = 0;
+
+  _sequence    = 0L;
+  _sequenceLen = 0;
+  _sequenceMax = 0;
+
+  // The class also declares a quality buffer. Reset it with the rest so
+  // that no member is ever left holding an indeterminate value.
+  _quality    = 0L;
+  _qualityLen = 0;
+  _qualityMax = 0;
+}
+
+
+
+// Read the next FASTQ record from _rb into h (the defline, without the '@')
+// and s (the sequence, whitespace removed). Both buffers are allocated on
+// first use, grown as needed, and NUL terminated; hLen and sLen receive the
+// lengths. The '+' line and the quality line are read past and discarded.
+//
+// Returns false when the input ends before another record starts. Otherwise
+// increments _nextIID and returns true. Exits the program when the first
+// non-whitespace character is not '@'.
+//
+bool
+fastqStdin::loadNextSequence(char *&h, uint32 &hLen, uint32 &hMax,
+                             char *&s, uint32 &sLen, uint32 &sMax) {
+
+  if (hMax == 0) {
+    hMax = 2048;
+    h    = new char [hMax];
+  }
+
+  if (sMax == 0) {
+    sMax = 2048;
+    s    = new char [sMax];
+  }
+
+  hLen = 0;
+  sLen = 0;
+
+  // x always holds the most recently read character.
+  char x = _rb->read();
+
+  // Skip whitespace at the start of the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // We should be at a '@' character now. Fail if not.
+  if (_rb->eof() == true)
+    return(false);
+  if (x != '@')
+    fprintf(stderr, "fastqStdin::loadNextSequence(part)-- ERROR: In %s, expected '@' at beginning of defline, got '%c' instead.\n",
+            _filename, x), exit(1);
+
+  // Skip the '@' in the defline
+  x = _rb->read();
+
+  // Skip whitespace between the '@' and the defline
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+
+  // Copy the defline, until the first newline. The buffer is grown before
+  // it is full so the terminating NUL always fits.
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n')) {
+    h[hLen++] = x;
+    if (hLen >= hMax) {
+      hMax += 2048;
+      char *H = new char [hMax];
+      memcpy(H, h, hLen);
+      delete [] h;
+      h = H;
+    }
+    x = _rb->read();
+  }
+  h[hLen] = 0;
+
+  // Skip whitespace between the defline and the sequence.
+  while ((_rb->eof() == false) && (whitespaceSymbol[x] == true))
+    x = _rb->read();
+
+  // Copy the sequence, until EOF or the start of the QV bases.
+  //
+  // The test must be on x, the character in hand. Testing peek() stops one
+  // character early, leaving x on the newline before the '+'. The
+  // assertion below then fails, and with assertions disabled the quality
+  // line is never skipped and the next call exits with "expected '@'".
+  while ((_rb->eof() == false) && (x != '+')) {
+    if (whitespaceSymbol[x] == false) {
+      s[sLen++] = x;
+      if (sLen >= sMax) {
+        sMax *= 2;
+        char *S = new char [sMax];
+        memcpy(S, s, sLen);
+        delete [] s;
+        s = S;
+      }
+    }
+    x = _rb->read();
+  }
+  s[sLen] = 0;
+
+  // Skip the rest of the QV id line and then the entire QV line.
+
+  assert((_rb->eof() == true) || (x == '+'));
+
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+  x = _rb->read();
+  while ((_rb->eof() == false) && (x != '\r') && (x != '\n'))
+    x = _rb->read();
+
+  _nextIID++;
+
+  return(true);
+}
diff --git a/libseq/fastqStdin.H b/libseq/fastqStdin.H
new file mode 100644
index 0000000..26ff224
--- /dev/null
+++ b/libseq/fastqStdin.H
@@ -0,0 +1,60 @@
+#ifndef FASTQSTDIN_H
+#define FASTQSTDIN_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+
+// seqFile implementation that streams FASTQ records from standard input or
+// from the output of a decompression command. Sequences can only be read
+// in order; find() and the sub-range getSequence() report an error.
+//
+// Constructors and openFile() are protected; seqFactory is a friend.
+class fastqStdin : public seqFile {
+protected:
+ // filename == NULL reads stdin; otherwise filename is a command for popen().
+ fastqStdin(const char *filename);
+ fastqStdin();
+
+public:
+ ~fastqStdin();
+
+protected:
+ seqFile *openFile(const char *filename);
+
+public:
+ // Sequences loaded so far, plus one if more input appears to be pending.
+ uint32 getNumberOfSequences(void);
+
+public:
+ uint32 find(const char *sequencename);
+
+ uint32 getSequenceLength(uint32 iid);
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+ bool loadNextSequence(char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+
+ readBuffer *_rb;
+ uint32 _nextIID; // Incremented after each record loaded; the cached record is _nextIID - 1
+
+ FILE *_pipe; // popen() stream when reading through a command, else NULL
+
+ // Most recently loaded defline: buffer, length, capacity.
+ char *_header;
+ uint32 _headerLen;
+ uint32 _headerMax;
+
+ // Most recently loaded sequence: buffer, length, capacity.
+ char *_sequence;
+ uint32 _sequenceLen;
+ uint32 _sequenceMax;
+
+ // Quality buffer. The record loader in fastqStdin.C skips quality values
+ // instead of storing them, and nothing there reads these members.
+ char *_quality;
+ uint32 _qualityLen;
+ uint32 _qualityMax;
+
+ friend class seqFactory;
+};
+
+
+#endif // FASTQSTDIN_H
diff --git a/libseq/merStream.C b/libseq/merStream.C
new file mode 100644
index 0000000..a2a5763
--- /dev/null
+++ b/libseq/merStream.C
@@ -0,0 +1,84 @@
+#include "merStream.H"
+
+
+ // Bind a mer builder to a sequence stream.  The 'kbown' / 'ssown' flags
+ // say whether this object deletes the corresponding pointer on
+ // destruction.  The range starts out unbounded and no mer is available
+ // until nextMer() has been called.
+ merStream::merStream(kMerBuilder *kb, seqStream *ss, bool kbown, bool ssown) :
+   _kb(kb),
+   _ss(ss),
+   _kbdelete(kbown),
+   _ssdelete(ssown),
+   _invalid(true),
+   _beg(uint64ZERO),
+   _end(~uint64ZERO) {
+
+   // Discard whatever the builder was holding from a previous use.
+   _kb->clear();
+ }
+
+
+ // Release the builder and the stream, but only the ones we were told we own.
+ merStream::~merStream() {
+   if (_kbdelete == true) {
+     delete _kb;
+   }
+
+   if (_ssdelete == true) {
+     delete _ss;
+   }
+ }
+
+
+ // Restart iteration: rewind the underlying stream and forget the
+ // partially built mer.  The stream is invalid until the next nextMer().
+ void
+ merStream::rewind(void) {
+   _invalid = true;
+
+   _ss->rewind();
+   _kb->clear();
+ }
+
+
+ // Move the sequence stream back by the span of the current forward mer,
+ // clear the builder, and mark the stream invalid until the next nextMer().
+ // NOTE(review): the subtraction is not guarded against strPos() being
+ // smaller than the mer span -- confirm callers only use this after a
+ // successful nextMer().
+ void
+ merStream::rebuild(void) {
+ _ss->setPosition(_ss->strPos() - _kb->theFMer().getMerSpan());
+ _kb->clear();
+ _invalid = true;
+ }
+
+
+ // Restrict iteration to mers that begin in [beg, end), measured in
+ // letters of the input.  Requires beg < end.
+ void
+ merStream::setBaseRange(uint64 beg, uint64 end) {
+
+   assert(beg < end);
+
+   _beg = beg;
+   _end = end;
+
+   // Only the start is handed to the seqStream.  The span of a spaced seed
+   // could be computed, but not that of a compressed seed, so the end is
+   // enforced in nextMer() by comparing the start of each mer to _end.
+
+   _ss->setRange(beg, ~uint64ZERO);
+
+   _kb->clear();
+   _invalid = true;
+ }
+
+
+ // Estimate how many mers this stream will produce.
+ //
+ // If a base range has been set, the estimate is simply the number of
+ // letters in the range.  Otherwise it is the sum over all sequences of
+ // (length - k + 1), counting only sequences with at least k letters.
+ // A sequence of exactly k letters holds one mer and is counted.
+ uint64
+ merStream::approximateNumberOfMers(void) {
+
+   if (_end != ~uint64ZERO)
+     return(_end - _beg);
+
+   uint64 k      = _kb->merSize();
+   uint64 approx = uint64ZERO;
+   uint32 ns     = _ss->numberOfSequences();
+
+   for (uint32 s=0; s<ns; s++) {
+     uint32 l = _ss->lengthOf(s);
+
+     if (l >= k)
+       approx += l - k + 1;
+   }
+
+   return(approx);
+ }
diff --git a/libseq/merStream.H b/libseq/merStream.H
new file mode 100644
index 0000000..520aded
--- /dev/null
+++ b/libseq/merStream.H
@@ -0,0 +1,99 @@
+#ifndef MERSTREAM_H
+#define MERSTREAM_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+#include "seqStream.H"
+
+//
+// merStream needs exclusive use of a kMerBuilder and a seqStream.
+//
+// The kMerBuilder can be used over and over. I think snapper is the
+// only one that does this though.
+//
+// The seqStream can be used elsewhere, but ONLY for looking up
+// positions.
+//
+// The merStream does NOT assume ownership of either of these, unless
+// the own flags are set.
+//
+// The stream is not valid until nextMer is called; allowing loops of
+// while (MS->nextMer()) {
+// process(MS->theFMer());
+// }
+//
+ // setBaseRange() positions refer to ACGT letters in the input, NOT mers.
+// rewind() repositions the file to the start of the range.
+//
+
+ // Iterates over the mers of a seqStream, building each one with a
+ // kMerBuilder.  Accessors assert that nextMer() has succeeded since the
+ // last rewind/rebuild/setBaseRange.
+ class merStream {
+ public:
+ merStream(kMerBuilder *kb, seqStream *ss, bool kbown=false, bool ssown=false);
+ ~merStream();
+
+ // Pass-through accessors for the mers held by the builder.
+ kMer const & theFMer(void) { assert(_invalid == false); return(_kb->theFMer()); };
+ kMer const & theRMer(void) { assert(_invalid == false); return(_kb->theRMer()); };
+ kMer const & theCMer(void) { assert(_invalid == false); return(_kb->theCMer()); };
+
+ // Advance to the next mer.  Returns false when the stream is exhausted
+ // (get() returns 0) or when the new mer lies at or past _end.
+ bool nextMer(uint32 skip=0) {
+ char ch;
+
+ // Keep feeding letters while addBase() returns true.  Because of
+ // short-circuit evaluation, 'skip' is decremented only when addBase()
+ // returns false, so each unit of 'skip' costs one such letter.
+ // NOTE(review): presumably addBase() == true means "mer not complete
+ // yet", making 'skip' a count of complete mers to pass over -- confirm.
+ do {
+ ch = _ss->get();
+ if (ch == 0)
+ return(false);
+ } while ((_kb->addBase(ch) == true) || (skip-- > 0));
+
+ _kb->mask();
+ _invalid = false;
+
+ #if 0
+ char merstring[256];
+
+ fprintf(stderr, "merStream::nextMer()-- seqPos="uint64FMT" merPos="uint64FMT" span="uint32FMT" base0span="uint32FMT" end="uint64FMT" %s %s\n",
+ _ss->strPos(),
+ _ss->strPos() - theFMer().getMerSpan(),
+ theFMer().getMerSpan(),
+ _kb->baseSpan(0),
+ _end,
+ _kb->theFMer().merToString(merstring),
+ (_ss->strPos() - theFMer().getMerSpan() < _end) ? "" : "STOP");
+ #endif
+
+ // The mer is out of range if:
+ // o it begins at or past the _end
+ // o the span of the first base ends at or past the _end
+ //
+ // If the mer isn't spaced, the base span is always 1. If it is spaced, the span will be
+ // between 1 and ... who knows.
+
+ return(_ss->strPos() - theFMer().getMerSpan() + _kb->baseSpan(0) - 1 < _end);
+ };
+
+ void rewind(void);
+ void rebuild(void);
+ void setBaseRange(uint64 beg, uint64 end);
+
+ // Start of the current mer: stream position minus the mer span.
+ uint64 thePositionInSequence(void) { assert(_invalid == false); return(_ss->seqPos() - theFMer().getMerSpan()); };
+ uint64 thePositionInStream(void) { assert(_invalid == false); return(_ss->strPos() - theFMer().getMerSpan()); };
+ uint64 theSequenceNumber(void) { assert(_invalid == false); return(_ss->seqIID()); };
+
+ uint64 approximateNumberOfMers(void);
+
+ private:
+ kMerBuilder *_kb;
+ seqStream *_ss;
+
+ // True if the destructor should delete _kb / _ss.
+ bool _kbdelete;
+ bool _ssdelete;
+
+ // True until nextMer() has produced a mer.
+ bool _invalid;
+
+ // Base range; _end == ~uint64ZERO means no range has been set.
+ uint64 _beg;
+ uint64 _end;
+ };
+
+
+#endif // MERSTREAM_H
diff --git a/libseq/selftest.C b/libseq/selftest.C
new file mode 100644
index 0000000..bb38726
--- /dev/null
+++ b/libseq/selftest.C
@@ -0,0 +1,53 @@
+
+
+ {
+   // Consistency check of a seqFile: the length reported by
+   // getSequenceLength() must agree with what both getSequence() variants
+   // actually return.  Mismatches are reported on stdout.
+   seqFile *SF = openSeqFile(argv[1]);
+
+   fprintf(stdout, "source '%s' of type '%s' has "uint32FMT" sequences.\n",
+           SF->getSourceName(), SF->getFileTypeName(), SF->getNumberOfSequences());
+
+   fprintf(stdout, "getSequenceLength() vs getSequence(full)\n");
+   {
+     char *h = 0L;
+     char *s = 0L;
+     uint32 hLen=0, hMax=0;
+     uint32 sLen=0, sMax=0;
+
+     for (uint32 sid=0; sid<SF->getNumberOfSequences(); sid++) {
+
+       // Without this check a failed load would leave s NULL (or stale)
+       // and strlen() below would crash or report the previous sequence.
+       if (SF->getSequence(sid, h, hLen, hMax, s, sLen, sMax) == false) {
+         fprintf(stdout, "failed to load sid="uint32FMT"\n", sid);
+         continue;
+       }
+
+       // strlen() returns size_t; narrow it explicitly so the value matches
+       // the uint32FMT conversion used to print it.
+       uint32 strLen = (uint32)strlen(s);
+       uint32 seqLen = SF->getSequenceLength(sid);
+
+       if ((strLen != seqLen) ||
+           (strLen != sLen) ||
+           (seqLen != sLen)) {
+         fprintf(stdout, "length differ for sid="uint32FMT" h='%s' strlen(s)="uint32FMT" sLen="uint32FMT" getSequenceLength()="uint32FMT"\n",
+                 sid, h, strLen, sLen, seqLen);
+       }
+     }
+
+     delete [] h;
+     delete [] s;
+   }
+
+
+   fprintf(stdout, "getSequenceLength() vs getSequence(part)\n");
+   {
+     char *p = new char [128 * 1024 * 1024];
+
+     for (uint32 sid=0; sid<SF->getNumberOfSequences(); sid++) {
+       uint32 seqLen = SF->getSequenceLength(sid);
+
+       if (SF->getSequence(sid, 0, seqLen, p) == false) {
+         fprintf(stdout, "failed to load sid="uint32FMT"\n", sid);
+         continue;
+       }
+
+       uint32 strLen = (uint32)strlen(p);
+
+       if (strLen != seqLen) {
+         fprintf(stdout, "length differ for sid="uint32FMT" strlen(s)="uint32FMT" getSequenceLength()="uint32FMT"\n",
+                 sid, strLen, seqLen);
+       }
+     }
+
+     delete [] p;
+   }
+
+
+
+
+   return(0);
+ }
+
diff --git a/libseq/seqCache.C b/libseq/seqCache.C
new file mode 100644
index 0000000..1e9a69a
--- /dev/null
+++ b/libseq/seqCache.C
@@ -0,0 +1,197 @@
+#include "seqCache.H"
+#include "seqFactory.H"
+#include "alphabet.h"
+
+#undef DEBUG
+
+
+ // Open 'filename' through the seqFactory and set up a cache of
+ // 'cachesize' sequences (zero disables caching).
+ seqCache::seqCache(const char *filename, uint32 cachesize, bool verbose) {
+
+   // The cache starts out empty; setCacheSize() below does the allocation
+   // and expects these to be initialized.
+   _cache              = 0L;
+   _cacheMap           = 0L;
+   _cacheNext          = 0;
+   _cacheSize          = 0;
+
+   _reportLoading      = verbose;
+   _allSequencesLoaded = false;
+
+   _idToGetNext        = 0;
+   _fb                 = openSeqFile(filename);
+
+   setCacheSize(cachesize);
+ }
+
+
+
+ // Delete the cached sequences, then the bookkeeping arrays and the file.
+ seqCache::~seqCache() {
+
+   // flushCache() queries _fb, so it must run before _fb is deleted.
+   flushCache();
+
+   delete [] _cache;
+   delete [] _cacheMap;
+
+   delete _fb;
+ }
+
+
+
+ // Translate a name into an IID.
+ //
+ // A name made up entirely of decimal digits, whose value is below the
+ // number of sequences, is taken to be the IID itself.  Anything else is
+ // looked up with the file's find().  Returns ~uint32ZERO for a NULL name;
+ // otherwise returns whatever find() returns when there is no match.
+ uint32
+ seqCache::getSequenceIID(char *name) {
+   uint32 iid = ~uint32ZERO;
+
+   if (name == 0L)
+     return(iid);
+
+   // An empty name has no digits, so it is NOT an integer; previously it
+   // was treated as one and resolved to IID 0.
+   //
+   bool isInt = (name[0] != 0);
+
+   for (char *x = name; (isInt == true) && (*x != 0); x++)
+     if ((*x < '0') || ('9' < *x))
+       isInt = false;
+
+   if (isInt)
+     iid = strtouint32(name, 0L);
+
+   if (iid >= _fb->getNumberOfSequences())
+     iid = _fb->find(name);
+
+ #ifdef DEBUG
+   fprintf(stderr, "seqCache::getSequenceIID()-- '%s' -> "uint32FMT"\n", name, iid);
+ #endif
+
+   return(iid);
+ }
+
+
+
+ // Return a newly allocated seqInCore for sequence 'iid', or NULL if the
+ // sequence does not exist or cannot be loaded.  The caller deletes the
+ // returned object.
+ //
+ // If the sequence is (or becomes) cached, the returned object is a
+ // non-owning view of the cached header and sequence buffers; otherwise it
+ // owns its buffers.
+ seqInCore *
+ seqCache::getSequenceInCore(uint32 iid) {
+   uint32 cacheID = ~uint32ZERO;
+   seqInCore *retSeq = 0L;
+
+   if ((_fb->randomAccessSupported() == true) &&
+       (iid >= _fb->getNumberOfSequences()))
+     return(0L);
+
+   if (_allSequencesLoaded == true) {
+     // _cache has exactly _cacheSize entries, indexed by IID.  Sources
+     // without random access skip the range check above, and a flushed
+     // cache has NULL entries, so check both before using the slot.
+     if ((iid >= _cacheSize) || (_cache[iid] == 0L))
+       return(0L);
+
+     cacheID = iid;
+
+   } else if ((_cacheSize > 0) && (_cacheMap != 0L) && (_cacheMap[iid] != ~uint32ZERO)) {
+     cacheID = _cacheMap[iid];
+
+   } else {
+     uint32 hLen=0, hMax=0, sLen=0, sMax=0;
+     char *h=0L, *s=0L;
+
+     if (_fb->getSequence(iid, h, hLen, hMax, s, sLen, sMax) == false) {
+       // The loader may have allocated buffers before failing.
+       delete [] h;
+       delete [] s;
+       return(0L);
+     }
+
+     retSeq = new seqInCore(iid, h, hLen, s, sLen, true);
+
+     // Remove any old cached sequence, then store the one we just made
+
+     if (_cache) {
+       if (_cache[_cacheNext]) {
+         _cacheMap[_cache[_cacheNext]->getIID()] = ~uint32ZERO;
+         delete _cache[_cacheNext];
+       }
+
+       _cache[_cacheNext] = retSeq;
+       _cacheMap[iid] = _cacheNext;
+
+       cacheID = _cacheNext;
+       retSeq = 0L;
+
+       _cacheNext = (_cacheNext + 1) % _cacheSize;
+     }
+   }
+
+   // If no retSeq set, make a copy of the one we have in the cache.
+
+   if ((retSeq == 0L) && (cacheID != ~uint32ZERO))
+     retSeq = new seqInCore(iid,
+                            _cache[cacheID]->header(), _cache[cacheID]->headerLength(),
+                            _cache[cacheID]->sequence(), _cache[cacheID]->sequenceLength(),
+                            false);
+
+   return(retSeq);
+ }
+
+
+
+ // Discard the current cache and replace it with an empty one holding up to
+ // 'cachesize' sequences.  A size of zero disables caching.
+ void
+ seqCache::setCacheSize(uint32 cachesize) {
+   uint32 ns = _fb->getNumberOfSequences();
+
+   flushCache();
+
+   // flushCache() deletes the cached sequences but not the arrays that
+   // hold them; release those too so resizing does not leak.
+   delete [] _cacheMap;
+   delete [] _cache;
+
+   _cacheMap = 0L;
+   _cacheSize = 0;
+   _cacheNext = 0;
+   _cache = 0L;
+
+   // The new cache is empty, whatever loadAllSequences() did earlier.
+   _allSequencesLoaded = false;
+
+   if (cachesize == 0)
+     return;
+
+   _cacheMap = new uint32 [ns];
+   _cacheSize = cachesize;
+   _cache = new seqInCore * [_cacheSize];
+
+   for (uint32 i=0; i<ns; i++)
+     _cacheMap[i] = ~uint32ZERO;
+
+   for (uint32 i=0; i<_cacheSize; i++)
+     _cache[i] = 0L;
+ }
+
+
+
+ // Replace the cache with one holding every sequence in the file, indexed
+ // directly by IID.  Exits the program if any sequence fails to load.
+ void
+ seqCache::loadAllSequences(void) {
+
+   if (_allSequencesLoaded == true)
+     return;
+
+   flushCache();
+
+   delete [] _cacheMap;
+   delete [] _cache;
+
+   // No map is needed: cache slot == IID.
+   _cacheMap  = 0L;
+   _cacheSize = _fb->getNumberOfSequences();
+   _cacheNext = 0;
+   _cache     = new seqInCore * [_cacheSize];
+
+   for (uint32 iid=0; iid<_cacheSize; iid++) {
+     char   *h    = 0L;
+     char   *s    = 0L;
+     uint32  hLen = 0, hMax = 0;
+     uint32  sLen = 0, sMax = 0;
+
+     bool loaded = _fb->getSequence(iid, h, hLen, hMax, s, sLen, sMax);
+
+     if (loaded == false) {
+       fprintf(stderr, "seqCache::loadAllSequences()-- Failed to load iid "uint32FMT".\n",
+               iid);
+       exit(1);
+     }
+
+     _cache[iid] = new seqInCore(iid, h, hLen, s, sLen, true);
+   }
+
+   _allSequencesLoaded = true;
+ }
+
+ // Delete every cached sequence and mark all map entries as "not cached".
+ // The arrays themselves are kept.  Does nothing if no file is open.
+ void
+ seqCache::flushCache(void) {
+
+   if (_fb == 0L)
+     return;
+
+   if (_cacheMap != 0L) {
+     uint32 mapLen = _fb->getNumberOfSequences();
+     uint32 m      = 0;
+
+     while (m < mapLen)
+       _cacheMap[m++] = ~uint32ZERO;
+   }
+
+   if (_cache != 0L) {
+     uint32 c = 0;
+
+     while (c < _cacheSize) {
+       delete _cache[c];
+       _cache[c] = 0L;
+       c++;
+     }
+   }
+ }
diff --git a/libseq/seqCache.H b/libseq/seqCache.H
new file mode 100644
index 0000000..f59f35a
--- /dev/null
+++ b/libseq/seqCache.H
@@ -0,0 +1,106 @@
+#ifndef SEQCACHE_H
+#define SEQCACHE_H
+
+#include "util++.H"
+#include "seqFile.H"
+
+
+ // One sequence held in memory: an IID, a header and the sequence letters.
+ //
+ // Only seqCache (a friend) can construct these directly; everyone else
+ // gets them from seqCache or from copy().
+ class seqInCore {
+ private:
+ // Takes the buffers as given, without copying.  If 'deletable' is true
+ // the destructor will delete [] both of them; if false they belong to
+ // someone else and are left alone.
+ seqInCore(uint32 iid, char *hdr, uint32 hdrlen, char *seq, uint32 seqlen, bool deletable) {
+ _idx = iid;
+
+ _deletable = deletable;
+
+ _headerLen = hdrlen;
+ _header = hdr;
+
+ _seqLen = seqlen;
+ _seq = seq;
+ };
+
+ friend class seqCache;
+
+ public:
+ ~seqInCore() {
+ if (_deletable) {
+ delete [] _header; _header = 0L;
+ delete [] _seq; _seq = 0L;
+ }
+ };
+
+ char *header(void) const { return(_header); };
+ uint32 headerLength(void) const { return(_headerLen); };
+
+ char *sequence(void) const { return(_seq); };
+ uint32 sequenceLength(void) const { return(_seqLen); };
+
+ uint32 getIID(void) const { return(_idx); };
+
+ // Used only by searchGENOME (as far as I know)
+ //
+ // Returns a deep copy that owns its buffers.  Length + 1 bytes are copied
+ // from each buffer, so both must be non-NULL and at least that long
+ // (i.e., carry a terminating byte).
+ seqInCore *copy(void) {
+ char *h = new char [_headerLen + 1];
+ char *s = new char [_seqLen + 1];
+
+ memcpy(h, _header, _headerLen + 1);
+ memcpy(s, _seq, _seqLen + 1);
+
+ return(new seqInCore(_idx, h, _headerLen, s, _seqLen, true));
+ };
+
+ private:
+ uint32 _idx; // IID of this sequence
+
+ bool _deletable; // true if the destructor frees _header and _seq
+
+ uint32 _headerLen;
+ char *_header;
+
+ uint32 _seqLen;
+ char *_seq;
+ };
+
+
+
+ // Access to the sequences of a seqFile, optionally keeping some or all of
+ // them in memory.  Most queries are forwarded directly to the seqFile.
+ class seqCache {
+ public:
+ // Opens 'filename' via openSeqFile(); cachesize == 0 means no cache.
+ seqCache(const char *filename, uint32 cachesize=0, bool verbose=false);
+ ~seqCache();
+
+ // Returns IID for a name, either the first word on the defline, or
+ // the ascii IID.
+ uint32 getSequenceIID(char *name);
+
+ // Each call returns a new seqInCore (or NULL); see seqCache.C for when
+ // the result is a view of cached data and when it owns its buffers.
+ seqInCore *getSequenceInCore(uint32 iid);
+ seqInCore *getSequenceInCore(char *name) { return(getSequenceInCore(getSequenceIID(name))); };
+ // Sequential access: returns the sequence at _idToGetNext and then
+ // increments _idToGetNext.
+ seqInCore *getSequenceInCore(void) { return(getSequenceInCore(_idToGetNext++)); };
+
+ const char *getSourceName(void) { return(_fb->getSourceName()); };
+ const char *getFileTypeName(void) { return(_fb->getFileTypeName()); };
+
+ bool randomAccessSupported(void) { return(_fb->randomAccessSupported()); };
+
+ uint32 getNumberOfSequences(void) { return(_fb->getNumberOfSequences()); };
+
+ uint32 getSequenceLength(uint32 iid) { return(_fb->getSequenceLength(iid)); };
+
+ void setCacheSize(uint32 cachesize);
+
+ void loadAllSequences(void);
+ void flushCache(void);
+
+ private:
+ seqFile *_fb; // the underlying file; deleted by the destructor
+ uint32 _idToGetNext; // next IID returned by getSequenceInCore(void)
+
+ bool _allSequencesLoaded; // set by loadAllSequences(); cache slot == IID
+ bool _reportLoading; // the constructor's 'verbose' flag
+
+ uint32 *_cacheMap; // Maps ID to cache entry
+ uint32 _cacheSize; // Size of cache
+ uint32 _cacheNext; // Next cache spot to use
+ seqInCore **_cache; // Cache of sequences
+ };
+
+
+#endif // SEQCACHE_H
diff --git a/libseq/seqFactory.C b/libseq/seqFactory.C
new file mode 100644
index 0000000..cea1515
--- /dev/null
+++ b/libseq/seqFactory.C
@@ -0,0 +1,60 @@
+#include "seqFactory.H"
+
+#include "fastaFile.H"
+#include "fastaStdin.H"
+#include "fastqFile.H"
+#include "fastqStdin.H"
+#include "seqStore.H"
+
+seqFactory *seqFactory::me = 0L;
+
+
+ // Create the table of known file types.  openFile() tries the types in the
+ // order they are registered here.
+ seqFactory::seqFactory() {
+   _filesMax = 16;
+   _filesNum = 0;
+   _files    = new seqFile * [_filesMax];
+
+   // FASTA, from a file or from stdin.
+   registerFile(new fastaFile);
+   registerFile(new fastaStdin);
+
+   // FASTQ, from a file or from stdin.
+   registerFile(new fastqFile);
+   registerFile(new fastqStdin);
+
+   // The binary seqStore format.
+   registerFile(new seqStore);
+
+   //registerFile(new sffFile);
+ }
+
+
+ // Delete every registered file type, then the table itself.
+ seqFactory::~seqFactory() {
+   uint32 f = 0;
+
+   while (f < _filesNum) {
+     delete _files[f];
+     f++;
+   }
+
+   delete [] _files;
+ }
+
+
+ // Append a file type to the table.  The table has a fixed capacity;
+ // overflowing it is a programming error and exits the program.
+ void
+ seqFactory::registerFile(seqFile *f) {
+
+   if (_filesNum < _filesMax) {
+     _files[_filesNum] = f;
+     _filesNum++;
+     return;
+   }
+
+   fprintf(stderr, "seqFactory::registerFile()-- Wow! You registered lots of files! Now fix %s at line %d.\n", __FILE__, __LINE__);
+   exit(1);
+ }
+
+
+ // Open 'name' with the first registered file type that accepts it and
+ // return the new seqFile.  If no type accepts the file, list the types
+ // that were tried and exit the program.
+ seqFile *
+ seqFactory::openFile(const char *name) {
+   seqFile *n = 0L;
+
+   for (uint32 i=0; i<_filesNum; i++) {
+     n = _files[i]->openFile(name);
+     if (n)
+       return(n);
+   }
+
+   // These messages used to name registerFile(), which sent anyone reading
+   // the log to the wrong function.
+
+   fprintf(stderr, "seqFactory::openFile()-- Cannot determine type of file '%s'. Tried:\n", name);
+
+   for (uint32 i=0; i<_filesNum; i++)
+     fprintf(stderr, "seqFactory::openFile()-- '%s'\n", _files[i]->getFileTypeName());
+
+   exit(1);
+   return(n);  // not reached; keeps compilers happy
+ }
diff --git a/libseq/seqFactory.H b/libseq/seqFactory.H
new file mode 100644
index 0000000..46ea385
--- /dev/null
+++ b/libseq/seqFactory.H
@@ -0,0 +1,33 @@
+#ifndef SEQFACTORY_H
+#define SEQFACTORY_H
+
+#include "util.h"
+#include "seqFile.H"
+
+ // Registry of the known sequence file types, used to open a file without
+ // knowing its type in advance.
+ //
+ // There is a single instance, created on the first call to instance().
+ // No locking is done around that creation, and nothing visible here ever
+ // deletes the instance.
+ class seqFactory {
+ protected:
+ seqFactory();
+ ~seqFactory();
+
+ public:
+ static seqFactory *instance(void) {
+ if (me == 0L)
+ me = new seqFactory;
+ return(me);
+ };
+
+ // Adds a file type; the factory deletes it in its destructor.
+ void registerFile(seqFile *f);
+ // Tries each registered type in turn; see seqFactory.C.
+ seqFile *openFile(const char *name);
+ private:
+ static seqFactory *me;
+
+ uint32 _filesNum; // number of registered types
+ uint32 _filesMax; // capacity of _files
+ seqFile **_files; // the registered types
+ };
+
+
+#define openSeqFile(S) seqFactory::instance()->openFile((S))
+
+
+#endif // SEQFACTORY_H
diff --git a/libseq/seqFile.H b/libseq/seqFile.H
new file mode 100644
index 0000000..3fa6291
--- /dev/null
+++ b/libseq/seqFile.H
@@ -0,0 +1,54 @@
+#ifndef SEQFILE_H
+#define SEQFILE_H
+
+#include "util.h"
+
+// General flow of the constructors is:
+// Clear all data
+// Open the file
+// Set _filename, _typename
+// Read/build the index structure
+// Position the file to the first read
+// Set _numberOfSequences (IMPORTANT, and subtle)
+
+ // Abstract interface to a source of sequences.
+ //
+ // The constructors here do nothing: none of the data members below are
+ // initialized by this class, so every subclass must set them itself.
+ class seqFile {
+ protected:
+ seqFile(const char *filename) {};
+ seqFile() {};
+
+ public:
+ virtual ~seqFile() {};
+
+ protected:
+ // Called by seqFactory on each registered type.  Returns a new seqFile
+ // for 'filename', or NULL if this type does not accept the file.
+ virtual seqFile *openFile(const char *filename) = 0;
+
+ public:
+ virtual const char *getSourceName(void) { return(_filename); };
+ virtual const char *getFileTypeName(void) { return(_typename); };
+
+ virtual bool randomAccessSupported(void) { return(_randomAccessSupported); };
+
+ virtual uint32 getNumberOfSequences(void) { return(_numberOfSequences); };
+
+ public:
+ // Returns the ID of the named sequence.
+ // NOTE(review): seqStore::find() returns ~uint32ZERO when there is no
+ // match; presumably all implementations do -- confirm.
+ virtual uint32 find(const char *sequencename) = 0;
+
+ virtual uint32 getSequenceLength(uint32 id) = 0;
+ // Fetch a whole sequence.  h and s are caller-owned buffers, passed by
+ // reference with their used length and allocated size.
+ virtual bool getSequence(uint32 id,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax) = 0;
+ // Fetch part of a sequence into a caller-supplied buffer.
+ virtual bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s) = 0;
+
+ protected:
+ char _filename[FILENAME_MAX];
+ char _typename[FILENAME_MAX];
+
+ bool _randomAccessSupported;
+
+ uint32 _numberOfSequences;
+
+ friend class seqFactory;
+ };
+
+#endif // SEQFILE_H
diff --git a/libseq/seqStore.C b/libseq/seqStore.C
new file mode 100644
index 0000000..e01205b
--- /dev/null
+++ b/libseq/seqStore.C
@@ -0,0 +1,622 @@
+
+#include "seqStore.H"
+#include "seqCache.H"
+#include "alphabet.h"
+
+// Says 'kmerSeqStoreFile'
+#define SEQSTORE_MAGICNUMBER1 0x5371655372656d6bULL
+#define SEQSTORE_MAGICNUMBER2 0x656c694665726f74ULL
+
+
+ // Open an existing seqStore.  Only the header is read here; the index,
+ // block and name tables are loaded on demand by loadIndex().
+ //
+ // Exits the program if the name is too long for _filename, or if the file
+ // cannot be opened or its header cannot be read.
+ seqStore::seqStore(const char *filename) {
+   clear();
+
+   // _filename is a fixed FILENAME_MAX buffer; refuse names that won't fit
+   // rather than overrunning it.
+   if (strlen(filename) >= FILENAME_MAX) {
+     fprintf(stderr, "seqStore::seqStore()-- Filename '%s' is too long.\n", filename);
+     exit(1);
+   }
+
+   strcpy(_filename, filename);
+
+   // Test the returned pointer, not errno: errno is only meaningful after a
+   // call has reported failure.
+   FILE *F = fopen(_filename, "r");
+   if (F == NULL) {
+     fprintf(stderr, "seqStore::seqStore()-- Failed to open '%s': %s\n",
+             _filename, strerror(errno));
+     exit(1);
+   }
+
+   if (fread(&_header, sizeof(seqStoreHeader), 1, F) != 1) {
+     fprintf(stderr, "seqStore::seqStore()-- Failed to read header from '%s'.\n",
+             _filename);
+     exit(1);
+   }
+
+   fclose(F);
+
+   // The sequence data starts immediately after the header.
+   _bpf = new bitPackedFile(_filename, sizeof(seqStoreHeader));
+
+   _numberOfSequences = _header._numberOfSequences;
+ }
+
+
+
+ // Create an empty seqStore with no file attached.  seqFactory registers
+ // one such instance and calls its openFile() to probe candidate files.
+ seqStore::seqStore() {
+ clear();
+ }
+
+
+
+ // Release everything this store may have allocated.  All of these are
+ // NULL in an unopened store, which makes the deletes harmless.
+ seqStore::~seqStore() {
+
+   // In-core tables, allocated by loadIndex().
+   delete [] _names;
+   delete [] _block;
+   delete [] _index;
+
+   // Bit-packed file handles.
+   delete _namesBPF;
+   delete _blockBPF;
+   delete _indexBPF;
+   delete _bpf;
+ }
+
+
+
+ // Factory probe: return a new seqStore if 'filename' is a regular file
+ // that begins with the two seqStore magic numbers, and NULL otherwise.
+ seqFile *
+ seqStore::openFile(const char *filename) {
+   uint64 magic1 = 0;
+   uint64 magic2 = 0;
+   struct stat st;
+
+   // Use the return values to detect failure; errno is only meaningful
+   // once a call has reported an error.
+
+   if (stat(filename, &st) != 0)
+     return(0L);
+
+   // S_ISREG() compares the whole file-type field.  Testing the S_IFREG
+   // bit alone also accepts other types whose code shares that bit.
+   if (!S_ISREG(st.st_mode))
+     return(0L);
+
+   // Check the magic. Fail if not correct.
+
+   FILE *F = fopen(filename, "r");
+   if (F == NULL)
+     return(0L);
+
+   size_t nRead = 0;
+   nRead += fread(&magic1, sizeof(uint64), 1, F);
+   nRead += fread(&magic2, sizeof(uint64), 1, F);
+
+   fclose(F);
+
+   // A file too short to hold both magic numbers is not a seqStore.
+   if (nRead != 2)
+     return(0L);
+
+   if ((magic1 != SEQSTORE_MAGICNUMBER1) || (magic2 != SEQSTORE_MAGICNUMBER2))
+     return(0L);
+
+   return(new seqStore(filename));
+ }
+
+
+
+// If this proves far too slow, rewrite the _names string to separate IDs with 0xff, then use
+// strstr on the whole thing. To find the ID, scan down the string counting the number of 0xff's.
+//
+// Similar code is used for fastaFile::find()
+//
+ // Return the IID of the sequence whose stored name is exactly
+ // 'sequencename', or ~uint32ZERO if there is none.
+ //
+ // _names holds the names back to back, each terminated by a NUL, in IID
+ // order; this is a linear scan over them.
+ uint32
+ seqStore::find(const char *sequencename) {
+
+   if (_names == NULL)
+     loadIndex();
+
+   const char *label = _names;
+
+   for (uint32 iid=0; iid < _header._numberOfSequences; iid++) {
+     if (strcmp(sequencename, label) == 0)
+       return(iid);
+
+     // Step over this name and its terminating NUL.
+     label += strlen(label) + 1;
+   }
+
+   return(~uint32ZERO);
+ }
+
+
+
+ // Length in bases of sequence 'iid', or zero if there is no such sequence.
+ uint32
+ seqStore::getSequenceLength(uint32 iid) {
+
+   if (_index == NULL)
+     loadIndex();
+
+   if (iid >= _header._numberOfSequences)
+     return(0);
+
+   return(_index[iid]._seqLength);
+ }
+
+
+
+ // Load all of sequence 'iid'.
+ //
+ // h and s are caller-owned buffers with allocated sizes hMax and sMax; a
+ // size of zero means "no buffer yet".  Either is reallocated if too small.
+ // On success h holds the NUL-terminated header and s the NUL-terminated
+ // sequence, with hLen and sLen set to their lengths.
+ //
+ // Returns false, touching nothing, if 'iid' is out of range.
+ bool
+ seqStore::getSequence(uint32 iid,
+                       char *&h, uint32 &hLen, uint32 &hMax,
+                       char *&s, uint32 &sLen, uint32 &sMax) {
+
+   if (_index == NULL)
+     loadIndex();
+
+   if (iid >= _header._numberOfSequences) {
+     fprintf(stderr, "seqStore::getSequence(full)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+             iid, _header._numberOfSequences);
+     return(false);
+   }
+
+   if (sMax == 0) s = 0L; // So the delete below doesn't bomb
+   if (hMax == 0) h = 0L;
+
+   if (sMax < _index[iid]._seqLength + 1) {
+     sMax = _index[iid]._seqLength + 1024;
+     delete [] s;
+     s = new char [sMax];
+   }
+
+   if (hMax < _index[iid]._hdrLength + 1) {
+     hMax = _index[iid]._hdrLength + 1024;
+     delete [] h;
+     h = new char [hMax];
+   }
+
+   // Copy the defline into h.  hLen must report its length: it used to be
+   // left at zero, so callers such as seqCache saw an empty header length
+   // next to a non-empty header.
+
+   hLen = _index[iid]._hdrLength;
+   sLen = 0;
+
+   memcpy(h, _names + _index[iid]._hdrPosition, hLen);
+
+   h[hLen] = 0;
+
+   // Decode and copy the sequence into s, one block at a time.  Positions
+   // in the index and block tables are in bases; the bit file is addressed
+   // in bits, two per base.
+
+   uint32 seqLen = _index[iid]._seqLength;
+   uint32 block = _index[iid]._block;
+   uint64 seekpos = _index[iid]._seqPosition * 2;
+
+   _bpf->seek(seekpos);
+
+   while (sLen < seqLen) {
+     assert(_bpf->tell() == _block[block]._bpf * 2);
+     assert(sLen == _block[block]._pos);
+
+     if (_block[block]._isACGT == 0) {
+       // Gap blocks store no data; they decode to a run of N.
+       memset(s + sLen, 'N', _block[block]._len);
+       sLen += _block[block]._len;
+     } else {
+       for (uint32 xx=0; xx<_block[block]._len; xx++) {
+         s[sLen++] = bitsToLetter[_bpf->getBits(2)];
+       }
+     }
+
+     block++;
+   }
+
+   s[sLen] = 0;
+
+   return(true);
+ }
+
+
+
+ // Load bases [bgn, end) of sequence 'iid' into s, NUL terminated.  The
+ // caller must supply room for (end - bgn + 1) characters.
+ //
+ // Returns false, leaving s untouched, if 'iid' is out of range, if the
+ // range is empty or reversed, or if it extends past the end of the
+ // sequence.
+ bool
+ seqStore::getSequence(uint32 iid,
+                       uint32 bgn, uint32 end, char *s) {
+
+   if (_index == NULL)
+     loadIndex();
+
+   if (iid >= _header._numberOfSequences) {
+     fprintf(stderr, "seqStore::getSequence(part)-- iid "uint32FMT" more than number of sequences "uint32FMT"\n",
+             iid, _header._numberOfSequences);
+     return(false);
+   }
+
+   // A range running past the end of the sequence would walk into the
+   // blocks of the next sequence, so reject it along with empty ranges.
+   if ((bgn >= end) || (end > _index[iid]._seqLength)) {
+     fprintf(stderr, "seqStore::getSequence(part)-- for iid "uint32FMT"; invalid bgn="uint32FMT" end="uint32FMT"; seqLen="uint32FMT"\n",
+             iid, bgn, end, _index[iid]._seqLength);
+     return(false);
+   }
+
+   // Decode and copy the sequence into s
+
+   uint32 block = _index[iid]._block;
+   uint32 sLen = 0; // length of sequence we've copied
+   uint32 sPos = 0; // position in the sequence
+
+   // Skip blocks before we care.
+   //
+   while (sPos + _block[block]._len < bgn) {
+     sPos += _block[block]._len;
+     block++;
+   }
+
+   assert(sPos == _block[block]._pos);
+
+   // Move into the block (we could just set sPos = bgn...).
+   sPos += bgn - _block[block]._pos;
+
+   // Handle the partial block. Copy what is left in the block, or
+   // the requested size, whichever is smaller.
+
+   uint32 partLen = MIN((_block[block]._pos + _block[block]._len - bgn),
+                        (end - bgn));
+
+   if (_block[block]._isACGT == 0) {
+     // A gap block has no stored data; position the bit file at the start
+     // of the next block so the loop below finds it where it expects.
+     memset(s, 'N', partLen);
+     sLen += partLen;
+     _bpf->seek(_block[block+1]._bpf * 2);
+   } else {
+     _bpf->seek((_block[block]._bpf + bgn - _block[block]._pos) * 2);
+
+     for (uint32 xx=0; xx<partLen; xx++)
+       s[sLen++] = bitsToLetter[_bpf->getBits(2)];
+   }
+
+   sPos += partLen;
+
+   block++;
+
+   while (sPos < end) {
+     assert(_bpf->tell() == _block[block]._bpf * 2);
+     assert(sPos == _block[block]._pos);
+
+     // Like the partial block above, pick how much to copy as the
+     // smaller of the block size and what is left to fill.
+
+     partLen = MIN((_block[block]._len), (end - sPos));
+
+     if (_block[block]._isACGT == 0) {
+       memset(s + sLen, 'N', partLen);
+       sLen += partLen;
+     } else {
+       for (uint32 xx=0; xx<partLen; xx++)
+         s[sLen++] = bitsToLetter[_bpf->getBits(2)];
+     }
+
+     sPos += partLen;
+
+     block++;
+   }
+
+   s[sLen] = 0;
+
+   return(true);
+ }
+
+
+
+ // Reset every member to the "no file attached" state.  Nothing is freed
+ // here, so this is only safe on a freshly constructed object.
+ void
+ seqStore::clear(void) {
+   memset(_filename, 0, FILENAME_MAX);
+   memset(_typename, 0, FILENAME_MAX);
+
+   strcpy(_typename, "seqStore");
+
+   // The seqFile base class initializes none of its members.  A seqStore
+   // is indexed and seekable, so it supports random access; without this
+   // the inherited flag is read uninitialized by the default
+   // randomAccessSupported().
+   _randomAccessSupported = true;
+
+   _numberOfSequences = 0;
+
+   _bpf = 0L;
+
+   memset(&_header, 0, sizeof(seqStoreHeader));
+
+   _index = 0L;
+   _block = 0L;
+   _names = 0L;
+
+   _indexBPF = 0L;
+   _blockBPF = 0L;
+   _namesBPF = 0L;
+
+   _lastIIDloaded = ~uint32ZERO;
+ }
+
+
+
+ // Load the index, block and name tables into memory.  Does nothing if
+ // they are already loaded.  Exits the program if the file cannot be
+ // opened or any table cannot be read in full.
+ void
+ seqStore::loadIndex(void) {
+
+   if (_index)
+     return;
+
+   delete _indexBPF; _indexBPF = 0L;
+   delete _blockBPF; _blockBPF = 0L;
+   delete _namesBPF; _namesBPF = 0L;
+
+   // Failure is detected from the return values; errno is consulted only
+   // to describe a failure that has already been detected.
+
+   FILE *F = fopen(_filename, "r");
+   if (F == NULL) {
+     fprintf(stderr, "seqStore::loadIndex()-- Failed to open '%s': %s\n",
+             _filename, strerror(errno));
+     exit(1);
+   }
+
+   errno = 0;
+
+   bool ok = (fread(&_header, sizeof(seqStoreHeader), 1, F) == 1);
+
+   if (ok) {
+     _index = new seqStoreIndex [_header._numberOfSequences];
+     _block = new seqStoreBlock [_header._numberOfBlocks];
+     _names = new char [_header._namesLength];
+
+     ok = ok && (fseeko(F, _header._indexStart, SEEK_SET) == 0);
+     ok = ok && (fread(_index, sizeof(seqStoreIndex), _header._numberOfSequences, F) == _header._numberOfSequences);
+
+     ok = ok && (fseeko(F, _header._blockStart, SEEK_SET) == 0);
+     ok = ok && (fread(_block, sizeof(seqStoreBlock), _header._numberOfBlocks, F) == _header._numberOfBlocks);
+
+     ok = ok && (fseeko(F, _header._namesStart, SEEK_SET) == 0);
+     ok = ok && (fread(_names, sizeof(char), _header._namesLength, F) == _header._namesLength);
+   }
+
+   if (ok == false) {
+     // A short read caused by a truncated file leaves errno at zero.
+     fprintf(stderr, "seqStore::loadIndex()-- Failed to read index from '%s': %s\n",
+             _filename, (errno) ? strerror(errno) : "file is truncated");
+     exit(1);
+   }
+
+   fclose(F);
+ }
+
+
+
+
+
+
+
+
+
+
+
+
+
+ // Append block 'b' to the BLOK array and update the running statistics.
+ // Empty blocks are ignored.  The array is doubled whenever it becomes
+ // full, so on return there is always at least one free slot.
+ static
+ void
+ addSeqStoreBlock(uint32 &BLOKmax,
+                  uint32 &BLOKlen,
+                  seqStoreBlock* &BLOK,
+                  seqStoreBlock &b,
+                  uint32 &nBlockACGT,
+                  uint32 &nBlockGAP,
+                  uint64 &nACGT) {
+
+   if (b._len == 0)
+     return;
+
+   BLOK[BLOKlen] = b;
+   BLOKlen++;
+
+   if (b._isACGT != 1) {
+     nBlockGAP++;
+   } else {
+     nACGT += b._len;
+     nBlockACGT++;
+   }
+
+   if (BLOKlen < BLOKmax)
+     return;
+
+   // Full: move everything into an array twice the size.
+
+   BLOKmax *= 2;
+
+   seqStoreBlock *bigger = new seqStoreBlock [BLOKmax];
+
+   for (uint32 i=0; i<BLOKlen; i++)
+     bigger[i] = BLOK[i];
+
+   delete [] BLOK;
+   BLOK = bigger;
+ }
+
+
+
+ // Build a seqStore file 'filename' from every sequence in 'inputseq'.
+ //
+ // The sequence data is written first, two bits per ACGT letter, with runs
+ // of other letters recorded only as gap blocks.  The index, block and name
+ // tables are then appended and the header is written last, at the start
+ // of the file.
+ //
+ // Exits the program on any I/O failure, or if a sequence is too long.
+ void
+ constructSeqStore(char *filename, seqCache *inputseq) {
+
+   fprintf(stderr, "constructSeqStore()-- constructing seqStore '%s' from seqCache '%s' of type '%s'.\n",
+           filename, inputseq->getSourceName(), inputseq->getFileTypeName());
+
+   // memset() takes (pointer, value, count); the value and count used to
+   // be swapped, which cleared nothing.
+   seqStoreHeader HEAD;
+   memset(&HEAD, 0, sizeof(seqStoreHeader));
+
+   bitPackedFile *DATA = new bitPackedFile(filename, sizeof(seqStoreHeader), true);
+
+   uint32 INDXmax = 1048576;
+   seqStoreIndex *INDX = new seqStoreIndex [INDXmax];
+
+   uint32 BLOKmax = 1048576;
+   uint32 BLOKlen = 0;
+   seqStoreBlock *BLOK = new seqStoreBlock [BLOKmax];
+
+   uint32 NAMEmax = 32 * 1024 * 1024;
+   uint32 NAMElen = 0;
+   char *NAME = new char [NAMEmax];
+
+   seqInCore *sic = inputseq->getSequenceInCore();
+
+   uint64 nACGT = 0;
+   uint32 nBlockACGT = 0;
+   uint32 nBlockGAP = 0;
+   uint32 nSequences = 0;
+
+   speedCounter C(" reading sequences %7.0f sequences -- %5.0f sequences/second\r", 1.0, 0x1ffff, true);
+
+   while (sic != NULL) {
+
+     // Make room for this sequence's index record.  This is done for every
+     // input, including those with no sequence data, because nSequences
+     // advances for all of them.
+     //
+     if (nSequences >= INDXmax) {
+       seqStoreIndex *I = new seqStoreIndex[INDXmax * 2];
+       memcpy(I, INDX, sizeof(seqStoreIndex) * nSequences);
+       delete [] INDX;
+       INDXmax *= 2;
+       INDX = I;
+     }
+
+     // Start from an all-zero record, so that a sequence with no data
+     // really does leave a blank record, not uninitialized memory.
+     //
+     memset(INDX + nSequences, 0, sizeof(seqStoreIndex));
+
+     if (sic->sequence()) {
+       char *seq = sic->sequence();
+       seqStoreBlock b;
+
+       // Positions are recorded in bases; the bit file counts bits, two
+       // per base.
+
+       INDX[nSequences]._hdrPosition = NAMElen;
+       INDX[nSequences]._hdrLength = sic->headerLength();
+       INDX[nSequences]._seqPosition = DATA->tell() / 2;
+       INDX[nSequences]._seqLength = sic->sequenceLength();
+       INDX[nSequences]._block = BLOKlen;
+
+       if (sic->sequenceLength() > SEQSTOREBLOCK_MAXPOS)
+         fprintf(stderr, "constructSeqStore()-- sequence %s too long, must be shorter than "uint64FMT" Gbp.\n",
+                 sic->header(), SEQSTOREBLOCK_MAXPOS / 1024 / 1024 / 1024), exit(1);
+
+       // The limit on the number of sequences is the IID field width.
+       if (sic->getIID() > SEQSTOREBLOCK_MAXIID)
+         fprintf(stderr, "constructSeqStore()-- too many sequences, must be fewer than "uint64FMT".\n",
+                 SEQSTOREBLOCK_MAXIID), exit(1);
+
+       if (NAMElen + sic->headerLength() + 1 > NAMEmax) {
+         NAMEmax += 32 * 1024 * 1024;
+         char *nm = new char [NAMEmax];
+         memcpy(nm, NAME, sizeof(char) * NAMElen);
+         delete [] NAME;
+         NAME = nm;
+       }
+       strcpy(NAME + NAMElen, sic->header());
+       NAMElen += sic->headerLength() + 1;
+
+       b._isACGT = 0;
+       b._iid = sic->getIID();
+       b._pos = 0;
+       b._len = 0;
+       b._bpf = DATA->tell() / 2;
+
+       for (uint32 p=0; p<sic->sequenceLength(); p++) {
+
+         // Index the table with an unsigned value; a plain char may be
+         // signed, making bytes above 0x7f negative indices.
+         //
+         uint64 bits = letterToBits[(unsigned char)seq[p]];
+
+         // If the length of the current block is too big (which would
+         // soon overflow the bit field storing length) write out a
+         // block and reset the length.
+         //
+         if (b._len == SEQSTOREBLOCK_MAXLEN) {
+           addSeqStoreBlock(BLOKmax, BLOKlen, BLOK, b, nBlockACGT, nBlockGAP, nACGT);
+
+           b._pos = p;
+           b._len = 0;
+           b._bpf = DATA->tell() / 2;
+         }
+
+
+         if (bits == 0xff) {
+           // This letter is NOT ACGT. If the current block is an ACGT block, write it
+           // and reset.
+           //
+           if (b._isACGT == 1) {
+             addSeqStoreBlock(BLOKmax, BLOKlen, BLOK, b, nBlockACGT, nBlockGAP, nACGT);
+
+             b._isACGT = 0;
+             b._iid = sic->getIID();
+             b._pos = p;
+             b._len = 0;
+             b._bpf = DATA->tell() / 2;
+           }
+
+         } else {
+
+           // This letter is ACGT. If the current block is NOT an ACGT block, write it
+           // and reset.
+           //
+           if (b._isACGT == 0) {
+             addSeqStoreBlock(BLOKmax, BLOKlen, BLOK, b, nBlockACGT, nBlockGAP, nACGT);
+
+             b._isACGT = 1;
+             b._iid = sic->getIID();
+             b._pos = p;
+             b._len = 0;
+             b._bpf = DATA->tell() / 2;
+           }
+         }
+
+         // Always add one to the length of the current block, and
+         // write out the base if the letter is ACGT.
+         //
+         b._len++;
+
+         if (bits != 0xff)
+           DATA->putBits(bits, 2);
+       }
+
+       // Emit the last block
+       //
+       addSeqStoreBlock(BLOKmax, BLOKlen, BLOK, b, nBlockACGT, nBlockGAP, nACGT);
+     }
+
+     // If there is no sequence, the index record for this sequence is left blank.
+     //
+     nSequences++;
+
+     C.tick();
+
+     delete sic;
+     sic = inputseq->getSequenceInCore();
+   }
+
+   // And a sentinel EOF block -- gets the last position in the file,
+   // useful for the binary search. addSeqStoreBlock() grows the array
+   // whenever it fills, so there is always a free slot for the sentinel
+   // and no reallocation is needed here.
+
+   BLOK[BLOKlen]._isACGT = 0;
+   BLOK[BLOKlen]._iid = uint32MASK(32);
+   BLOK[BLOKlen]._pos = uint32MASK(31);
+   BLOK[BLOKlen]._len = 0;
+   BLOK[BLOKlen]._bpf = DATA->tell() / 2;
+
+   BLOKlen++;
+
+   // Update the header, assemble the final file.  Deleting DATA closes the
+   // bit file so the tables can be appended with stdio.
+
+   delete DATA;
+
+   HEAD._magic[0] = SEQSTORE_MAGICNUMBER1;
+   HEAD._magic[1] = SEQSTORE_MAGICNUMBER2;
+   HEAD._pad = uint32ZERO;
+   HEAD._numberOfSequences = nSequences;
+   HEAD._numberOfACGT = nACGT;
+   HEAD._numberOfBlocksACGT = nBlockACGT;
+   HEAD._numberOfBlocksGAP = nBlockGAP;
+   HEAD._numberOfBlocks = BLOKlen;
+   HEAD._namesLength = NAMElen;
+   HEAD._indexStart = uint64ZERO;
+   HEAD._blockStart = uint64ZERO;
+   HEAD._namesStart = uint64ZERO;
+
+   // Success is judged by return values; errno is read only to describe a
+   // failure that has already been detected.
+
+   FILE *F = fopen(filename, "r+");
+   if (F == NULL)
+     fprintf(stderr, "constructSeqStore()-- Failed to reopen '%s' to write data: %s\n",
+             filename, strerror(errno)), exit(1);
+
+   bool ok = true;
+
+   ok = (fseeko(F, 0, SEEK_END) == 0) && ok;
+   HEAD._indexStart = ftello(F);
+   ok = (fwrite(INDX, sizeof(seqStoreIndex), HEAD._numberOfSequences, F) == HEAD._numberOfSequences) && ok;
+
+   ok = (fseeko(F, 0, SEEK_END) == 0) && ok;
+   HEAD._blockStart = ftello(F);
+   ok = (fwrite(BLOK, sizeof(seqStoreBlock), HEAD._numberOfBlocks, F) == HEAD._numberOfBlocks) && ok;
+
+   ok = (fseeko(F, 0, SEEK_END) == 0) && ok;
+   HEAD._namesStart = ftello(F);
+   ok = (fwrite(NAME, sizeof(char), HEAD._namesLength, F) == HEAD._namesLength) && ok;
+
+   ok = (fseeko(F, 0, SEEK_SET) == 0) && ok;
+   ok = (fwrite(&HEAD, sizeof(seqStoreHeader), 1, F) == 1) && ok;
+
+   // fclose() flushes, so its result is part of the write.
+   ok = (fclose(F) == 0) && ok;
+
+   if (ok == false)
+     fprintf(stderr, "constructSeqStore()-- Failed to write data to '%s': %s\n",
+             filename, strerror(errno)), exit(1);
+
+   delete [] INDX;
+   delete [] BLOK;
+   delete [] NAME;
+
+   // ESTmapper depends on this output.
+
+   fprintf(stderr, "constructSeqStore()-- seqStore '%s' constructed ("uint32FMT" sequences, "uint64FMT" ACGT letters, "uint32FMT" ACGT blocks, "uint32FMT" GAP blocks).\n",
+           filename, HEAD._numberOfSequences, HEAD._numberOfACGT, HEAD._numberOfBlocksACGT, HEAD._numberOfBlocksGAP);
+ }
diff --git a/libseq/seqStore.H b/libseq/seqStore.H
new file mode 100644
index 0000000..90771ab
--- /dev/null
+++ b/libseq/seqStore.H
@@ -0,0 +1,120 @@
+#ifndef SEQSTORE_H
+#define SEQSTORE_H
+
+#include "util++.H"
+#include "seqCache.H"
+
+// A binary fasta file.
+//
+// HEADER
+// magic number
+// number of sequences
+// optional - alphabet size
+// optional - alphabet map (0x00 -> 'a', etc)
+// position of index start
+// position of data start
+// DATA
+// INDEX
+// position of sequence start in DATA
+// header length
+// sequence length
+// MAP
+// name to IID mapping
+
+// Fixed-size header of a seqStore file. constructSeqStore() writes it
+// at offset zero of the file, after the index, block and name tables
+// have been appended and their start offsets recorded here.
+//
+struct seqStoreHeader {
+ uint64 _magic[2]; // SEQSTORE_MAGICNUMBER1, SEQSTORE_MAGICNUMBER2
+ uint32 _pad; // written as zero
+ uint32 _numberOfSequences; // number of seqStoreIndex entries
+ uint64 _numberOfACGT; // number of ACGT letters
+ uint32 _numberOfBlocksACGT; // number of ACGT blocks
+ uint32 _numberOfBlocksGAP; // number of GAP blocks
+ uint32 _numberOfBlocks; // number of seqStoreBlock entries written
+ uint32 _namesLength; // number of chars in the names table
+
+ uint64 _indexStart; // file offset of the seqStoreIndex table
+ uint64 _blockStart; // file offset of the seqStoreBlock table
+ uint64 _namesStart; // file offset of the names table
+};
+
+
+// This index allows us to return a complete sequence
+//
+struct seqStoreIndex {
+ // constructSeqStore() writes these as one array of
+ // _numberOfSequences entries, starting at the header's _indexStart.
+ uint32 _hdrPosition; // Offset into _names for the defline
+ uint32 _hdrLength; // Length of the defline
+ uint64 _seqPosition; // Offset into _bpf for the sequence data
+ uint32 _seqLength; // Length, in bases, of the sequence
+ uint32 _block; // The seqStoreBlock that starts this sequence
+};
+
+
+// This index allows us to seek to a specific base in the
+// file of sequences. Each block is either:
+// ACGT - and has data
+// N - no data
+// It will map a specific ACGT location to the sequence, and the ID
+// of that sequence (seq ID and location in that sequence).
+//
+struct seqStoreBlock {
+ // The bit-field widths below total 128 bits (1+32+32+23+40).
+ // constructSeqStore() writes these as one array of _numberOfBlocks
+ // entries, starting at the header's _blockStart.
+ uint64 _isACGT:1; // block is acgt
+ uint64 _pos:32; // position in sequence
+ uint64 _iid:32; // iid of the sequence we are in
+ uint64 _len:23; // length of block
+ uint64 _bpf:40; // position in the bit file of sequence
+};
+
+// Limits matching the _pos, _iid and _len field widths above.
+// NOTE(review): uint64MASK(n) is presumably a mask of the low n bits --
+// confirm against its definition.
+#define SEQSTOREBLOCK_MAXPOS uint64MASK(32)
+#define SEQSTOREBLOCK_MAXIID uint64MASK(32)
+#define SEQSTOREBLOCK_MAXLEN uint64MASK(23)
+
+// A seqFile backed by the binary seqStore format described above.
+// The constructors are protected and seqFactory is a friend, so
+// instances are presumably created only through seqFactory -- confirm
+// against seqFactory.
+//
+class seqStore : public seqFile {
+protected:
+ seqStore(const char *filename);
+ seqStore();
+
+public:
+ ~seqStore();
+
+protected:
+ seqFile *openFile(const char *filename);
+
+public:
+ uint32 find(const char *sequencename);
+
+ uint32 getSequenceLength(uint32 iid);
+
+ // Fetch a whole sequence (defline in h, bases in s).
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+
+ // Fetch part of a sequence into the caller-supplied buffer s.
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+ void loadIndex(void);
+
+ bitPackedFile *_bpf;
+
+ seqStoreHeader _header;
+
+ // In-core copies of the three tables located by _header.
+ seqStoreIndex *_index;
+ seqStoreBlock *_block;
+ char *_names;
+
+ bitPackedFile *_indexBPF;
+ bitPackedFile *_blockBPF;
+ bitPackedFile *_namesBPF;
+
+ uint32 _lastIIDloaded;
+
+ friend class seqFactory;
+};
+
+
+// Construct a new seqStore 'filename' from input file 'inputseq'.
+//
+// On failure to reopen or write 'filename' the implementation prints a
+// message to stderr and calls exit(1); on success it prints a one-line
+// summary to stderr.
+//
+void
+constructSeqStore(char *filename,
+ seqCache *inputseq);
+
+
+#endif // SEQSTORE_H
diff --git a/libseq/seqStream.C b/libseq/seqStream.C
new file mode 100644
index 0000000..476d239
--- /dev/null
+++ b/libseq/seqStream.C
@@ -0,0 +1,396 @@
+#include "seqFactory.H"
+#include "seqStream.H"
+
+
+seqStream::seqStream(const char *filename) {
+
+  // This stream is backed by a sequence file, not an in-core string.
+  //
+  _file   = openSeqFile(filename);
+  _string = 0L;
+
+  // Nothing has been read yet, and no position lookup table exists.
+  //
+  _eof         = false;
+  _streamPos   = 0;
+  _currentPos  = 0;
+  _currentIdx  = 0;
+  _seqNumOfPos = 0L;
+
+  // The buffer that receives sequence from the file; it starts empty.
+  //
+  _bufferMax = 1048576;
+  _bufferLen = 0;
+  _bufferPos = 0;
+  _bufferSep = 0;
+  _buffer    = new char [_bufferMax + 1];
+
+  // One index entry per sequence, plus the terminating entry that
+  // setSeparator() fills in.
+  //
+  _idxLen = _file->getNumberOfSequences();
+  _idx    = new seqStreamIndex [_idxLen + 1];
+
+  // setSeparator() builds the index and totals the sequence lengths.
+  //
+  _lengthOfSequences = 0;
+  _separator         = '.';
+  _separatorLength   = 2;
+
+  setSeparator('.', 2);
+
+  // The default range is everything.
+  //
+  _bgn = 0;
+  _end = _lengthOfSequences;
+}
+
+
+
+seqStream::seqStream(const char *sequence, uint32 length) {
+
+  // This stream is backed by the caller's string; there is no file.
+  // The string itself serves as the (already full) buffer.
+  //
+  _file      = 0L;
+  _string    = (char *)sequence;
+  _buffer    = _string;
+  _bufferMax = length;
+  _bufferLen = length;
+  _bufferPos = 0;
+  _bufferSep = 0;
+
+  // Start at the beginning of the only sequence.
+  //
+  _eof        = false;
+  _currentIdx = 0;
+  _currentPos = 0;
+  _streamPos  = 0;
+
+  // The index holds that one sequence, followed by the terminator.
+  //
+  _idxLen      = 1;
+  _idx         = new seqStreamIndex [_idxLen + 1];
+  _seqNumOfPos = 0L;
+
+  _idx[0]._iid = 0;
+  _idx[0]._bgn = 0;
+  _idx[0]._len = length;
+
+  _idx[1]._iid = ~uint32ZERO;
+  _idx[1]._bgn = length;
+  _idx[1]._len = 0;
+
+  _lengthOfSequences = length;
+
+  // Recorded only; setSeparator() ignores string-backed streams.
+  //
+  _separator       = '.';
+  _separatorLength = 20;
+
+  // The default range is the whole string.
+  //
+  _bgn = 0;
+  _end = length;
+}
+
+
+
+seqStream::~seqStream() {
+
+  // The index and the position table are always ours.
+  //
+  delete [] _seqNumOfPos;
+  delete [] _idx;
+
+  // The buffer is ours only when we are backed by a file; for a
+  // string-backed stream it is the caller's string.
+  //
+  if (_file != 0L) {
+    delete [] _buffer;
+    delete    _file;
+  }
+}
+
+
+
+// Set the letter and the number of copies of it that get() returns
+// between consecutive sequences, then rebuild the sequence index.
+//
+//   sep -- separator letter; must be printable and not one of ACGTacgt.
+//   len -- number of separator letters; must be non-zero.
+//
+// Invalid arguments print a message to stderr and exit(1).  The call
+// does nothing for string-backed streams.
+//
+void
+seqStream::setSeparator(char sep, uint32 len) {
+
+  // Special case; no separator needed for string backed sequences.
+  if (_string)
+    return;
+
+  // Bizarre signedness issue with sep=255
+  //   ST->get() == sep       FAILS
+  //   x=ST->get(); x == sep  SUCCEEDS
+  //
+  // Not suggested to use non-printable ascii.
+  //
+  // The <ctype.h> functions are only defined for values representable
+  // as unsigned char (or EOF), so convert before classifying; passing
+  // a negative char is undefined behavior.
+
+  int  sepValue = (unsigned char)sep;
+  int  sepLower = tolower(sepValue);
+
+  if ((isprint(sepValue) == 0) || (sepLower == 'a') || (sepLower == 'c') || (sepLower == 'g') || (sepLower == 't')) {
+    fprintf(stderr, "seqStream::setSeparator()-- ERROR!  Separator letter must be printable ASCII and not [ACGTacgt].\n");
+    exit(1);
+  }
+  if (len == 0) {
+    fprintf(stderr, "seqStream::setSeparator()-- ERROR!  Separator length cannot be zero.\n");
+    exit(1);
+  }
+
+  _lengthOfSequences = 0;
+
+  _separator       = sep;
+  _separatorLength = len;
+
+  // Begin positions count sequence letters only; separators do not
+  // occupy positions in the chained coordinate space.
+
+  for (uint32 s=0; s<_idxLen; s++) {
+    _idx[s]._iid = s;
+    _idx[s]._len = _file->getSequenceLength(s);
+    _idx[s]._bgn = _lengthOfSequences;
+
+    _lengthOfSequences += _idx[s]._len;
+  }
+
+  // Terminating entry, one past the last sequence.
+
+  _idx[_idxLen]._iid = ~uint32ZERO;
+  _idx[_idxLen]._len = 0;
+  _idx[_idxLen]._bgn = _lengthOfSequences;
+
+  // Rebuild our sequence number of position map, if it exists.  The
+  // pointer is cleared after the delete so that it never dangles
+  // while the replacement is being allocated.
+  //
+  if (_seqNumOfPos) {
+    delete [] _seqNumOfPos;
+    _seqNumOfPos = 0L;
+    tradeSpaceForTime();
+  }
+}
+
+
+
+// Build _seqNumOfPos, a table with one entry per chained position
+// giving the index (into _idx) of the sequence holding that position.
+// sequenceNumberOfPosition() uses the table, when it exists, instead
+// of searching.
+//
+// A previously built table is not freed here; setSeparator() frees the
+// old table itself before calling this to rebuild it.
+//
+void
+seqStream::tradeSpaceForTime(void) {
+  uint32 s = 0;
+
+  _seqNumOfPos = new uint32 [_lengthOfSequences];
+
+  // The position counter is 64-bit, like _lengthOfSequences.  A 32-bit
+  // counter can never reach a length of 2^32 or more; it would wrap
+  // and the loop would not terminate.
+  //
+  for (uint64 i=0; i<_lengthOfSequences; i++) {
+
+    // Increment the sequence number until we enter into the next
+    // sequence.  Zero length sequences require the use of a 'while'
+    // here.  The terminating entry _idx[_idxLen] begins at
+    // _lengthOfSequences, which stops the scan.
+    //
+    while (i >= _idx[s+1]._bgn)
+      s++;
+
+    _seqNumOfPos[i] = s;
+  }
+}
+
+
+
+// Return the next letter of the stream, or zero once the end of the
+// range (or of the input) has been reached.
+//
+unsigned char
+seqStream::get(void) {
+
+  // Past the end of the requested range?
+  if (_streamPos >= _end)
+    _eof = true;
+
+  // Buffer exhausted?  Refill it; this may also discover end-of-file.
+  if ((!_eof) && (_bufferPos >= _bufferLen))
+    fillBuffer();
+
+  if (_eof)
+    return(0);
+
+  // Separator letters do not advance the sequence/stream positions.
+  if (_bufferSep > 0) {
+    _bufferSep--;
+  } else {
+    _currentPos++;
+    _streamPos++;
+  }
+
+  return(_buffer[_bufferPos++]);
+}
+
+
+
+// Reposition the stream at _bgn: locate _bgn in the index, reset the
+// sequence, stream and buffer positions, clear end-of-file, and call
+// fillBuffer().
+//
+void
+seqStream::rewind(void){
+
+ // Search for the correct spot. Uncommon operation, be inefficient
+ // but simple. The range was checked to be good by setRange().
+
+ uint32 s = 0;
+ uint64 l = 0;
+
+ // Skip every sequence that ends before _bgn; afterwards 'l' is the
+ // total length of the skipped sequences.
+ while ((s < _idxLen) && (l + _idx[s]._len < _bgn))
+ l += _idx[s++]._len;
+
+ _eof = false;
+
+ // (_bgn - l) is a 32-bit quantity because of the second half of
+ // the while above. Although _bgn is a 64-bit value, the value
+ // used to set _bufferPos will be for that of a string constructor,
+ // and so _bgn will be 32-bits. fillBuffer() resets _bufferPos if
+ // we're backed by a file.
+
+ _currentIdx = s;
+ _currentPos = _bgn - l;
+ _streamPos = _bgn;
+ _bufferPos = _bgn;
+
+ //fprintf(stderr, "seqStream::rewind()-- 1 currentIdx="uint32FMT" currentPos="uint32FMT" streamPos="uint32FMT" bufferPos="uint32FMT"\n",
+ // _currentIdx, _currentPos, _streamPos, _bufferPos);
+
+ fillBuffer();
+
+ //fprintf(stderr, "seqStream::rewind()-- 2 currentIdx="uint32FMT" currentPos="uint32FMT" streamPos="uint32FMT" bufferPos="uint32FMT"\n",
+ // _currentIdx, _currentPos, _streamPos, _bufferPos);
+}
+
+
+
+// Restrict the stream to chained positions bgn (inclusive) through end
+// (exclusive), then rewind to bgn.  An end of ~uint64ZERO means "to
+// the end of the last sequence".  A range beyond the total length is
+// reported on stderr and exits.
+//
+void
+seqStream::setRange(uint64 bgn, uint64 end) {
+
+  assert(bgn < end);
+
+  // Total number of positions available.
+  //
+  uint64 total = 0;
+
+  for (uint32 i=0; i<_idxLen; i++)
+    total += _idx[i]._len;
+
+  if (end == ~uint64ZERO)
+    end = total;
+
+  if ((bgn > total) || (end > total)) {
+    fprintf(stderr, "seqStream::setRange()-- ERROR: range ("uint64FMT","uint64FMT") too big; only "uint64FMT" positions.\n",
+            bgn, end, total);
+    exit(1);
+  }
+
+  _bgn = bgn;
+  _end = end;
+
+  rewind();
+}
+
+
+// Move the stream to position pos, which must lie inside the current
+// range.  The range itself is left unchanged.
+//
+void
+seqStream::setPosition(uint64 pos) {
+
+  assert(_bgn <= pos);
+  assert(pos  <  _end);
+
+  // rewind() always goes to _bgn, so borrow _bgn for the duration of
+  // the call and put the real value back afterwards.
+  //
+  const uint64 savedBgn = _bgn;
+
+  _bgn = pos;
+  rewind();
+  _bgn = savedBgn;
+}
+
+
+// Return the index (into _idx) of the sequence that holds chained
+// position p. If p is not less than _lengthOfSequences, a warning is
+// printed and ~uint32ZERO is returned.
+//
+uint32
+seqStream::sequenceNumberOfPosition(uint64 p) {
+ uint32 s = ~uint32ZERO;
+
+ // binary search on our list of start positions, to find the
+ // sequence that p is in.
+
+ if (_lengthOfSequences <= p) {
+ fprintf(stderr, "seqStream::sequenceNumberOfPosition()-- WARNING: position p="uint64FMT" too big; only "uint64FMT" positions.\n",
+ p, _lengthOfSequences);
+ return(s);
+ }
+
+ // Use the lookup table built by tradeSpaceForTime(), if present.
+ if (_seqNumOfPos)
+ return(_seqNumOfPos[p]);
+
+ if (_idxLen < 16) {
+ // Few sequences; a linear scan of the start positions.
+ for (s=0; s<_idxLen; s++)
+ if ((_idx[s]._bgn <= p) && (p < _idx[s+1]._bgn))
+ break;
+ } else {
+ // The search keeps _idx[lo]._bgn <= p < _idx[hi]._bgn; hi starts
+ // at the terminating entry _idx[_idxLen].
+ uint32 lo = 0;
+ uint32 hi = _idxLen;
+ uint32 md = 0;
+
+ while (lo <= hi) {
+ md = (lo + hi) / 2;
+
+ if (p < _idx[md]._bgn) {
+ // This block starts after the one we're looking for.
+ hi = md;
+
+ } else if ((_idx[md]._bgn <= p) && (p < _idx[md+1]._bgn)) {
+ // Got it!
+ // Setting lo > hi ends the loop.
+ lo = md + 1;
+ hi = md;
+ s = md;
+
+ } else {
+ // By default, then, the block is too low.
+ lo = md;
+ }
+ }
+ }
+
+ return(s);
+}
+
+
+
+// Refill _buffer from the file. Either the next piece of the current
+// sequence is loaded, or, when the current sequence is finished, the
+// separator followed by the start of the next non-empty sequence.
+// Sets _eof when no sequences remain.
+//
+void
+seqStream::fillBuffer(void) {
+
+ // Special case for when we're backed by a character string; there
+ // is no need to fill the buffer.
+ //
+ if (_file == 0L) {
+ if (_currentPos >= _end)
+ _eof = true;
+ return;
+ }
+
+ // Read bytes from the _file, stuff them into the buffer. Assumes
+ // there is nothing in the buffer to save.
+
+ _bufferLen = 0;
+ _bufferPos = 0;
+
+ // Still more stuff in the sequence? Get it.
+
+ if (_currentPos < _idx[_currentIdx]._len) {
+#ifdef DEBUG
+ fprintf(stderr, "seqStream::fillBuffer()-- More Seq currentPos="uint32FMT" len="uint32FMT"\n", _currentPos, _idx[_currentIdx]._len);
+#endif
+ _bufferLen = MIN(_idx[_currentIdx]._len - _currentPos, _bufferMax);
+
+ if (_file->getSequence(_idx[_currentIdx]._iid,
+ _currentPos,
+ _currentPos + _bufferLen,
+ _buffer) == false)
+ fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #1 iid="uint32FMT" bgn="uint32FMT" end="uint32FMT"\n",
+ _idx[_currentIdx]._iid, _currentPos, _currentPos + _bufferLen), exit(1);
+
+ return;
+ }
+
+ // We've finished a sequence. Load the next.
+
+ _currentPos = 0;
+ _currentIdx++;
+
+ // Skip over zero-length sequences.
+ while ((_currentIdx < _idxLen) && (_idx[_currentIdx]._len == 0))
+ _currentIdx++;
+
+#ifdef DEBUG
+ fprintf(stderr, "seqStream::fillBuffer()-- New Seq currentPos="uint32FMT" len="uint32FMT"\n", _currentPos, _idx[_currentIdx]._len);
+#endif
+
+ // All done if there is no more sequence.
+
+ if (_currentIdx >= _idxLen) {
+ _eof = true;
+ return;
+ }
+
+ // Insert a separator.
+ //
+ // NOTE(review): nothing here bounds _separatorLength by _bufferMax;
+ // a separator longer than the buffer would write past its end and
+ // make the subtraction below wrap -- confirm callers never do this.
+
+ for (_bufferLen = 0; _bufferLen < _separatorLength; _bufferLen++)
+ _buffer[_bufferLen] = _separator;
+
+ // Keep track of the separator - this is used to make sure we don't
+ // advance the sequence/stream position while the separator is
+ // being returned.
+ //
+ _bufferSep = _bufferLen;
+
+ // How much to get; minimum of what is left in the sequence, and
+ // the buffer size. Don't forget about the separator we already
+ // inserted!
+ //
+ uint32 bl = MIN(_idx[_currentIdx]._len - _currentPos, _bufferMax - _bufferLen);
+
+ if (_file->getSequence(_idx[_currentIdx]._iid,
+ _currentPos,
+ _currentPos + bl,
+ _buffer + _bufferLen) == false)
+ fprintf(stderr, "seqStream::fillBuffer()-- Failed to getSequence(part) #2 iid="uint32FMT" bgn="uint32FMT" end="uint32FMT"\n",
+ _idx[_currentIdx]._iid, _currentPos, _currentPos + bl), exit(1);
+
+ _bufferLen += bl;
+
+ // Load more, until buffer is full. Not really needed, and won't
+ // improve performance much. AND it adds a lot of complexity to
+ // track which sequence is current (_currentIdx).
+
+ return;
+}
diff --git a/libseq/seqStream.H b/libseq/seqStream.H
new file mode 100644
index 0000000..7ab6092
--- /dev/null
+++ b/libseq/seqStream.H
@@ -0,0 +1,124 @@
+#ifndef SEQSTREAM_H
+#define SEQSTREAM_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+// One entry per sequence in a seqStream, plus a terminating entry
+// whose _iid is ~uint32ZERO, whose _len is zero and whose _bgn is the
+// total length of all sequences.
+//
+struct seqStreamIndex {
+ uint32 _iid; // seqFile IID
+ uint32 _len; // length of the sequence
+ uint64 _bgn; // begin position in the stream
+};
+
+
+// Presents the sequences of a file, or a single in-core string, as
+// one chained stream of letters read one at a time with get().
+//
+class seqStream {
+public:
+ // Stream the sequences in a file, or the 'length' letters of an
+ // in-core string. The string is used in place, not copied.
+ //
+ seqStream(const char *filename);
+ seqStream(const char *sequence, uint32 length);
+ ~seqStream();
+
+ // Separate sequences with this letter. Non-ACGT is always
+ // returned as 'N'. Changing the length of the separator AFTER
+ // setting the range will result in the wrong range being used.
+ //
+ void setSeparator(char sep, uint32 len);
+
+ // get() returns one letter per input letter -- a gap of size n
+ // will return n gap symbols.
+ //
+ unsigned char get(void);
+ bool eof(void) { return(_eof); };
+
+ // Returns to the start of the range.
+ //
+ void rewind(void);
+
+ // Set the range of ACGT sequence we will return. Coordinates are
+ // space-based. Example:
+ //
+ // >one
+ // AAA
+ // >two
+ // C
+ // >three
+ // GGG
+ //
+ // We separate these sequences with three '-' letters.
+ //
+ // strPos 012...3...456
+ // AAA---C---GGG
+ //
+ // range(0,0) -> nothing
+ // range(0,1) -> A
+ // range(0,3) -> AAA
+ // range(0,4) -> AAAnnnC
+ // range(0,5) -> AAAnnnCnnnG
+ //
+ void setRange(uint64 bgn, uint64 end);
+ void setPosition(uint64 pos);
+
+ // seqPos() is the position we are at in the current sequence;
+ // seqIID() is the iid of that sequence;
+ // strPos() is the position we are at in the chained sequence
+ //
+ // Values are not defined if the letter is a separator.
+ //
+ uint32 seqPos(void) { return(_currentPos); };
+ uint32 seqIID(void) { return(_idx[_currentIdx]._iid); };
+ uint64 strPos(void) { return(_streamPos); };
+
+ uint32 numberOfSequences(void) { return(_idxLen); };
+
+ // Return the length of, position of (in the chain) and IID of the
+ // (s)th sequence in the chain. An out-of-range s returns all ones.
+ //
+ uint32 lengthOf(uint32 s) { return((s >= _idxLen) ? ~uint32ZERO : _idx[s]._len); };
+ uint32 IIDOf(uint32 s) { return((s >= _idxLen) ? ~uint32ZERO : _idx[s]._iid); };
+ uint64 startOf(uint32 s) { return((s >= _idxLen) ? ~uint64ZERO : _idx[s]._bgn); };
+
+ // For a chain position p, returns the s (above) for that position.
+ // tradeSpaceForTime() builds a table with one entry per position so
+ // that this becomes a direct lookup.
+ //
+ uint32 sequenceNumberOfPosition(uint64 p);
+ void tradeSpaceForTime(void);
+
+private:
+ void fillBuffer(void);
+
+ seqFile *_file; // Backed by a seqFile.
+ char *_string; // Backed by a character string.
+
+ uint64 _bgn; // Begin/End position in chained sequence
+ uint64 _end;
+
+ uint32 _currentIdx; // index into _idx of the current sequence
+ uint32 _currentPos; // position in the current sequence
+ uint64 _streamPos; // position in the chained sequence
+
+ // Buffer for holding sequence from the seqFile.
+
+ uint32 _bufferMax; // Max size of the buffer
+ uint32 _bufferLen; // Actual size of the buffer
+ uint32 _bufferPos; // Position we are at in the buffer
+ uint32 _bufferSep; // How much of the buffer is separator
+ char *_buffer;
+
+ // Info about the raw sequences
+
+ uint32 _idxLen; // number of sequences; _idx has one more entry
+ seqStreamIndex *_idx;
+
+ uint32 *_seqNumOfPos; // optional table, see tradeSpaceForTime()
+
+ uint64 _lengthOfSequences; // sum of all sequence lengths
+
+ bool _eof;
+
+ char _separator;
+ uint32 _separatorLength;
+};
+
+
+
+#endif // SEQSTREAM_H
diff --git a/libseq/sffFile.C b/libseq/sffFile.C
new file mode 100644
index 0000000..e22b52a
--- /dev/null
+++ b/libseq/sffFile.C
@@ -0,0 +1,208 @@
+#include "sffFile.H"
+
+// Lots of this came from AS_GKP_sff.c
+
+
+
+// Construct an empty sffFile: clear() resets every member and no file
+// is opened.
+sffFile::sffFile() {
+ clear();
+}
+
+// Open the SFF file 'name', load its common header, and build an index
+// holding the file position, number of bases and name length of every
+// read.  The file is left positioned at the first read.
+//
+sffFile::sffFile(const char *name) {
+
+  clear();
+
+  // clear() zeroed all FILENAME_MAX bytes of _filename, so copying at
+  // most FILENAME_MAX-1 bytes always leaves the name terminated, and a
+  // too-long name can no longer overrun the buffer.
+  //
+  strncpy(_filename, name, FILENAME_MAX - 1);
+  _filename[FILENAME_MAX - 1] = 0;
+
+  _rb = new readBuffer(name);
+
+  // The fixed-size fields at the start of sffHeader occupy 31 bytes.
+  //
+  _rb->read(&_header, 31);
+
+  // If the magic number doesn't read correctly, the file is in the
+  // other byte order; swap every multi-byte field.
+  //
+  if (_header.magic_number != 0x2e736666) {
+    _header.swap_endianess           = 1;
+    _header.magic_number             = uint32Swap(_header.magic_number);
+    _header.index_offset             = uint64Swap(_header.index_offset);
+    _header.index_length             = uint32Swap(_header.index_length);
+    _header.number_of_reads          = uint32Swap(_header.number_of_reads);
+    _header.header_length            = uint16Swap(_header.header_length);
+    _header.key_length               = uint16Swap(_header.key_length);
+    _header.number_of_flows_per_read = uint16Swap(_header.number_of_flows_per_read);
+  }
+
+  assert(_header.magic_number == 0x2e736666);
+  assert(_header.number_of_flows_per_read < SFF_NUMBER_OF_FLOWS_MAX);
+  assert(_header.key_length < SFF_KEY_SEQUENCE_MAX);
+
+  _rb->read(_header.flow_chars,   sizeof(char) * _header.number_of_flows_per_read);
+  _rb->read(_header.key_sequence, sizeof(char) * _header.key_length);
+
+  _firstReadLocation = _header.header_length;
+
+  // The spec says the index might be here, however, all files I've
+  // seen have the index at the end of the file.
+  //
+  if ((_header.index_length > 0) && (_header.index_offset == _header.header_length))
+    _firstReadLocation += _header.index_length;
+
+  // Index
+  //
+  _index = new sffIndex [_header.number_of_reads];
+
+  // In the SFF format header_length includes padding to an eight-byte
+  // boundary, so the position just after the key sequence is not
+  // necessarily where the first read starts.  Go there explicitly.
+  //
+  _rb->seek(_firstReadLocation);
+
+  for (uint64 i=0; i<_header.number_of_reads; i++) {
+    uint64 pos = _rb->tell();
+
+    // The fixed-size fields at the start of sffRead occupy 16 bytes.
+    //
+    _rb->read(&_read, 16);
+
+    if (_header.swap_endianess) {
+      _read.read_header_length = uint16Swap(_read.read_header_length);
+      _read.name_length        = uint16Swap(_read.name_length);
+      _read.number_of_bases    = uint32Swap(_read.number_of_bases);
+    }
+
+    _index[i]._seqPos = pos;
+    _index[i]._seqLen = _read.number_of_bases;
+    _index[i]._namLen = _read.name_length;
+
+    // Size of the read data section: the flowgram values, then the
+    // flow index, the bases and the quality scores (one byte per base
+    // each).
+    //
+    uint64 dataLen = 0;
+
+    dataLen += sizeof(uint16) * _header.number_of_flows_per_read;
+    dataLen += sizeof(uint8)  * _read.number_of_bases;
+    dataLen += sizeof(char)   * _read.number_of_bases;
+    dataLen += sizeof(uint8)  * _read.number_of_bases;
+
+    pos += _read.read_header_length;
+    pos += dataLen;
+
+    // The data section is padded up to a multiple of eight bytes; the
+    // padding is what is missing to reach the next multiple, not the
+    // remainder itself.
+    //
+    if ((dataLen % 8) != 0)
+      pos += 8 - (dataLen % 8);
+
+    _rb->seek(pos);
+  }
+  //
+  // Index
+
+  _rb->seek(_firstReadLocation);
+
+  _numberOfSequences = _header.number_of_reads;
+}
+
+// Release the read index and the buffered reader.
+//
+sffFile::~sffFile() {
+  delete [] _index;
+  delete    _rb;
+}
+
+////////////////////////////////////////
+
+// Return a new sffFile if 'name' is a regular file that starts with
+// the SFF magic number (in either byte order); otherwise return 0L.
+//
+seqFile *
+sffFile::openFile(const char *name) {
+  struct stat  st;
+
+  // Open the file, return if it matches the SFF magic_number.
+  //
+  // Failures are detected from the return values of stat() and
+  // fopen(); errno is only meaningful after a call has reported
+  // failure, and may be left non-zero by a call that succeeded.
+
+  if (stat(name, &st) != 0)
+    return(0L);
+
+  // S_ISREG() compares the whole file-type field.  Testing the S_IFREG
+  // bit alone would also accept other types that share that bit.
+  //
+  if (S_ISREG(st.st_mode) == 0)
+    return(0L);
+
+  errno = 0;
+  FILE *F = fopen(name, "r");
+  if (F == 0L) {
+    fprintf(stderr, "sffFile::openFile()- failed to open '%s': %s\n", name, strerror(errno));
+    return(0L);
+  }
+
+  uint32  magic_number = 0;
+  safeRead(fileno(F), &magic_number, "sff magic_number", sizeof(uint32));
+
+  fclose(F);
+
+  if ((magic_number == 0x2e736666) || (uint32Swap(magic_number) == 0x2e736666))
+    return(new sffFile(name));
+
+  return(0L);
+}
+
+
+
+// Load read 'iid' (its header fields, name and bases) into _read.
+// Returns false if iid is not a valid read number.
+//
+// NOTE(review): h/hLen/hMax and s/sLen/sMax are never written; the
+// name and bases end up only in _read.name and _read.bases.  Callers
+// that expect the output parameters to be filled will not get data --
+// confirm the intended contract against the other seqFile
+// implementations.
+//
+bool
+sffFile::getSequence(uint32 iid,
+                     char *&h, uint32 &hLen, uint32 &hMax,
+                     char *&s, uint32 &sLen, uint32 &sMax) {
+
+  // _index has exactly number_of_reads entries, so the valid iids are
+  // zero through number_of_reads-1.
+  //
+  if (iid >= _header.number_of_reads)
+    return(false);
+
+  memset(&_read, 0, sizeof(sffRead));
+
+  _rb->seek(_index[iid]._seqPos);
+
+  // The fixed-size fields at the start of sffRead occupy 16 bytes.
+  //
+  _rb->read(&_read, 16);
+
+  if (_header.swap_endianess) {
+    _read.read_header_length = uint16Swap(_read.read_header_length);
+    _read.name_length        = uint16Swap(_read.name_length);
+    _read.number_of_bases    = uint32Swap(_read.number_of_bases);
+    _read.clip_quality_left  = uint16Swap(_read.clip_quality_left);
+    _read.clip_quality_right = uint16Swap(_read.clip_quality_right);
+    _read.clip_adapter_left  = uint16Swap(_read.clip_adapter_left);
+    _read.clip_adapter_right = uint16Swap(_read.clip_adapter_right);
+  }
+
+  // name_length is what bounds the write into _read.name below, so it
+  // is checked directly as well.
+  //
+  assert(_read.read_header_length < SFF_NAME_LENGTH_MAX);
+  assert(_read.name_length        < SFF_NAME_LENGTH_MAX);
+  assert(_read.number_of_bases    < SFF_NUMBER_OF_BASES_MAX);
+
+  _rb->read(_read.name, sizeof(char) * _read.name_length);
+  _read.name[_read.name_length] = 0;
+
+  // read_header_length is measured from the start of the read record
+  // (this is how the constructor's indexing loop uses it), so the
+  // bases are located relative to the record start, not relative to
+  // the current position just after the name.  Skip the read header,
+  // the flowgram values and the flow index.
+  //
+  uint64 pos = _index[iid]._seqPos;
+
+  pos += _read.read_header_length;
+  pos += sizeof(uint16) * _header.number_of_flows_per_read;
+  pos += sizeof(uint8)  * _read.number_of_bases;
+
+  _rb->seek(pos);
+
+  _rb->read(_read.bases, sizeof(char) * _read.number_of_bases);
+  _read.bases[_read.number_of_bases] = 0;
+
+  return(true);
+}
+
+
+
+
+
+// Retrieval of part of a read is not implemented for SFF files: every
+// call reports failure, whether or not iid names a valid read, and s
+// is never written.
+//
+bool
+sffFile::getSequence(uint32 iid,
+                     uint32 bgn, uint32 end, char *s) {
+  return(false);
+}
+
+
+
+// Reset every member to its empty state.  Nothing is released here;
+// _rb and _index are simply set to null.
+//
+void
+sffFile::clear(void) {
+
+  // No file is open and nothing is indexed.
+  //
+  _rb                = 0L;
+  _index             = 0L;
+  _numberOfSequences = 0;
+  _firstReadLocation = 0;
+  _readIID           = 0;
+
+  // No header or read is loaded.
+  //
+  memset(&_header, 0, sizeof(sffHeader));
+  memset(&_read,   0, sizeof(sffRead));
+
+  // No filename; the type is always "SFF".
+  //
+  memset(_filename, 0, FILENAME_MAX);
+  memset(_typename, 0, FILENAME_MAX);
+
+  strcpy(_typename, "SFF");
+}
diff --git a/libseq/sffFile.H b/libseq/sffFile.H
new file mode 100644
index 0000000..71e49b6
--- /dev/null
+++ b/libseq/sffFile.H
@@ -0,0 +1,104 @@
+#ifndef SFF_H
+#define SFF_H
+
+#include "util++.H"
+#include "bio++.H"
+
+#include "seqFile.H"
+
+// Capacities of the fixed-size arrays in sffHeader and sffRead.
+#define SFF_KEY_SEQUENCE_MAX 64
+
+#define SFF_NAME_LENGTH_MAX 256
+#define SFF_NUMBER_OF_FLOWS_MAX 512
+#define SFF_NUMBER_OF_BASES_MAX 2048 // The assembler itself cannot handle longer
+
+
+// The common header of an SFF file. sffFile's constructor reads the
+// first 31 bytes of the file directly into the fields from
+// magic_number through flowgram_format_code, then reads flow_chars and
+// key_sequence separately.
+//
+struct sffHeader {
+ // The next block is read in one swoop from the sff file. DO NOT MODIFY!
+ uint32 magic_number; // 0x2e736666 once in host byte order
+ char version[4];
+ uint64 index_offset;
+ uint32 index_length;
+ uint32 number_of_reads;
+ uint16 header_length;
+ uint16 key_length;
+ uint16 number_of_flows_per_read;
+ uint8 flowgram_format_code;
+
+ char flow_chars[SFF_NUMBER_OF_FLOWS_MAX]; // h->number_of_flows_per_read
+ char key_sequence[SFF_KEY_SEQUENCE_MAX]; // h->key_length
+
+ uint32 swap_endianess; // not in the file; set to 1 when fields were byte swapped
+};
+
+
+// One read of an SFF file. The first 16 bytes of the read record are
+// read directly into the fields from read_header_length through
+// clip_adapter_right; the name and the bases are read separately.
+//
+struct sffRead {
+ // The next block is read in one swoop from the sff file. DO NOT MODIFY!
+ uint16 read_header_length;
+ uint16 name_length;
+ uint32 number_of_bases;
+ uint16 clip_quality_left;
+ uint16 clip_quality_right;
+ uint16 clip_adapter_left;
+ uint16 clip_adapter_right;
+
+ char name[SFF_NAME_LENGTH_MAX]; // r->name_length
+
+ uint16 flowgram_values[SFF_NUMBER_OF_FLOWS_MAX]; // h->number_of_flows_per_read
+ uint8 flow_index_per_base[SFF_NUMBER_OF_BASES_MAX]; // r->number_of_bases
+ char bases[SFF_NUMBER_OF_BASES_MAX]; // r->number_of_bases
+ uint8 quality_scores[SFF_NUMBER_OF_BASES_MAX]; // r->number_of_bases
+
+ char quality[SFF_NUMBER_OF_BASES_MAX]; // quality_scores converted to CA-format qv
+};
+
+// Index entry for one read, filled in by sffFile's constructor.
+struct sffIndex {
+ uint64 _seqPos; // file position of the start of the read record
+ uint32 _seqLen; // number_of_bases of the read
+ uint32 _namLen; // name_length of the read
+};
+
+
+// A seqFile backed by an SFF file. The constructors are protected and
+// seqFactory is a friend, so instances are presumably created only
+// through seqFactory -- confirm against seqFactory.
+//
+class sffFile : public seqFile {
+protected:
+ sffFile(const char *filename);
+ sffFile();
+
+public:
+ ~sffFile();
+
+protected:
+ seqFile *openFile(const char *name);
+
+public:
+ // Lookup by name is not supported; this asserts.
+ uint32 find(const char *sequencename) {
+ assert(0);
+ return(0);
+ };
+
+ // No range check is done on iid here.
+ uint32 getSequenceLength(uint32 iid) { return(_index[iid]._seqLen); };
+
+ bool getSequence(uint32 iid,
+ char *&h, uint32 &hLen, uint32 &hMax,
+ char *&s, uint32 &sLen, uint32 &sMax);
+ bool getSequence(uint32 iid,
+ uint32 bgn, uint32 end, char *s);
+
+private:
+ void clear(void);
+
+ readBuffer *_rb;
+
+ sffHeader _header; // common header of the file
+ sffRead _read; // the most recently loaded read
+
+ sffIndex *_index; // one entry per read
+
+ uint64 _firstReadLocation; // file position of the first read
+ uint64 _readIID;
+
+ friend class seqFactory;
+};
+
+
+#endif // SFF_H
diff --git a/libseq/test-correctSequence.H b/libseq/test-correctSequence.H
new file mode 100644
index 0000000..d456502
--- /dev/null
+++ b/libseq/test-correctSequence.H
@@ -0,0 +1,151 @@
+#ifndef TEST_CORRECTSEQUENCE_H
+#define TEST_CORRECTSEQUENCE_H
+
+//#define WITH_WHITESPACE
+
+// One randomly generated sequence, kept in memory as the known-correct
+// answer for the tests.
+//
+struct correctSequence_t {
+ char header[256]; // name of the sequence, without the leading '>'
+ uint32 headerLength; // strlen(header)
+ char *sequence; // the bases, NUL terminated
+ uint32 sequenceLength; // number of bases
+};
+
+
+// Filled in by generateCorrectSequence(): the sequences, and the
+// random number generator state used to make them.
+correctSequence_t *correctSequence = 0L;
+mt_s *mtctx = 0L;
+
+// Filled in by generateChainedAnswer(): for every letter of the
+// chained sequence (separators included), the letter itself, its
+// position in its sequence, the sequence it belongs to, and its
+// position in the separator-free stream. Separator letters hold all
+// ones in the three position arrays.
+char *chainSeq;
+uint32 *chainSeqPos;
+uint32 *chainSeqIID;
+uint64 *chainStrPos;
+
+
+// Generate numSeq random ACGT sequences, keep them in correctSequence[]
+// and write them to 'test-correctSequence.fasta'.
+//
+//   minLen -- shortest sequence length generated
+//   maxLen -- lengths are drawn from [minLen, maxLen); if maxLen is not
+//             larger than minLen, every sequence has length minLen
+//   numSeq -- number of sequences to generate
+//
+// Exits if the fasta file cannot be created.
+//
+void
+generateCorrectSequence(uint32 minLen, uint32 maxLen, uint32 numSeq) {
+  char      bases[4] = {'A', 'C', 'G', 'T'};
+
+  uint32    n = numSeq;
+  uint32    s = minLen;
+  uint32    l = maxLen;
+
+  // Width of the range of lengths.  It is never allowed to be zero (a
+  // modulo by zero) or to wrap around when maxLen <= minLen.
+  //
+  uint32    span = (l > s) ? (l - s) : 1;
+
+  uint32    seed = (uint32)(getTime() * 1000);
+
+  fprintf(stderr, "generateCorrectSequence()-- Using seed "uint32FMT"\n", seed);
+  fprintf(stderr, "generateCorrectSequence()-- Generating "uint32FMT" sequences of length "uint32FMT" to "uint32FMT"\n", numSeq, minLen, maxLen);
+
+  correctSequence = new correctSequence_t [n];
+  mtctx           = mtInit(seed);
+
+  FILE *F = fopen("test-correctSequence.fasta", "w");
+
+  // Every test reads this file; there is no point continuing (and the
+  // writes below would crash) if it cannot be created.
+  //
+  if (F == 0L) {
+    fprintf(stderr, "generateCorrectSequence()-- Failed to open 'test-correctSequence.fasta' for writing.\n");
+    exit(1);
+  }
+
+  for (uint32 i=0; i<n; i++) {
+    uint32 j = s + (mtRandom32(mtctx) % span);
+    uint32 p = 0;
+
+    sprintf(correctSequence[i].header, "sequence%d", i);
+
+    correctSequence[i].headerLength   = strlen(correctSequence[i].header);
+    correctSequence[i].sequence       = new char [j+1];
+    correctSequence[i].sequenceLength = j;
+
+    while (p < j)
+      correctSequence[i].sequence[p++] = bases[mtRandom32(mtctx) & 0x3];
+
+    correctSequence[i].sequence[p] = 0;
+
+    // Spend lots of pain adding whitespace to the file.
+
+#ifdef WITH_WHITESPACE
+    for (uint32 r=mtRandom32(mtctx) % 4; r--; )
+      fprintf(F, "\n");
+
+    fprintf(F, ">%s\n", correctSequence[i].header);
+
+    for (uint32 r=mtRandom32(mtctx) % 4; r--; )
+      fprintf(F, "\n");
+
+    for (uint32 p=0; p<correctSequence[i].sequenceLength; p++) {
+      fprintf(F, "%c", correctSequence[i].sequence[p]);
+      if ((mtRandom32(mtctx) % 100) == 0)
+        for (uint32 r=mtRandom32(mtctx) % 4; r--; )
+          fprintf(F, "\n");
+    }
+#else
+    fprintf(F, ">%s\n", correctSequence[i].header);
+    fprintf(F, "%s\n",  correctSequence[i].sequence);
+#endif
+
+  }
+
+  // Up to three blank lines at the end of the file.
+
+  for (uint32 r=mtRandom32(mtctx) % 4; r--; )
+    fprintf(F, "\n");
+
+  fclose(F);
+}
+
+
+// Build the expected result of chaining the first numSeq correct
+// sequences together, with sepLen copies of sep between consecutive
+// sequences.  For every letter of the chain the arrays record the
+// letter, its position within its sequence, the sequence number, and
+// its position counting sequence letters only; separator letters get
+// all ones for those three positions.  Arrays left over from an
+// earlier call are released first.
+//
+void
+generateChainedAnswer(uint32 numSeq, char sep, uint32 sepLen) {
+
+  // Length of the chain: every sequence is followed by a separator,
+  // except for the last.
+  //
+  uint32  total = 0;
+
+  for (uint32 i=0; i<numSeq; i++)
+    total += correctSequence[i].sequenceLength + sepLen;
+
+  total -= sepLen;
+
+  delete [] chainSeq;
+  delete [] chainSeqPos;
+  delete [] chainSeqIID;
+  delete [] chainStrPos;
+
+  chainSeq    = new char   [total + 1];
+  chainSeqPos = new uint32 [total];
+  chainSeqIID = new uint32 [total];
+  chainStrPos = new uint64 [total];
+
+  uint32  out  = 0;  // next slot to fill in the chain
+  uint64  acgt = 0;  // sequence letters emitted so far
+
+  for (uint32 sid=0; sid<numSeq; sid++) {
+    const char  *src    = correctSequence[sid].sequence;
+    uint32       srcLen = correctSequence[sid].sequenceLength;
+
+    for (uint32 b=0; b<srcLen; b++) {
+      chainSeq[out]    = src[b];
+      chainSeqPos[out] = b;
+      chainSeqIID[out] = sid;
+      chainStrPos[out] = acgt;
+
+      out++;
+      acgt++;
+    }
+
+    // No separator after the last sequence.
+    if (sid + 1 >= numSeq)
+      continue;
+
+    for (uint32 k=0; k<sepLen; k++) {
+      chainSeq[out]    = sep;
+      chainSeqPos[out] = ~uint32ZERO;
+      chainSeqIID[out] = ~uint32ZERO;
+      chainStrPos[out] = ~uint64ZERO;
+
+      out++;
+    }
+  }
+
+  assert(out == total);
+
+  // Used to get the length of the string.
+  chainSeq[total] = 0;
+}
+
+
+// Remove the generated fasta file and its index, and release the
+// sequences and the chained answer.  numSeq must be the number of
+// sequences that generateCorrectSequence() created.
+//
+void
+removeCorrectSequence(uint32 numSeq) {
+
+  unlink("test-correctSequence.fasta");
+  unlink("test-correctSequence.fastaidx");
+
+  for (uint32 i=0; i<numSeq; i++)
+    delete [] correctSequence[i].sequence;
+
+  delete [] correctSequence;
+
+  delete [] chainSeq;
+  delete [] chainSeqPos;
+  delete [] chainSeqIID;
+  delete [] chainStrPos;
+
+  // generateChainedAnswer() deletes whatever these globals point at
+  // before reallocating them; leaving them dangling would make a later
+  // call free the same memory twice.
+  //
+  correctSequence = 0L;
+
+  chainSeq    = 0L;
+  chainSeqPos = 0L;
+  chainSeqIID = 0L;
+  chainStrPos = 0L;
+}
+
+
+#endif // TEST_CORRECTSEQUENCE_H
diff --git a/libseq/test-merStream.C b/libseq/test-merStream.C
new file mode 100644
index 0000000..bc46639
--- /dev/null
+++ b/libseq/test-merStream.C
@@ -0,0 +1,284 @@
+#include "util.h"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+#include "test-correctSequence.H"
+
+// Count a failure in the enclosing function's 'err' counter, then trip
+// assert(0).
+#define FAIL() { err++; assert(0); }
+
+#warning HOW DO WE TEST IF WE GET ALL THE MERS?
+
+
+// Check that MS returns exactly the mers of 'seq', in order, each of
+// merSize letters, and that the i-th mer is reported at position SP[i]
+// (both in the sequence and in the stream) of sequence zero.  Any
+// disagreement trips an assert; the returned error count is always
+// zero.
+//
+uint32
+testMerStreamSimple(merStream *MS, uint32 merSize, char *seq, uint32 *SP) {
+  const bool  verbose = true;
+  uint32      err     = 0;
+  char        testmer[32];
+
+  if (verbose)
+    fprintf(stdout, "testMSsimple() begins.\n");
+
+  // One pass for every mer that fits in the input.
+
+  for (uint32 pos=0; seq[pos + merSize - 1] != 0; pos++) {
+    bool gotMer = MS->nextMer();
+
+    MS->theFMer().merToString(testmer);
+
+    bool sameMer = (strncmp(testmer, seq + pos, merSize) == 0);
+
+    if (verbose) {
+      fprintf(stdout, "MS pos="uint32FMT" posInSeq="uint64FMT" posInStr="uint64FMT" seqNum="uint64FMT"\n",
+              pos,
+              MS->thePositionInSequence(),
+              MS->thePositionInStream(),
+              MS->theSequenceNumber());
+      if (sameMer == false)
+        fprintf(stdout, "MS pos="uint32FMT" failed '%s' != '%s'.\n", pos, testmer, seq + pos);
+    }
+
+    assert(gotMer == true);
+    assert(MS->thePositionInSequence() == SP[pos]);
+    assert(MS->thePositionInStream()   == SP[pos]);
+    assert(MS->theSequenceNumber()     == 0);
+    assert(sameMer == true);
+  }
+
+  // The stream must now be exhausted.
+
+  bool extraMer = MS->nextMer();
+  assert(extraMer == false);
+
+  return(err);
+}
+
+
+
+// Read every mer remaining in MS and check that the forward, reverse
+// and canonical mers are consistent with each other and, for mers
+// whose size equals their span, with the correct sequence and the
+// chained answer. Also checks that the stream position of each such
+// mer lies in [beg, end]. Failures go through FAIL().
+//
+uint32
+testMerStreamOperation(merStream *MS, uint32 beg, uint32 end, uint32 sepLen) {
+ uint32 err = 0;
+
+ char fmerstr[256];
+ char rmerstr[256];
+ char cmerstr[256];
+ char tmerstr[256];
+
+ while (MS->nextMer()) {
+ MS->theFMer().merToString(fmerstr);
+ MS->theRMer().merToString(rmerstr);
+ MS->theCMer().merToString(cmerstr);
+
+ // The canonical mer must be one of the forward or reverse mers.
+ if ((strcmp(fmerstr, cmerstr) != 0) && ((strcmp(rmerstr, cmerstr) != 0))) {
+ fprintf(stderr, "mer strings disagree; F:%s R:%s C:%s\n", fmerstr, rmerstr, cmerstr);
+ FAIL();
+ }
+
+ // Reverse-complementing the reverse mer must give the forward mer.
+ reverseComplementSequence(rmerstr, strlen(rmerstr));
+
+ if (strcmp(fmerstr, rmerstr) != 0) {
+ fprintf(stderr, "mer strings disagree after reverse; F:%s R:%s\n", fmerstr, rmerstr);
+ FAIL();
+ }
+
+ uint32 pseq = MS->thePositionInSequence();
+ uint32 pstr = MS->thePositionInStream();
+ uint32 piid = MS->theSequenceNumber();
+
+ uint32 mersize = MS->theFMer().getMerSize();
+ uint32 merspan = MS->theFMer().getMerSpan();
+
+#if 0
+ if (beg > 10) {
+ uint32 pp = pstr + piid * sepLen - 10;
+ uint32 xx = 0;
+
+ fprintf(stderr, "beg="uint32FMT" pstr="uint32FMT" '", beg, pstr);
+
+ for (xx=0; xx<10; xx++, pp++)
+ fprintf(stderr, "%c", chainSeq[pp]);
+ fprintf(stderr, ":");
+ for (xx=0; xx<merspan; xx++, pp++)
+ fprintf(stderr, "%c", chainSeq[pp]);
+ fprintf(stderr, ":");
+ for (xx=0; xx<10; xx++, pp++)
+ fprintf(stderr, "%c", chainSeq[pp]);
+
+ fprintf(stderr, "'\n");
+ }
+#endif
+
+ // Only mers whose size equals their span are compared letter for
+ // letter against the source text.
+ if (mersize == merspan) {
+ strncpy(tmerstr, correctSequence[piid].sequence + pseq, mersize);
+ tmerstr[mersize] = 0;
+
+ if (strcmp(fmerstr, tmerstr) != 0) {
+ fprintf(stderr, "mer string doesn't agree with sequence; '%s' vs '%s'.\n", fmerstr, tmerstr);
+ FAIL();
+ }
+
+ if ((pstr < beg) || (end < pstr)) {
+ fprintf(stderr, "mer stream position out of range; at "uint32FMT", range "uint32FMT"-"uint32FMT"\n",
+ pstr, beg, end);
+ FAIL();
+ }
+
+ // The pstr returned above is the ACGT position, not the
+ // chainSeq position we expect. Trusting that the IID is
+ // correct (if not, the previous strcmp() would have failed) we
+ // can add in the missing separators to get a chainSeq
+ // position.
+
+ strncpy(tmerstr, chainSeq + pstr + piid * sepLen, mersize);
+ tmerstr[mersize] = 0;
+
+ if (strcmp(fmerstr, tmerstr) != 0) {
+ fprintf(stderr, "mer string doesn't agree with stream; '%s' vs '%s'.\n", fmerstr, tmerstr);
+ FAIL();
+ }
+ }
+ }
+
+ return(err);
+}
+
+
+
+
+// Run the merStream checks using mer builder KB over the first numSeq
+// correct sequences.  With more than one sequence the stream reads the
+// generated fasta file, separating sequences with sepLen copies of
+// sep; with one sequence it reads the in-core string.  The whole
+// stream is tested twice (with a rewind between), then 500 random
+// ranges of up to 10000 positions are each tested twice.  Returns the
+// number of errors found.
+//
+uint32
+testMerStream(kMerBuilder *KB, uint32 numSeq, char sep, uint32 sepLen) {
+
+  generateChainedAnswer(numSeq, sep, sepLen);
+
+  seqStream  *ST = 0L;
+
+  if (numSeq <= 1) {
+    ST = new seqStream(correctSequence[0].sequence, correctSequence[0].sequenceLength);
+  } else {
+    ST = new seqStream("test-correctSequence.fasta");
+    ST->setSeparator(sep, sepLen);
+  }
+
+  merStream  *MS = new merStream(KB, ST, true, true);
+
+  // End of the last sequence, in stream coordinates.
+  uint32      maxLen = ST->startOf(numSeq-1) + ST->lengthOf(numSeq-1);
+  uint32      err    = 0;
+
+  // Whole thing, rewind, whole thing
+
+  fprintf(stderr, "whole thing.\n");
+
+  err += testMerStreamOperation(MS, 0, maxLen, sepLen);
+  MS->rewind();
+  err += testMerStreamOperation(MS, 0, maxLen, sepLen);
+
+  // Random subsets - we're not terribly interested in streaming,
+  // just getting the start/end correct.
+
+  fprintf(stderr, "subsets.\n");
+
+  for (uint32 iter=0; iter<500; iter++) {
+    uint32 beg = mtRandom32(mtctx) % maxLen;
+    uint32 end = beg + 10000;
+
+    // Never run past the end of the stream.
+    if ((end < maxLen) == false)
+      end = maxLen;
+
+    MS->setBaseRange(beg, end);
+
+    err += testMerStreamOperation(MS, beg, end, sepLen);
+    MS->rewind();
+    err += testMerStreamOperation(MS, beg, end, sepLen);
+  }
+
+  delete MS;
+
+  return(err);
+}
+
+
+
+
+
+// Test driver: two small fixed-string checks first, then the
+// randomized checks against generated sequences. The exit status is 1
+// if any error was counted, 0 otherwise.
+//
+int
+main(int argc, char **argv) {
+ uint32 minLen = 1000;
+ uint32 maxLen = 200000;
+ uint32 numSeq = 1000;
+ uint32 err = 0;
+
+ // Very simple merStream test
+
+ {
+ fprintf(stdout, "merStream(kMerBuilder(20), ...)\n");
+
+ merStream *MS = new merStream(new kMerBuilder(20),
+ new seqStream("GGGTCAACTCCGCCCGCACTCTAGC", 25),
+ true, true);
+ // Expected position of each successive mer.
+ uint32 SP[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+
+ testMerStreamSimple(MS, 20, "GGGTCAACTCCGCCCGCACTCTAGC", SP);
+ MS->rewind();
+ testMerStreamSimple(MS, 20, "GGGTCAACTCCGCCCGCACTCTAGC", SP);
+ MS->rewind();
+ MS->rewind();
+ testMerStreamSimple(MS, 20, "GGGTCAACTCCGCCCGCACTCTAGC", SP);
+
+ delete MS;
+
+ fprintf(stdout, "merStream(kMerBuilder(20), ...) - PASSED\n");
+ }
+
+ {
+ fprintf(stdout, "merStream(kMerBuilder(20, 1), ...)\n");
+
+ merStream *MS = new merStream(new kMerBuilder(20, 1),
+ new seqStream("GGGAATTTTCAACTCCGCCCGCACTCTAGCCCAAA", 35),
+ true, true);
+ // The string checked against below is the input above with every
+ // run of a repeated letter reduced to one letter.
+ uint32 SP[10] = { 0, 3, 5, 9, 10, 12 };
+
+ testMerStreamSimple(MS, 20, "GATCACTCGCGCACTCTAGCA", SP);
+ MS->rewind();
+ testMerStreamSimple(MS, 20, "GATCACTCGCGCACTCTAGCA", SP);
+ MS->rewind();
+ MS->rewind();
+ testMerStreamSimple(MS, 20, "GATCACTCGCGCACTCTAGCA", SP);
+
+ delete MS;
+
+ fprintf(stdout, "merStream(kMerBuilder(20, 1), ...) - PASSED\n");
+ }
+
+ // Move on to harder tests
+
+ generateCorrectSequence(minLen, maxLen, numSeq);
+
+ // Tests seqStream(string, strlen) construction method
+
+ fprintf(stderr, "err += testMerStream(new kMerBuilder(20, 0, 0L), 1, '.', 1);\n");
+ err += testMerStream(new kMerBuilder(20, 0, 0L), 1, '.', 1);
+
+ fprintf(stderr, "err += testMerStream(new kMerBuilder(22, 1, 0L), 1, '.', 1);\n");
+ err += testMerStream(new kMerBuilder(22, 1, 0L), 1, '.', 1);
+
+ // Tests seqStream(filename) construction method
+
+ fprintf(stderr, "err += testMerStream(new kMerBuilder(20, 0, 0L), numSeq, '.', 1);\n");
+ err += testMerStream(new kMerBuilder(20, 0, 0L), numSeq, '.', 1);
+
+ fprintf(stderr, "err += testMerStream(new kMerBuilder(28, 0, 0L), numSeq, '.', 100);\n");
+ err += testMerStream(new kMerBuilder(28, 0, 0L), numSeq, '.', 100);
+
+ fprintf(stderr, "err += testMerStream(new kMerBuilder(24, 4, 0L), numSeq, '.', 100);\n");
+ err += testMerStream(new kMerBuilder(24, 4, 0L), numSeq, '.', 100);
+
+ // Delete the generated file and release the sequences.
+ removeCorrectSequence(numSeq);
+
+ if (err == 0)
+ fprintf(stderr, "Success!\n");
+
+ exit(err > 0);
+}
diff --git a/libseq/test-seqCache.C b/libseq/test-seqCache.C
new file mode 100644
index 0000000..3fbb8dd
--- /dev/null
+++ b/libseq/test-seqCache.C
@@ -0,0 +1,181 @@
+#include "util.h"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+#include "test-correctSequence.H"
+
+
+// Validate one in-core sequence against the reference table.  The entry
+// of correctSequence[] to compare with is chosen by S->getIID().  Every
+// mismatch is reported on stderr, prefixed with testID, and counted; the
+// count is returned.  A null S is reported and counted as one error.
+uint32
+testSeqVsCorrect(seqInCore *S, uint32 testID) {
+
+  if (!S) {
+    fprintf(stderr, "testID:"uint32FMT" - empty sequence\n", testID);
+    return(1);
+  }
+
+  uint32  iid        = S->getIID();
+  uint32  mismatches = 0;
+
+  // Header: text first, then the reported length.
+  if (0 != strcmp(S->header(), correctSequence[iid].header)) {
+    fprintf(stderr, "testID:"uint32FMT" - header differs '%s' vs '%s'\n", testID, S->header(), correctSequence[iid].header);
+    mismatches += 1;
+  }
+  if (S->headerLength() != correctSequence[iid].headerLength) {
+    fprintf(stderr, "testID:"uint32FMT" - header length differs "uint32FMT" vs "uint32FMT"\n", testID, S->headerLength(), correctSequence[iid].headerLength);
+    mismatches += 1;
+  }
+
+  // Sequence: text, then the NUL-terminated length, then the reported length.
+  if (0 != strcmp(S->sequence(), correctSequence[iid].sequence)) {
+    fprintf(stderr, "testID:"uint32FMT" - sequence differs\n", testID);
+    mismatches += 1;
+  }
+  if (strlen(S->sequence()) != correctSequence[iid].sequenceLength) {
+    fprintf(stderr, "testID:"uint32FMT" - sequence length differs strlen "uint32FMT" vs "uint32FMT"\n", testID, (uint32)strlen(S->sequence()), correctSequence[iid].sequenceLength);
+    mismatches += 1;
+  }
+  if (S->sequenceLength() != correctSequence[iid].sequenceLength) {
+    fprintf(stderr, "testID:"uint32FMT" - sequence length differs "uint32FMT" vs "uint32FMT"\n", testID, S->sequenceLength(), correctSequence[iid].sequenceLength);
+    mismatches += 1;
+  }
+
+  return(mismatches);
+}
+
+
+// Test 1: name -> IID lookup.  For every index sid below
+// SC->getNumberOfSequences(), getSequenceIID() on the reference header
+// correctSequence[sid].header must return sid.  Each miss is logged to
+// stderr; the elapsed time is logged at the end.
+// Returns the number of failed lookups.
+uint32
+testSeqCacheIDLookups(seqCache *SC) {
+  uint32  err    = 0;
+  uint32  numSeq = SC->getNumberOfSequences();
+  double  start  = getTime();
+
+  // 1 - getSequenceIID()
+  fprintf(stderr, "1 - getSequenceIID()\n");
+  for (uint32 sid=0; sid<numSeq; sid++) {
+    if (sid != SC->getSequenceIID(correctSequence[sid].header)) {
+      // Tag the failure with this test's own number so it matches the banner above.
+      fprintf(stderr, "1 - failed to find name '%s'\n", correctSequence[sid].header);
+      err++;
+    }
+  }
+
+  fprintf(stderr, "Test took %f seconds.\n", getTime() - start);
+
+  return(err);
+}
+
+
+// Exercises a seqCache against the reference sequences in correctSequence[]:
+//   0 - getSequenceLength(sid) for every sequence index
+//   2 - streaming with getSequenceInCore() until it returns null
+//   3 - getSequenceInCore(sid) for sid = 0 .. numSeq-1
+//   4 - getSequenceInCore(sid) for 4*numSeq randomly chosen sid
+// Every sequence returned is checked with testSeqVsCorrect() and deleted.
+// Returns the total number of mismatches; the elapsed time is logged.
+uint32
+testSeqCache(seqCache *SC) {
+  uint32     err    = 0;
+  uint32     numSeq = SC->getNumberOfSequences();
+  seqInCore *S      = 0L;
+  double     start  = getTime();
+
+  // 0 - getSequenceLength()
+  fprintf(stderr, "0 - getSequenceLength()\n");
+  for (uint32 sid=0; sid<numSeq; sid++)
+    if (SC->getSequenceLength(sid) != correctSequence[sid].sequenceLength) {
+      // Tag the failure with this test's own number so it matches the banner above.
+      fprintf(stderr, "0 - length differs.\n");
+      err++;
+    }
+
+  // 2 - stream with getSequenceInCore()
+  fprintf(stderr, "2 - stream with getSequenceInCore()\n");
+  S = SC->getSequenceInCore();
+  while (S != 0L) {
+    err += testSeqVsCorrect(S, 2);
+    delete S;
+    S = SC->getSequenceInCore();
+  }
+
+  // 3 - iterate with getSequenceInCore(sid++)
+  fprintf(stderr, "3 - iterate with getSequenceInCore(sid++)\n");
+  for (uint32 sid=0; sid<numSeq; sid++) {
+    S = SC->getSequenceInCore(sid);
+    err += testSeqVsCorrect(S, 3);
+    delete S;
+  }
+
+  // 4 - random with getSequenceInCore(sid)
+  fprintf(stderr, "4 - random with getSequenceInCore(sid)\n");
+  for (uint32 cnt=0; cnt<4*numSeq; cnt++) {
+    uint32 sid = mtRandom32(mtctx) % numSeq;
+    S = SC->getSequenceInCore(sid);
+    err += testSeqVsCorrect(S, 4);
+    delete S;
+  }
+
+  fprintf(stderr, "Test took %f seconds.\n", getTime() - start);
+
+  return(err);
+}
+
+
+// Driver for the seqCache tests.  Generates the reference sequences, opens
+// test-correctSequence.fasta with a range of cache sizes and runs
+// testSeqCache() on each, repeats once more after loadAllSequences(), then
+// removes the reference data.  Exit status is 0 only if nothing failed.
+int
+main(int argc, char **argv) {
+  const uint32  shortest = 100;
+  const uint32  longest  = 2000;
+  const uint32  numSeq   = 100000;
+
+  // Cache sizes handed to the seqCache constructor, in the order tested.
+  const uint32  cacheSize[]  = { 0, 1, 2, 4, 8, 32, 200, 1000000 };
+  const uint32  numCacheSize = sizeof(cacheSize) / sizeof(cacheSize[0]);
+
+  seqCache     *cache    = 0L;
+  uint32        failures = 0;
+
+  generateCorrectSequence(shortest, longest, numSeq);
+
+  // The ID lookup test is disabled; the cache is still built and destroyed.
+  fprintf(stderr, "seqCache(file, 0, true) (ID lookups)\n");
+  cache = new seqCache("test-correctSequence.fasta", 0, true);
+  //failures += testSeqCacheIDLookups(cache);
+  delete cache;
+
+  for (uint32 i=0; i<numCacheSize; i++) {
+    fprintf(stderr, "seqCache(file, "uint32FMT", true)\n", cacheSize[i]);
+    cache = new seqCache("test-correctSequence.fasta", cacheSize[i], true);
+    failures += testSeqCache(cache);
+    delete cache;
+  }
+
+  fprintf(stderr, "seqCache(file, 0, true) -- loadAllSequence\n");
+  cache = new seqCache("test-correctSequence.fasta", 0, true);
+  cache->loadAllSequences();
+  failures += testSeqCache(cache);
+  delete cache;
+
+  removeCorrectSequence(numSeq);
+
+  if (failures == 0)
+    fprintf(stderr, "Success!\n");
+
+  exit(failures > 0);
+}
diff --git a/libseq/test-seqStream.C b/libseq/test-seqStream.C
new file mode 100644
index 0000000..c216a2c
--- /dev/null
+++ b/libseq/test-seqStream.C
@@ -0,0 +1,287 @@
+#include "util.h"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+#include "test-correctSequence.H"
+
+#define FAIL() { err++; assert(0); }
+// testIndexing -- checks seqStream's index (lengthOf, startOf, IIDOf, sequenceNumberOfPosition) and that
+// exactly sepLen copies of sep follow each sequence but the last.  Returns err; FAIL() also asserts on each error.
+uint32
+testIndexing(uint32 numSeq, char sep, uint32 sepLen) {
+ uint32 err = 0;
+ seqStream *ST = 0L;
+
+ fprintf(stderr, "testIndexing()-- numSeq="uint32FMT" sep=%c sepLen="uint32FMT"\n", numSeq, sep, sepLen);
+
+ generateChainedAnswer(numSeq, sep, sepLen);
+ // More than one sequence uses the fasta file; a single sequence uses the in-memory constructor.
+ if (numSeq > 1) {
+ ST = new seqStream("test-correctSequence.fasta");
+ ST->setSeparator(sep, sepLen);
+ } else {
+ ST = new seqStream(correctSequence[0].sequence, correctSequence[0].sequenceLength);
+ }
+
+ uint32 maxLen = ST->startOf(numSeq-1) + ST->lengthOf(numSeq-1); // start of the last sequence plus its length
+
+ // Basic checks on the reverse lookup - this is state independent;
+ // it changes only based on the separator length. In other words,
+ // there is no need to check this while iterating through the
+ // seqStream.
+
+ fprintf(stderr, "IGNORE THIS WARNING: "); // NOTE(review): presumably the out-of-range lookup below prints its own warning -- confirm
+ if (ST->sequenceNumberOfPosition(maxLen) != ~uint32ZERO) {
+ fprintf(stderr, "maxLen too small.\n");
+ FAIL();
+ }
+ if (ST->sequenceNumberOfPosition(maxLen - 1) == ~uint32ZERO) {
+ fprintf(stderr, "maxLen too big.\n");
+ FAIL();
+ }
+
+ // Check all lookups - lengthOf() and IIDOf() are implicitly
+ // checked by the operation of seqStream (get() mostly). startOf()
+ // isn't, but inserting errors in setRange() led to
+ // infinite-looking loops.
+
+ uint64 pos = 0; // running position, advanced once per base of every sequence
+ uint64 sta = 0; // expected startOf() of the current sequence
+
+ for (uint32 sid=0; sid<numSeq; sid++) {
+ if (ST->lengthOf(sid) != correctSequence[sid].sequenceLength) {
+ fprintf(stderr, "lengthOf "uint32FMT" returned "uint32FMT", not correct "uint32FMT"\n",
+ sid, ST->lengthOf(sid), correctSequence[sid].sequenceLength);
+ FAIL();
+ }
+ if (ST->startOf(sid) != sta) {
+ fprintf(stderr, "startOf "uint32FMT" returned "uint64FMT", not correct "uint64FMT"\n",
+ sid, ST->startOf(sid), sta);
+ FAIL();
+ }
+ if (ST->IIDOf(sid) != sid) {
+ fprintf(stderr, "IIDOf "uint32FMT" returned "uint32FMT", not correct "uint32FMT"\n",
+ sid, ST->IIDOf(sid), sid);
+ FAIL();
+ }
+
+ sta += correctSequence[sid].sequenceLength; // expected starts accumulate sequence lengths only, no separator
+
+ for (uint32 ppp=0; ppp<correctSequence[sid].sequenceLength; ppp++, pos++) {
+ if (ST->sequenceNumberOfPosition(pos) != sid) {
+ fprintf(stderr, "sequenceNumberOfPosition "uint64FMT" returned "uint32FMT", not correct "uint32FMT".\n",
+ pos, ST->sequenceNumberOfPosition(pos), sid);
+ FAIL();
+ }
+ }
+ }
+ if (pos != maxLen) {
+ fprintf(stderr, "maxLen wrong.\n");
+ FAIL();
+ }
+
+ // Check the separator. Seek to a spot right before one, and count
+ // that we have the correct length. More rigorously tested in
+ // testChaining().
+
+ for (uint32 sid=0; sid<numSeq-1; sid++) {
+ ST->setRange(ST->startOf(sid) + ST->lengthOf(sid)-1, ~uint64ZERO); // begin at the last base of sequence sid
+ ST->get(); // consume that base; the next sepLen characters must all be sep
+ for (uint32 x=0; x<sepLen; x++) {
+ char s = ST->get();
+ if (s != sep) {
+ fprintf(stderr, "wrong separator at sep "uint32FMT" got %d expected %d\n", x, s, sep);
+ FAIL();
+ }
+ }
+ if (ST->get() == sep) {
+ fprintf(stderr, "too many separators!\n");
+ FAIL();
+ }
+ }
+
+ delete ST;
+
+ return(err);
+}
+
+
+
+// Drains ST until eof().  Before each get() the stream's seqPos, seqIID
+// and strPos are sampled; every non-zero character returned is compared
+// with chainSeq[sib], and for non-separator characters the sampled
+// positions are compared with chainSeqPos[sib], chainSeqIID[sib] and
+// chainStrPos[sib].  sib advances once per non-zero character and must
+// equal sie when the stream is exhausted.
+//
+//   ST   - stream to read; left at eof
+//   sib  - index into the chain* arrays of the first expected character
+//   sie  - index expected after the last character
+//   sep  - separator character (positions are not checked on separators)
+//
+// Returns the error count.  FAIL() also asserts on each error.
+uint32
+testSeqStream(seqStream *ST, uint32 sib, uint32 sie, char sep) {
+  uint32 err = 0;
+
+  while (ST->eof() == false) {
+    uint32 sp = ST->seqPos();
+    uint32 si = ST->seqIID();
+    uint64 st = ST->strPos();
+    char   ch = ST->get();
+
+    if (ch != 0) {
+      // In each message sp/si/st/ch are what the stream produced; the
+      // trailing value is what the chained answer says it should be.
+      if (ch != chainSeq[sib]) {
+        fprintf(stderr, "sp="uint32FMT" si="uint32FMT" st="uint64FMT" ch=%c -- letter wrong, expected '%c'\n", sp, si, st, ch, chainSeq[sib]);
+        FAIL();
+      }
+      if ((ch != sep) && (sp != chainSeqPos[sib])) {
+        fprintf(stderr, "sp="uint32FMT" si="uint32FMT" st="uint64FMT" ch=%c -- seqPos wrong, expected "uint32FMT"\n", sp, si, st, ch, chainSeqPos[sib]);
+        FAIL();
+      }
+      if ((ch != sep) && (si != chainSeqIID[sib])) {
+        fprintf(stderr, "sp="uint32FMT" si="uint32FMT" st="uint64FMT" ch=%c -- seqIID wrong, expected "uint32FMT"\n", sp, si, st, ch, chainSeqIID[sib]);
+        FAIL();
+      }
+      if ((ch != sep) && (st != chainStrPos[sib])) {
+        fprintf(stderr, "sp="uint32FMT" si="uint32FMT" st="uint64FMT" ch=%c -- strPos wrong, expected "uint64FMT"\n", sp, si, st, ch, chainStrPos[sib]);
+        FAIL();
+      }
+
+      sib++;
+    }
+  }
+
+  if (sib != sie) {
+    fprintf(stderr, "iterated length wrong; sib="uint32FMT" sie="uint32FMT"\n", sib, sie);
+    FAIL();
+  }
+
+  return(err);
+}
+
+
+
+// Runs testSeqStream() over the whole chained answer (twice, the second
+// time after rewind()) and then over 500 random sub-ranges set with
+// setRange(), every other one preceded by a full read and a rewind().
+//
+//   numSeq  - number of sequences; 1 selects the in-memory
+//             seqStream(string, length) constructor, anything larger the
+//             file-backed constructor with the separator applied
+//   sep     - separator character
+//   sepLen  - number of separator characters between sequences
+//
+// Returns the number of errors reported by testSeqStream().  The stream
+// is deleted before returning.
+uint32
+testChaining(uint32 numSeq, char sep, uint32 sepLen) {
+  uint32     err = 0;
+  seqStream *ST  = 0L;
+
+  fprintf(stderr, "testChaining()-- numSeq="uint32FMT" sep=%c sepLen="uint32FMT"\n", numSeq, sep, sepLen);
+
+  generateChainedAnswer(numSeq, sep, sepLen);
+
+  if (numSeq > 1) {
+    ST = new seqStream("test-correctSequence.fasta");
+    ST->setSeparator(sep, sepLen);
+  } else {
+    ST = new seqStream(correctSequence[0].sequence, correctSequence[0].sequenceLength);
+  }
+
+  // Do a test on the whole thing.
+
+  {
+    uint32 sib = 0;
+    uint32 sie = strlen(chainSeq);
+
+    fprintf(stderr, "initial test with full range\n");
+    err += testSeqStream(ST, sib, sie, sep);
+
+    fprintf(stderr, "initial test with full range (rewind)\n");
+    ST->rewind();
+    err += testSeqStream(ST, sib, sie, sep);
+  }
+
+
+  // Set the range to random values, and check all the results.
+  // We've already verified the index works, so we're free to use
+  // that (but we currently don't).
+
+  uint32 maxLen = ST->startOf(numSeq-1) + ST->lengthOf(numSeq-1);
+
+  fprintf(stderr, "test on subranges\n");
+
+  for (uint32 iter=0; iter<500; iter++) {
+    uint32 beg = mtRandom32(mtctx) % maxLen;
+    uint32 end = mtRandom32(mtctx) % maxLen;
+    if (beg > end) {
+      uint32 t = end;
+      end = beg;
+      beg = t;
+    }
+
+    ST->setRange(beg, end);
+
+    // Compute the position in our stream for the ACGT based beg and
+    // end.  The quirk here is that our stream includes the
+    // separator.
+
+    uint32 sib = 0;  // chainSeq position
+    uint32 sie = 0;
+
+    for (uint32 ppp=0, sid=0; sid<numSeq; sid++) {
+      uint32 len = correctSequence[sid].sequenceLength;
+
+      if ((ppp <= beg) && (beg < ppp + len)) {
+        sib += beg - ppp;
+        break;
+      }
+
+      ppp += len;
+      sib += len + sepLen;
+    }
+
+    for (uint32 ppp=0, sid=0; sid<numSeq; sid++) {
+      uint32 len = correctSequence[sid].sequenceLength;
+
+      if ((ppp <= end) && (end < ppp + len)) {
+        sie += end - ppp;
+        break;
+      }
+
+      ppp += len;
+      sie += len + sepLen;
+    }
+
+    // Optionally do a rewind in the middle
+
+    if (iter % 2) {
+      //fprintf(stderr, "Random iter "uint32FMT" (with rewind)\n", iter);
+      while (ST->eof() == false)
+        ST->get();
+      ST->rewind();
+    } else {
+      //fprintf(stderr, "Random iter "uint32FMT"\n", iter);
+    }
+
+    // Accumulate the result; it used to be dropped, which left err at
+    // zero whenever assert() is compiled out.
+    err += testSeqStream(ST, sib, sie, sep);
+  }
+
+  // Release the stream, as testIndexing() does.
+  delete ST;
+
+  return(err);
+}
+
+
+
+// Driver for the seqStream tests.  Generates the reference sequences, runs
+// the indexing and chaining tests on a single in-memory sequence, then on
+// the fasta file with four separator settings, and removes the reference
+// data.  Exit status is 0 only if no test reported an error.
+int
+main(int argc, char **argv) {
+  const uint32  shortest = 100;
+  const uint32  longest  = 20000;
+  const uint32  numSeq   = 1000;
+
+  // Separator character and run length for each file-backed pass.
+  const char    sepChar[4] = { '.', ':', 'z', '-' };
+  const uint32  sepRun[4]  = { 1, 10, 100, 1000 };
+
+  uint32        failures = 0;
+
+  generateCorrectSequence(shortest, longest, numSeq);
+
+  // Tests seqStream(string, strlen) construction method
+
+  failures += testIndexing(1, '.', 1);
+  failures += testChaining(1, '.', 1);
+
+  // Tests seqStream(filename) construction method: all indexing passes
+  // first, then all chaining passes.
+
+  for (uint32 i=0; i<4; i++)
+    failures += testIndexing(numSeq, sepChar[i], sepRun[i]);
+
+  for (uint32 i=0; i<4; i++)
+    failures += testChaining(numSeq, sepChar[i], sepRun[i]);
+
+  removeCorrectSequence(numSeq);
+
+  if (failures == 0)
+    fprintf(stderr, "Success!\n");
+
+  exit(failures > 0);
+}
diff --git a/libseq/test/Makefile b/libseq/test/Makefile
new file mode 100644
index 0000000..bc03f4b
--- /dev/null
+++ b/libseq/test/Makefile
@@ -0,0 +1,23 @@
+# Builds test-merstream-speed and runs it against a fasta file generated by leaff.
+
+PROG = test-merstream-speed
+
+INCLUDE = -I.. -I../../libutil -I../../libbio -I../../libseq
+LIBS = -L.. -L../../libutil -L../../libbio -L../../libseq -lseq -lbio -lutil -lm
+OBJS =
+# CXX, CXXFLAGS_COMPILE and CXXLDFLAGS are used below; presumably set by this include -- confirm.
+include ../../Make.compilers
+
+all: $(PROG)
+ @echo Tests passed!
+# Compile, link, generate junk.fasta with leaff, read it once with cat, run the test, then delete junk*.
+test-merstream-speed: test-merstream-speed.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-merstream-speed.o test-merstream-speed.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-merstream-speed test-merstream-speed.o $(LIBS)
+ ../../leaff/leaff -G 10000 1000 10000 > junk.fasta
+ cat junk.fasta > /dev/null
+ ./test-merstream-speed junk.fasta
+ rm -f junk*
+
+clean:
+ rm -f $(PROG) *.o *junk*
diff --git a/libseq/test/test-merstream-speed.C b/libseq/test/test-merstream-speed.C
new file mode 100644
index 0000000..7eac3d1
--- /dev/null
+++ b/libseq/test/test-merstream-speed.C
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "bio++.H"
+
+#include "seqCache.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+// Reads the fasta file named by argv[1] three ways -- raw fgetc(), a
+// seqStream, and a merStream built from kMerBuilder(20) over a seqStream --
+// ticking a speedCounter once per item read so the three can be compared.
+// Exits 1 on bad usage or if the file cannot be opened, 0 otherwise.
+int
+main(int argc, char **argv) {
+  speedCounter *C = 0L;
+  FILE         *F = 0L;
+  seqStream    *S = 0L;
+  merStream    *M = 0L;
+
+  if (argc != 2) {
+    fprintf(stderr, "usage: %s some.fasta\n", argv[0]);
+    fprintf(stderr, "Reads some.fasta using fgetc(), the seqStream and the merStream,\n");
+    fprintf(stderr, "reporting the speed of each method.\n");
+    exit(1);
+  }
+
+  ////////////////////////////////////////
+  F = fopen(argv[1], "r");
+  if (F == 0L) {
+    // Without this check a missing file reaches feof()/fgetc() as a null stream.
+    perror(argv[1]);
+    exit(1);
+  }
+  C = new speedCounter("fgetc(): %7.2f Mthings -- %5.2f Mthings/second\r", 1000000.0, 0x3fffff, true);
+  // Test the value fgetc() returns: feof() only becomes true after a read
+  // has already failed, so looping on it ticks once for the failed read.
+  while (fgetc(F) != EOF)
+    C->tick();
+  delete C;
+  fclose(F);
+
+  ////////////////////////////////////////
+  S = new seqStream(argv[1]);
+  C = new speedCounter("seqStream: %7.2f Mthings -- %5.2f Mthings/second\r", 1000000.0, 0x3fffff, true);
+  while (S->get())
+    C->tick();
+  delete C;
+  delete S;
+
+  ////////////////////////////////////////
+  M = new merStream(new kMerBuilder(20),
+                    new seqStream(argv[1]),
+                    true, true);
+  C = new speedCounter("seqStream -> merStream: %7.2f Mthings -- %5.2f Mthings/second\r", 1000000.0, 0x3fffff, true);
+  while (M->nextMer())
+    C->tick();
+  delete C;
+  delete M;
+
+  exit(0);
+}
+
diff --git a/libsim4/Make.include b/libsim4/Make.include
new file mode 100644
index 0000000..f0bb4e7
--- /dev/null
+++ b/libsim4/Make.include
@@ -0,0 +1,78 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBS4P/ :=$(realpath $/sim4polish/)/
+
+s4csrc := $/sim4core/sim4command.C \
+ $/sim4core/sim4parameters.C \
+ $/sim4core/sim4string.C \
+ $/sim4core/Xtend1.C \
+ $/sim4core/align.C \
+ $/sim4core/exon.H \
+ $/sim4core/exon_cores.C \
+ $/sim4core/extend.C \
+ $/sim4core/glimmerSplice.C \
+ $/sim4core/greedy.C \
+ $/sim4core/mspManager.C \
+ $/sim4core/mspManager.H \
+ $/sim4core/pluri_align.C \
+ $/sim4core/poly.C \
+ $/sim4core/sim4.H \
+ $/sim4core/sim4b1.C \
+ $/sim4core/sim4b1a.C \
+ $/sim4core/sim4b1-1.C \
+ $/sim4core/sim4b1-2.C \
+ $/sim4core/sim4b1-3.C \
+ $/sim4core/sim4b1-4.C \
+ $/sim4core/sim4b1_s.C \
+ $/sim4core/sim4defines.H \
+ $/sim4core/sim4parameters.H \
+ $/sim4core/sites.C \
+ $/sim4core/sites_donor.C \
+ $/sim4core/sites_acceptor.C \
+ $/sim4core/sites_score.C \
+ $/sim4core/splice.C \
+ $/sim4core/table.C \
+ $/sim4core/util.C
+
+s4psrc :=$/sim4polish/sim4polish-compare.C \
+ $/sim4polish/sim4polish-copy.C \
+ $/sim4polish/sim4polish-deleteexon.C \
+ $/sim4polish/sim4polish-exons.C \
+ $/sim4polish/sim4polish-polishtostring.C \
+ $/sim4polish/sim4polish-read.C \
+ $/sim4polish/sim4polish-stringtopolish.C \
+ $/sim4polish/sim4polish-updatescores.C \
+ $/sim4polish/sim4polish.C \
+ $/sim4polish/sim4polish.H \
+ $/sim4polish/sim4polishList.C \
+ $/sim4polish/sim4polishList.H \
+ $/sim4polish/sim4polishBuilder.C \
+ $/sim4polish/sim4polishBuilder.H \
+ $/sim4polish/sim4polishFile.C \
+ $/sim4polish/sim4polishFile.H \
+ $/sim4polish/sim4polishReader.C \
+ $/sim4polish/sim4polishReader.H \
+ $/sim4polish/sim4polishWriter.C \
+ $/sim4polish/sim4polishWriter.H
+
+# Partition the two source lists by extension into C / C++ sources and headers.
+$/.C_SRCS := $(filter %.c,${s4csrc}) $(filter %.c,${s4psrc})
+$/.C_INCS := $(filter %.h,${s4csrc}) $(filter %.h,${s4psrc})
+$/.CXX_SRCS := $(filter %.C,${s4csrc}) $(filter %.C,${s4psrc})
+$/.CXX_INCS := $(filter %.H,${s4csrc}) $(filter %.H,${s4psrc})
+$/.CXX_LIBS := $/libsim4.a
+
+$/.CLEAN := $/*.o $/sim4core/*.o $/sim4polish/*.o
+
+$/.PERL_LIBS := $/sim4polish/sim4polish.pm
+
+# The library depends on one object per C and C++ source.
+$/libsim4.a: ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o}
+
+# Objects and .d files under $/ get the libutil, libbio, libseq and sim4polish directories on their include path.
+$(eval $/%.d $/%.o: CFLAGS += -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBS4P/})
+$(eval $/%.d $/%.o: CXXFLAGS += -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBS4P/})
+
+
diff --git a/libsim4/sim4.H b/libsim4/sim4.H
new file mode 100644
index 0000000..93aa55c
--- /dev/null
+++ b/libsim4/sim4.H
@@ -0,0 +1,9 @@
+#include "sim4polish/sim4polish.H"
+
+#include "sim4polish/sim4polishList.H"
+#include "sim4polish/sim4polishFile.H"
+#include "sim4polish/sim4polishBuilder.H"
+#include "sim4polish/sim4polishReader.H"
+#include "sim4polish/sim4polishWriter.H"
+
+#include "sim4core/sim4.H"
diff --git a/libsim4/sim4core/CHANGES b/libsim4/sim4core/CHANGES
new file mode 100644
index 0000000..8897a49
--- /dev/null
+++ b/libsim4/sim4core/CHANGES
@@ -0,0 +1,26 @@
+Fri Apr 26 14:24:29 EDT 2002
+Test for overlapping exons in sim4string.C. If SHOW_OVERLAPPING_EXONS
+is defined, they will be printed to stderr.
+
+
+Mon Apr 29 03:57:10 EDT 2002 (build 2333)
+Added '-V' option to print script lines as they are processed.
+Changed the wording of the status ('-v') output.
+
+
+Wed Aug 14 12:59:42 EDT 2002
+Added -forcestrand to force the strand prediction to always be
+'forward' or 'reverse'. Changes to sim4string.C/run() and
+util.C/slide_intron(), and sim4db.H. Added
+dbParams._forceStrandPrediction flag to enable/disable this behavior.
+
+Wed Aug 14 16:18:03 EDT 2002
+Fixed sync_slide_intron to increase the limit of Glist, Clist and oris
+from 200 exons to anything. titin mapped to ncbi human genome had
+more than 200 exons.
+
+Wed Aug 28 14:03:50 EDT 2002
+Fixed get_stats() to _not_ check/reset the strand prediction when
+-forcestrand is in effect. In addition, fixed run() to set the
+orientation of 'unknown' matches to FWD first. For complement matches,
+this will later become BWD.
diff --git a/libsim4/sim4core/GlimmerModels/acceptors.162.neg.icm b/libsim4/sim4core/GlimmerModels/acceptors.162.neg.icm
new file mode 100644
index 0000000..4ade021
Binary files /dev/null and b/libsim4/sim4core/GlimmerModels/acceptors.162.neg.icm differ
diff --git a/libsim4/sim4core/GlimmerModels/acceptors.162.pos.icm b/libsim4/sim4core/GlimmerModels/acceptors.162.pos.icm
new file mode 100644
index 0000000..ba92f25
Binary files /dev/null and b/libsim4/sim4core/GlimmerModels/acceptors.162.pos.icm differ
diff --git a/libsim4/sim4core/GlimmerModels/donors.162.neg.icm b/libsim4/sim4core/GlimmerModels/donors.162.neg.icm
new file mode 100644
index 0000000..404180a
Binary files /dev/null and b/libsim4/sim4core/GlimmerModels/donors.162.neg.icm differ
diff --git a/libsim4/sim4core/GlimmerModels/donors.162.pos.icm b/libsim4/sim4core/GlimmerModels/donors.162.pos.icm
new file mode 100644
index 0000000..15f124b
Binary files /dev/null and b/libsim4/sim4core/GlimmerModels/donors.162.pos.icm differ
diff --git a/libsim4/sim4core/Make.include b/libsim4/sim4core/Make.include
new file mode 100644
index 0000000..5922252
--- /dev/null
+++ b/libsim4/sim4core/Make.include
@@ -0,0 +1,47 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+LIBS4P/ :=$(realpath $/../sim4polish/)/
+
+src := $/sim4command.C \
+ $/sim4parameters.C \
+ $/sim4string.C \
+ $/Xtend1.C \
+ $/align.C \
+ $/exon.H \
+ $/exon_cores.C \
+ $/extend.C \
+ $/glimmerSplice.C \
+ $/glimmerSplice.H \
+ $/greedy.C \
+ $/mspManager.C \
+ $/mspManager.H \
+ $/pluri_align.C \
+ $/poly.C \
+ $/sim4.H \
+ $/sim4b1.C \
+ $/sim4b1a.C \
+ $/sim4b1-1.C \
+ $/sim4b1-2.C \
+ $/sim4b1-3.C \
+ $/sim4b1-4.C \
+ $/sim4b1_s.C \
+ $/sim4defines.H \
+ $/sim4parameters.H \
+ $/sites.C \
+ $/sites_acceptor.C \
+ $/sites_donor.C \
+ $/sites_score.C \
+ $/splice.C \
+ $/table.C \
+ $/util.C
+# Split ${src} by extension: .C files are the C++ sources, .H files the C++ headers.
+$/.CXX_SRCS := $(filter %.C,${src})
+$/.CXX_INCS := $(filter %.H,${src})
+$/.CXX_LIBS := $/libsim4.a
+$/.CLEAN := $/*.o
+# libsim4.a depends on one .o per C++ source.
+$/libsim4.a: ${$/.CXX_SRCS:.C=.o}
+# Objects and .d files under $/ get the libutil, libbio and sim4polish directories on their include path.
+$(eval $/%.d $/%.o: CXXFLAGS += -I${LIBUTL/} -I${LIBBIO/} -I${LIBS4P/})
diff --git a/libsim4/sim4core/Xtend1.C b/libsim4/sim4core/Xtend1.C
new file mode 100644
index 0000000..2648ea2
--- /dev/null
+++ b/libsim4/sim4core/Xtend1.C
@@ -0,0 +1,574 @@
+#include "sim4.H"
+
+// This is used if _accurateSequences is enabled....and it's never
+// enabled. The memory allocations here are NOT optimized.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "bio.h"
+
+
+// Wraps 'data' in a freshly ckalloc'd ValNode and appends it to the list:
+// the node becomes *head when *prev is NULL, otherwise it is linked after
+// *prev.  In both cases *prev is then moved to the new node.
+void
+Sim4::Xextend_link_to_data_list(void *data, ValNodePtr *head, ValNodePtr *prev) {
+  ValNodePtr node = (ValNodePtr)ckalloc(sizeof(struct ValNode));
+
+  node->data = data;
+  node->next = NULL;
+
+  if (*prev != NULL)
+    (*prev)->next = node;
+  else
+    *head = node;
+
+  *prev = node;
+}
+
+
+// Releases every node of data_list together with the payload each node
+// points at; both are returned with ckfree().
+void
+Sim4::Xextend_ValNodeFreeData(ValNodePtr data_list) {
+  ValNodePtr node = data_list;
+
+  while (node != NULL) {
+    // Remember the successor before the node itself is released.
+    ValNodePtr following = node->next;
+
+    ckfree(node->data);
+    ckfree(node);
+
+    node = following;
+  }
+}
+// Xextend_bw -- greedy backward extension of s1[0..m) against s2[0..n), starting at (m,n) and working toward (0,0).
+// Writes the stopping row/column (plus offset1/offset2) to *line1/*line2, copies the "AG"/"AC" markers of the
+// selected cell into the members last_AG/last_AC, and returns the difference count d at which it stopped.
+int
+Sim4::Xextend_bw(char *s1, char *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
+{
+ int col, /* column number */
+ row, /* row number */
+ max_d, /* bound on the length of the edit script */
+ d, /* current compressed distance */
+ k, /* current diagonal */
+ DELTA, /* n-m */
+ ORIGIN,
+ lower,
+ upper;
+ int *last_d, *temp_d; /* column containing the last p */
+ int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
+ /* reached for cost d=0, ... m. */
+ coords ***trace_AG, ***trace_AC;
+ coords *AG_cell, *AC_cell, *newcoords;
+
+ ValNodePtr data_list = NULL, prev = NULL; // every coords allocated below is linked here and freed in one pass
+
+ DELTA = n-m;
+ max_d = m+1;
+ // One row of marker pointers per difference count 0..max_d; each row has m+n+1 diagonal slots.
+ trace_AG = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
+ trace_AC = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
+ for (d=0; d<=max_d; d++) {
+ trace_AG[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
+ trace_AC[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
+ }
+
+ ORIGIN = m;
+ // Diagonal ORIGIN+DELTA passes through (m,n); seed it with the current last_AG/last_AC.
+ trace_AG[0][ORIGIN+DELTA] = &last_AG;
+ trace_AC[0][ORIGIN+DELTA] = &last_AC;
+ // Slide back over the exact-match suffix, then scan s2 from n down to the stop column for "AG"/"AC" ending at k.
+ for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--)
+ /*LINTED empty loop body*/;
+ for (k=n; (k>=2) && (k>=col); k--)
+ if (!strncmp((char *)(s2+k-2),"AG",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos2 = k-DELTA+offset1 +1; /* to compensate for -1 */
+ newcoords->pos1 = k+offset2 +1; /* refer to sim4b1.c */
+ trace_AG[0][ORIGIN+DELTA] = newcoords;
+ } else if (!strncmp((char *)(s2+k-2),"AC",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos2 = k-DELTA+offset1 +1;
+ newcoords->pos1 = k+offset2 +1;
+ trace_AC[0][ORIGIN+DELTA] = newcoords;
+ }
+ // The exact match reached the start of s1 or s2: finish with zero differences.
+ if ((row == 0) || (col == 0)) {
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+ (void)memcpy(&last_AG,trace_AG[0][ORIGIN+DELTA],sizeof(coords)); // NOTE(review): if no "AG" was recorded above, source and destination are both &last_AG -- memcpy on overlapping objects is formally undefined; confirm
+ (void)memcpy(&last_AC,trace_AC[0][ORIGIN+DELTA],sizeof(coords));
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_AG,max_d+1);
+ free_coords(trace_AC,max_d+1);
+
+ return 0;
+ }
+
+ last_d = (int *)ckalloc((m+n+1)*sizeof(int));
+ temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
+ // last_d[k] holds the row reached on diagonal k in the previous round; every slot starts at m+1.
+ for (k=0; k<=m+n; ++k) last_d[k]=m+1;
+ last_d[ORIGIN+DELTA] = row;
+
+ lower = ORIGIN + DELTA - 1;
+ upper = ORIGIN + DELTA + 1;
+ // min_row[d] / min_diag[d]: smallest row reached with d differences and the diagonal it was reached on.
+ min_row = (int *)ckalloc((m+1)*sizeof(int)); // NOTE(review): m+1 entries, yet the loop below admits d == max_d == m+1 and writes min_row[d] -- confirm good_ratio() always stops it earlier
+ min_diag = (int *)ckalloc((m+1)*sizeof(int));
+
+ for (d=1; d<=m; d++)
+ min_row[d] = m+1;
+
+ min_row[0] = last_d[ORIGIN+DELTA];
+ min_diag[0] = ORIGIN + DELTA;
+ // Keep adding differences while d-1 (or d-2) does not exceed good_ratio() of the length covered so far.
+ d = 0;
+ while ((++d<=max_d) &&
+ ((d-1<=good_ratio(m-min_row[d-1])) ||
+ ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) {
+
+ /* for each relevant diagonal ... */
+ for (k = lower; k <= upper; k++) {
+
+ /* find a d on diagonal k */
+ if (k==-d+DELTA+ORIGIN) {
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1];
+ /* op = INSERT; */
+ AG_cell = trace_AG[d-1][k+1];
+ AC_cell = trace_AC[d-1][k+1];
+ } else if (k==d+DELTA+ORIGIN) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1]-1;
+ /* op = DELETE; */
+ AG_cell = trace_AG[d-1][k-1];
+ AC_cell = trace_AC[d-1][k-1];
+ } else if ((last_d[k]-1<=last_d[k+1]) &&
+ (last_d[k]-1<=last_d[k-1]-1)) {
+ /* substitution */
+ row = last_d[k]-1;
+ /* op = SUBSTITUTE; */
+ AG_cell = trace_AG[d-1][k];
+ AC_cell = trace_AC[d-1][k];
+ } else if ((last_d[k-1]-1<=last_d[k+1]) &&
+ (last_d[k-1]-1<=last_d[k]-1)) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1]-1;
+ /* op = DELETE; */
+ AG_cell = trace_AG[d-1][k-1];
+ AC_cell = trace_AC[d-1][k-1];
+ } else {
+ /* move left from the last d-1 on diagonal k+1 */
+ row = last_d[k+1];
+ /* op = INSERT; */
+ AG_cell = trace_AG[d-1][k+1];
+ AC_cell = trace_AC[d-1][k+1];
+ }
+
+ /* code common to the three cases */
+ /* slide down the diagonal */
+
+ col = row+k-ORIGIN;
+ // Inherit the predecessor's AG/AC markers; the slide below may replace them.
+ trace_AG[d][k] = AG_cell;
+ trace_AC[d][k] = AC_cell;
+
+ while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1])) {
+ if ((col>1) && !strncmp((char *)(s2+col-2),"AG",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2 +1;
+ newcoords->pos2 = row + offset1 +1;
+ trace_AG[d][k] = newcoords;
+ } else if ((col>1) && !strncmp((char *)(s2+col-2),"AC",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2 +1;
+ newcoords->pos2 = row + offset1 +1;
+ trace_AC[d][k] = newcoords;
+ }
+ row--; col--;
+ }
+ // Same AG/AC check once more at the cell where the slide stopped.
+ if ((col>1) && !strncmp((char *)(s2+col-2),"AG",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2 +1;
+ newcoords->pos2 = row + offset1 +1;
+ trace_AG[d][k] = newcoords;
+ } else if ((col>1) && !strncmp((char *)(s2+col-2),"AC",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2 +1;
+ newcoords->pos2 = row + offset1 +1;
+ trace_AC[d][k] = newcoords;
+ }
+
+ temp_d[k] = row;
+ // NOTE(review): the three edge cases below run identical copy, cleanup and return code.
+ if ((row == 0) && (col == 0)) {
+ /* hit southeast corner; have the answer */
+
+ (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
+ (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
+
+ ckfree(last_d);
+ ckfree(temp_d);
+ ckfree(min_row);
+ ckfree(min_diag);
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_AG,max_d+1);
+ free_coords(trace_AC,max_d+1);
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+ if (row == 0) {
+ /* hit first row; don't look further */
+
+ (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
+ (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
+
+ ckfree(last_d);
+ ckfree(temp_d);
+ ckfree(min_row);
+ ckfree(min_diag);
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_AG,max_d+1);
+ free_coords(trace_AC,max_d+1);
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+
+ if (col == 0) {
+ /* hit last column; don't look further */
+ (void)memcpy(&last_AG,trace_AG[d][k],sizeof(coords));
+ (void)memcpy(&last_AC,trace_AC[d][k],sizeof(coords));
+
+ ckfree(last_d);
+ ckfree(temp_d);
+ ckfree(min_row);
+ ckfree(min_diag);
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_AG,max_d+1);
+ free_coords(trace_AC,max_d+1);
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+ }
+ // Record the smallest row reached in this round and its diagonal.
+ min_row[d] = last_d[ORIGIN+DELTA];
+ min_diag[d] = ORIGIN+DELTA;
+ for (k=lower; k<=upper; ++k)
+ if (temp_d[k]<min_row[d]) {
+ min_row[d] = temp_d[k];
+ min_diag[d] = k;
+ }
+ // This round's rows become the previous round for the next d; the band widens by one diagonal each side.
+ for (k=lower; k<=upper; k++) {
+ last_d[k] = temp_d[k];
+ }
+
+ --lower;
+ ++upper;
+ }
+
+ /* report here the previous maximal match, stored in min_diag and min_row */
+ while ((d>0) && (min_row[d-1]-min_row[d]<3)) // back off while the last difference gained fewer than 3 rows
+ d--;
+
+ *line1 = min_row[d]+offset1;
+ *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2;
+
+ (void)memcpy(&last_AG,trace_AG[d][min_diag[d]],sizeof(coords));
+ (void)memcpy(&last_AC,trace_AC[d][min_diag[d]],sizeof(coords));
+
+ ckfree(min_row);
+ ckfree(min_diag);
+ ckfree(last_d);
+ ckfree(temp_d);
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_AG,max_d+1);
+ free_coords(trace_AC,max_d+1);
+
+ return d;
+}
+
+int
+Sim4::Xextend_fw(char *s1, char *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
+{
+ int col, /* column number */
+ row, /* row number */
+ max_d, /* bound on the length of the edit script */
+ d, /* current compressed distance */
+ k, /* current diagonal */
+ ORIGIN,
+ lower,
+ upper;
+ int *last_d, *temp_d; /* column containing the last p */
+ int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */
+ /* reached for cost d=0, ... m. */
+ coords ***trace_GT, ***trace_CT;
+ coords *GT_cell, *CT_cell, *newcoords;
+
+ ValNodePtr data_list = NULL, prev = NULL;
+
+ max_d = m+1;
+
+ trace_GT = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
+ trace_CT = (coords ***)ckalloc((max_d+1)*sizeof(coords **));
+ for (d=0; d<=max_d; d++) {
+ trace_GT[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
+ trace_CT[d] = (coords **)ckalloc((m+n+1)*sizeof(coords *));
+ }
+
+ ORIGIN = m;
+ trace_GT[0][ORIGIN] = &last_GT;
+ trace_CT[0][ORIGIN] = &last_CT;
+
+ for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++)
+ /*LINTED empty loop body*/;
+ for (k=0; (k<=n-2) && (k<=row); k++)
+ if (!strncmp((char *)(s2+k),"GT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+ newcoords->pos2 = k+offset1;
+ newcoords->pos1 = k+offset2;
+ trace_GT[0][ORIGIN] = newcoords;
+ } else if (!strncmp((char *)(s2+k),"CT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+ newcoords->pos2 = k+offset1;
+ newcoords->pos1 = k+offset2;
+ trace_CT[0][ORIGIN] = newcoords;
+ }
+
+ if ((row == m) || (col == n)){
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+ (void)memcpy(&last_GT,trace_GT[0][ORIGIN],sizeof(coords));
+ (void)memcpy(&last_CT,trace_CT[0][ORIGIN],sizeof(coords));
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_GT,max_d+1);
+ free_coords(trace_CT,max_d+1);
+
+ return 0;
+ }
+
+ last_d = (int *)ckalloc((m+n+1)*sizeof(int));
+ temp_d = (int *)ckalloc((m+n+1)*sizeof(int));
+
+ for (k=0; k<=m+n; ++k) last_d[k]=-1;
+ last_d[ORIGIN] = row;
+
+ lower = ORIGIN - 1;
+ upper = ORIGIN + 1;
+
+ max_row = (int *)ckalloc((m+1)*sizeof(int));
+ max_diag = (int *)ckalloc((m+1)*sizeof(int));
+
+ for (d=1; d<=m; d++)
+ max_row[d] = -1;
+
+ max_row[0] = last_d[ORIGIN];
+ max_diag[0] = ORIGIN;
+
+
+ d = 0;
+ while ((++d<=max_d) &&
+ ((d-1<=good_ratio(max_row[d-1])) ||
+ ((d>=2) && (d-2<=good_ratio(max_row[d-2]))))) {
+
+ /* for each relevant diagonal ... */
+ for (k = lower; k <= upper; k++) {
+
+ /* find a d on diagonal k */
+ if (k==-d+ORIGIN) {
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1]+1;
+ /* op = DELETE; */
+ GT_cell = trace_GT[d-1][k+1];
+ CT_cell = trace_CT[d-1][k+1];
+ } else if (k==d+ORIGIN) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1];
+ /* op = INSERT; */
+ GT_cell = trace_GT[d-1][k-1];
+ CT_cell = trace_CT[d-1][k-1];
+ } else if ((last_d[k]>=last_d[k+1]) &&
+ (last_d[k]+1>=last_d[k-1])) {
+ /* substitution */
+ row = last_d[k]+1;
+ /* op = SUBSTITUTE; */
+ GT_cell = trace_GT[d-1][k];
+ CT_cell = trace_CT[d-1][k];
+ } else if ((last_d[k+1]+1>=last_d[k-1]) &&
+ (last_d[k+1]>=last_d[k])) {
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1]+1;
+ /* op = DELETE; */
+ GT_cell = trace_GT[d-1][k+1];
+ CT_cell = trace_CT[d-1][k+1];
+ } else {
+
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1];
+ /* op = INSERT; */
+ GT_cell = trace_GT[d-1][k-1];
+ CT_cell = trace_CT[d-1][k-1];
+ }
+
+ /* code common to the three cases */
+ /* slide down the diagonal */
+
+ col = row+k-ORIGIN;
+
+ trace_GT[d][k] = GT_cell;
+ trace_CT[d][k] = CT_cell;
+
+ if (row>=0)
+ while ((row < m) && (col < n) && (s1[row]==s2[col])) {
+ if ((col<n-1) && !strncmp((char *)(s2+col),"GT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2;
+ newcoords->pos2 = row + offset1;
+ trace_GT[d][k] = newcoords;
+ } else if ((col<n-1) && !strncmp((char *)(s2+col),"CT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2;
+ newcoords->pos2 = row + offset1;
+ trace_CT[d][k] = newcoords;
+ }
+
+ row++; col++;
+ }
+
+ if ((col<n-1) && !strncmp((char *)(s2+col),"GT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2;
+ newcoords->pos2 = row + offset1;
+ trace_GT[d][k] = newcoords;
+ } else if ((col<n-1) && !strncmp((char *)(s2+col),"CT",2)) {
+ newcoords = (coords *)ckalloc(sizeof(coords));
+ Xextend_link_to_data_list((void *)newcoords, &data_list, &prev);
+
+ newcoords->pos1 = row + k - ORIGIN + offset2;
+ newcoords->pos2 = row + offset1;
+ trace_CT[d][k] = newcoords;
+ }
+
+ temp_d[k] = row;
+
+ if ((row == m) && (col == n)) {
+ /* hit southeast corner; have the answer */
+ (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
+ (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
+
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_GT,max_d+1);
+ free_coords(trace_CT,max_d+1);
+ ckfree(last_d);
+ ckfree(temp_d);
+ ckfree(max_row);
+ ckfree(max_diag);
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+ if (row == m) {
+ /* hit last row; don't look further */
+ (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
+ (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
+
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_GT,max_d+1);
+ free_coords(trace_CT,max_d+1);
+ ckfree(temp_d);
+ ckfree(last_d);
+ ckfree(max_row);
+ ckfree(max_diag);
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+
+ if (col == n) {
+ /* hit last column; don't look further */
+ (void)memcpy(&last_GT,trace_GT[d][k],sizeof(coords));
+ (void)memcpy(&last_CT,trace_CT[d][k],sizeof(coords));
+
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_GT,max_d+1);
+ free_coords(trace_CT,max_d+1);
+
+ ckfree(temp_d);
+ ckfree(last_d);
+ ckfree(max_row);
+ ckfree(max_diag);
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return d;
+ }
+ }
+ max_row[d] = last_d[ORIGIN];
+ max_diag[d] = ORIGIN;
+ for (k=lower; k<=upper; ++k)
+ if (temp_d[k]>max_row[d]) {
+ max_row[d] = temp_d[k];
+ max_diag[d] = k;
+ }
+
+ for (k=lower; k<=upper; k++) {
+ last_d[k] = temp_d[k];
+ }
+
+ --lower;
+ ++upper;
+ }
+
+ /* report here the previous maximal match, stored in max_diag and max_row */
+
+ while ((d>0) && (max_row[d]-max_row[d-1]<3))
+ d--;
+
+ *line1 = max_row[d]+offset1;
+ *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2;
+
+ (void)memcpy(&last_GT,trace_GT[d][max_diag[d]],sizeof(coords));
+ (void)memcpy(&last_CT,trace_CT[d][max_diag[d]],sizeof(coords));
+
+ ckfree(max_row);
+ ckfree(max_diag);
+ ckfree(last_d);
+ ckfree(temp_d);
+ Xextend_ValNodeFreeData(data_list);
+ free_coords(trace_GT,max_d+1);
+ free_coords(trace_CT,max_d+1);
+
+
+ return d;
+}
diff --git a/libsim4/sim4core/align.C b/libsim4/sim4core/align.C
new file mode 100644
index 0000000..5237ebe
--- /dev/null
+++ b/libsim4/sim4core/align.C
@@ -0,0 +1,848 @@
+#include "sim4.H"
+
+// only for debugging
+#include <sys/types.h>
+#include <signal.h>
+
+#if defined (__SVR4) && defined (__sun)
+// Solaris defines SS in sys/regset.h
+#undef SS
+#endif
+
+// Define this to do bounds checking on the arrays used here
+//#define CHECK_BOUNDS
+
+
+#ifdef CHECK_BOUNDS
+// Debug-only int array indexed from 'offset' to 'offset+size-1'; any access
+// outside that range prints the offending index to stderr and exits.
+class boundedIntArray {
+public:
+  boundedIntArray(int offset, int size) {
+    _o = offset;
+    _m = size;
+    // Value-initialise (zero) the storage instead of calling bzero().
+    _a = new int [_m]();
+
+    // Bias the pointer so that _a[offset] is the first element.
+    _a -= _o;
+  };
+  ~boundedIntArray() {
+    delete [] (_a + _o);    // undo the bias before freeing
+  };
+
+  int &operator[](int i) {
+    if (i < _o) {
+      fprintf(stderr, "********** i=%d o=%d\n", i, _o);
+      exit(1);
+    }
+    if (i >= _o + _m) {
+      fprintf(stderr, "********** i=%d o=%d m=%d\n", i, _o, _m);
+      exit(1);
+    }
+    return(_a[i]);
+  };
+
+  int _o;     // lowest valid index
+  int _m;     // number of elements
+  int *_a;    // storage, biased by -_o
+  int _crud;
+};
+#endif
+
+
+
+int
+Sim4::align_get_dist(int i1, int j1, int i2, int j2, int limit) {
+
+  // Returns the smallest cost (0..limit) at which the goal diagonal
+  // (j2-i2) reaches row i2, or -1 if the goal diagonal lies outside the
+  // band or the limit is used up first.  Each round of the main loop
+  // raises the cost by one and extends every diagonal in the band by a
+  // delete, an insert or a substitute, followed by a snake().
+  // Diagonals are numbered j-i.
+
+  // Diagonal band that has to be explored
+  int start     = j1 - i1;
+  int goal_diag = j2 - i2;
+  int lower     = MAX(j1-i2, start-limit);
+  int upper     = MIN(j2-i1, start+limit);
+
+  if ((goal_diag < lower) || (upper < goal_diag))
+    return(-1);
+
+  // Row reached on each diagonal: previous round and current round
+#ifdef CHECK_BOUNDS
+  boundedIntArray last_d(lower, upper-lower+1);
+  boundedIntArray temp_d(lower, upper-lower+1);
+#else
+  int *last_d = (int *)ckalloc((upper-lower+1) * sizeof(int)) - lower;
+  int *temp_d = (int *)ckalloc((upper-lower+1) * sizeof(int)) - lower;
+#endif
+
+  // Seed every diagonal with a value that is easy to spot in a debugger.
+  for (int diag=lower; diag<=upper; ++diag)
+    last_d[diag] = -2109876543;
+
+  last_d[start] = snake(start, i1, i2, j2);
+
+  // -1 until the goal diagonal reaches row i2
+  int dist = (last_d[goal_diag] >= i2) ? 0 : -1;
+
+  for (int cost=1; (dist < 0) && (cost <= limit); ++cost) {
+    int lo = MAX(lower, start-cost);
+    int hi = MIN(upper, start+cost);
+
+    for (int diag=lo; diag<=hi; ++diag) {
+      int row;
+
+      if (diag == lo) {
+        row = last_d[diag+1] + 1;          // DELETE
+      } else if (diag == hi) {
+        row = last_d[diag-1];              // INSERT
+      } else {
+        int left  = last_d[diag-1];
+        int here  = last_d[diag];
+        int right = last_d[diag+1];
+
+        if ((here >= right) && (here+1 >= left))
+          row = here + 1;                  // SUBSTITUTE
+        else if ((right+1 >= left) && (right >= here))
+          row = right + 1;                 // DELETE
+        else
+          row = left;                      // INSERT
+      }
+
+      temp_d[diag] = snake(diag, row, i2, j2);
+    }
+
+    for (int diag=lo; diag<=hi; ++diag)
+      last_d[diag] = temp_d[diag];
+
+    if (last_d[goal_diag] >= i2)
+      dist = cost;
+  }
+
+  // Single exit: release the working vectors whether or not the goal was
+  // reached ('dist' is still -1 if the distance limit ran out).
+#ifndef CHECK_BOUNDS
+  ckfree(last_d+lower);
+  ckfree(temp_d+lower);
+#endif
+
+  return(dist);
+}
+
+
+
+int
+Sim4::get_dist(int i1, int j1, int i2, int j2, int limit)
+{
+  // Returns the smallest cost c (0..limit) at which the goal diagonal
+  // (j2-i2) reaches row i2, or -1 -- after a message on stderr -- when
+  // the goal diagonal lies outside the band or 'limit' is exhausted.
+  //
+  // SS, DD and II are working vectors indexed by diagonal.  In the
+  // normal build they are heap blocks whose pointers are biased by
+  // -lower (II by -lower+1, because it is also indexed at lower-1), so
+  // the same bias has to be undone when they are freed.
+
+#ifndef CHECK_BOUNDS
+  // With CHECK_BOUNDS the three names are boundedIntArray objects that
+  // are declared where they are sized, further down.
+  int *SS, *DD, *II;
+#endif
+  int goal_diag;
+  int c, k, t1, t2, t;
+  int start, lower, upper;
+  int dist;
+
+  /* Compute the boundary diagonals */
+  start = j1 - i1;
+  lower = MAX(j1-i2, start-limit);
+  upper = MIN(j2-i1, start+limit);
+  goal_diag = j2-i2;
+
+  if (goal_diag > upper || goal_diag < lower) {
+    fprintf(stderr, "The two sequences are not really similar.(1 %d; %d %d %d %d)\n", limit, i1, j1, i2, j2);
+    fprintf(stderr, "Please try exact phase 1 method\n.");
+
+    // Nothing has been allocated at this point, so nothing is freed.
+    return -1;
+  }
+
+  /* Allocate space for forward vectors */
+#ifdef CHECK_BOUNDS
+  boundedIntArray SS(lower, upper-lower+1);
+  boundedIntArray DD(lower, upper-lower+2);
+  boundedIntArray II(lower-1, upper-lower+2);
+#else
+  SS = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
+  DD = (int *)ckalloc((upper-lower+2)*sizeof(int)) - lower;
+  II = (int *)ckalloc((upper-lower+2)*sizeof(int)) - lower + 1;
+#endif
+
+  /* Initialization */
+  for (k=lower; k<=upper; ++k) SS[k] = -99999;
+  for (k=lower; k<=upper+1; ++k) DD[k] = -99999;
+  for (k=lower-1; k<=upper; ++k) II[k] = -99999;
+  SS[start] = snake(start, i1, i2, j2);
+
+  /* dist stays -1 until the goal diagonal reaches row i2 */
+  dist = (SS[goal_diag] >= i2) ? 0 : -1;
+
+  for (c=1; (dist < 0) && (c<=limit); ++c) {
+    t = MAX(lower, start-c);
+    t1 = II[t-1];
+    for (k=t; k<=MIN(upper, start+c); ++k) {
+      t2 = II[k];
+      II[k] = MAX(t1, SS[k]);
+      t1 = t2;
+      DD[k] = MAX(DD[k+1]+1, SS[k]);
+      SS[k] = snake(k, MIN(j2-k,MAX(MAX(SS[k]+1, II[k]), DD[k])), i2, j2);
+    }
+
+    if (SS[goal_diag] >= i2)
+      dist = c;
+  }
+
+#ifndef CHECK_BOUNDS
+  /* Free working vectors -- on every path out of the search */
+  ckfree(SS+lower);
+  ckfree(DD+lower);
+  ckfree(II+lower-1);
+#endif
+
+  if (dist < 0) {
+    /* Ran out of distance limit */
+    fprintf(stderr, "Two sequences are not really similar.\n");
+    fprintf(stderr, "Please try exact phase 1.\n");
+    return -1;
+  }
+
+#ifdef STATS
+  printf("get_dist = %d\n", dist);
+#endif
+
+  return dist;
+}
+// align_path - build the edit script (linked list of SUBSTITUTE/INSERT/DELETE runs) that leads
+// from (i1,j1) to (i2,j2) at cost 'dist'; *head and *tail are both NULL if nothing was built.
+void
+Sim4::align_path(int i1, int j1,
+ int i2, int j2,
+ int dist,
+ edit_script **head,
+ edit_script **tail) {
+#ifndef CHECK_BOUNDS
+ int *last_d = 0L;
+ int *temp_d = 0L;
+ int *rlast_d = 0L;
+ int *rtemp_d = 0L;
+#endif
+ edit_script *head1 = 0L;
+ edit_script *tail1 = 0L;
+ edit_script *head2 = 0L;
+ edit_script *tail2 = 0L;
+
+ //fprintf(stderr, "align_path()-- i1=%d j1=%d i2=%d j2=%d dist=%d\n", i1, j1, i2, j2, dist);
+
+ int ll=0; // lowest and ...
+ int uu=0; // ... highest diagonal of the round in progress (reused by all three loops below)
+
+ *head = *tail = NULL;
+
+ // Boundary cases: one of the two ranges is empty, so the script is a single INSERT or DELETE (or nothing)
+ if (i1 == i2) {
+ if (j1 == j2) {
+ *head = NULL;
+ } else {
+ head1 = (edit_script *) ckalloc(sizeof(edit_script));
+ head1->op_type = INSERT;
+ head1->num = j2-j1;
+ head1->next = NULL;
+ *head = *tail = head1;
+ }
+ return;
+ }
+
+ if (j1 == j2) {
+ head1 = (edit_script *) ckalloc(sizeof(edit_script));
+ head1->op_type = DELETE;
+ head1->num = i2-i1;
+ head1->next = NULL;
+ *head = *tail = head1;
+ return;
+ }
+
+ if (dist <= 1) { // at most one edit: the script is written out directly, without recursion
+ int start = j1-i1;
+ if (j2-i2 == j1-i1) { // same diagonal: a single SUBSTITUTE run of i2-i1
+ head1 = (edit_script *) ckalloc(sizeof(edit_script));
+ head1->op_type = SUBSTITUTE;
+ head1->num = i2-i1;
+ head1->next = NULL;
+ *head = *tail = head1;
+ } else if (j2-j1 == i2-i1+1) { // one column more than rows: SUBSTITUTE run, one INSERT, SUBSTITUTE run
+ // tmp: row returned by snake() on the start diagonal -- presumably where the initial run of matches ends
+ int tmp = snake(start,i1,i2,j2);
+ if (tmp>i1) {
+ head1 = (edit_script *) ckalloc(sizeof(edit_script));
+ head1->op_type = SUBSTITUTE;
+ head1->num = tmp-i1;
+ *head = head1;
+ }
+ head2 = (edit_script *) ckalloc(sizeof(edit_script));
+ head2->op_type = INSERT;
+ head2->num = 1;
+
+ if (*head) head1->next = head2;
+ else *head = head2;
+ *tail = head2;
+ head2->next = NULL;
+
+ if (i2-tmp) { // rows left after the INSERT: append the trailing SUBSTITUTE run
+ head1 = head2;
+ *tail = head2 = (edit_script *)ckalloc(sizeof(edit_script));
+ head2->op_type = SUBSTITUTE;
+ head2->num = i2-tmp;
+ head2->next = NULL;
+ head1->next = head2;
+ }
+ } else if (j2-j1+1 == i2-i1) { // one row more than columns: SUBSTITUTE run, one DELETE, SUBSTITUTE run
+ // tmp is obtained exactly as in the INSERT case above
+ int tmp = snake(start,i1,i2,j2);
+ if (tmp>i1) {
+ head1 = (edit_script *) ckalloc(sizeof(edit_script));
+ head1->op_type = SUBSTITUTE;
+ head1->num = tmp-i1;
+ *head = head1;
+ }
+ head2 = (edit_script *) ckalloc(sizeof(edit_script));
+ head2->op_type = DELETE;
+ head2->num = 1;
+
+ if (*head) head1->next = head2;
+ else *head = head2;
+ *tail = head2;
+ head2->next = NULL;
+
+ if (i2>tmp+1) { // the DELETE consumes row tmp, so the trailing run is i2-tmp-1 long
+ head1 = head2;
+ *tail = head2 = (edit_script *)ckalloc(sizeof(edit_script));
+ head2->op_type = SUBSTITUTE;
+ head2->num = i2-tmp-1;
+ head2->next = NULL;
+ head1->next = head2;
+ }
+ } else { // the end points are more than one diagonal apart: not reachable with dist <= 1
+ fprintf(stderr, "Sim4::align_path()-- warning: something wrong when aligning.");
+ *head = 0L;
+ *tail = 0L;
+ }
+ return;
+ }
+
+ // Divide the problem at the middle cost
+ int midc = dist/2;
+ int rmidc = dist - midc;
+
+ // Compute the boundary diagonals
+ int start = j1 - i1;
+ int lower = MAX(j1-i2, start-midc);
+ int upper = MIN(j2-i1, start+midc);
+ int rstart = j2-i2;
+ int rlower = MAX(j1-i2, rstart-rmidc);
+ int rupper = MIN(j2-i1, rstart+rmidc);
+
+
+#if 0
+ fprintf(stderr, "dist = %d\n", dist);
+ fprintf(stderr, "midc = %d rmidc = %d\n", midc, rmidc);
+ fprintf(stderr, "j1 = %d\n", j1);
+ fprintf(stderr, "i1 = %d\n", i1);
+ fprintf(stderr, "j2 = %d\n", j2);
+ fprintf(stderr, "i2 = %d\n", i2);
+ fprintf(stderr, "start = %d lower = %d upper = %d\n", start, lower, upper);
+ fprintf(stderr, "rstart = %d rlower = %d rupper = %d\n", rstart, rlower, rupper);
+#endif
+
+
+ // Allocate space for forward vectors
+#ifdef CHECK_BOUNDS
+ boundedIntArray last_d(lower, upper-lower+1);
+ boundedIntArray temp_d(lower, upper-lower+1);
+#else
+ last_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
+ temp_d = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
+#endif
+
+ for (int k=lower; k<=upper; k++)
+ last_d[k] = -1;
+
+ last_d[start] = snake(start,i1,i2,j2);
+
+ // Forward computation: midc rounds, each extending every diagonal in [ll,uu] by one edit plus a snake()
+ for (int c=1; c<=midc; ++c) {
+ ll = MAX(lower,start-c);
+ uu = MIN(upper,start+c);
+ //fprintf(stderr, "c=%d ll=%d uu=%d\n", c, ll, uu);
+ for (int k=ll; k<=uu; ++k) {
+ int row;
+
+ if (k == ll) {
+ // DELETE : down from (k+1,c-1)
+ row = last_d[k+1]+1;
+ } else if (k == uu) {
+ // INSERT : right from (k-1,c-1)
+ row = last_d[k-1];
+ } else if ((last_d[k]>=last_d[k+1]) &&
+ (last_d[k]+1>=last_d[k-1])) {
+ // SUBSTITUTE
+ row = last_d[k]+1;
+ } else if ((last_d[k+1]+1>=last_d[k-1]) &&
+ (last_d[k+1]>=last_d[k])) {
+ // DELETE
+ row = last_d[k+1]+1;
+ } else {
+ // INSERT
+ row = last_d[k-1];
+ }
+
+ temp_d[k] = snake(k,row,i2,j2);
+
+ //fprintf(stderr, "k=%d row=%d temp_d[k]=%d\n", k, row, temp_d[k]);
+ }
+ for (int k=ll; k<=uu; ++k)
+ last_d[k] = temp_d[k];
+ }
+
+ // Allocate space for backward vectors
+#ifdef CHECK_BOUNDS
+ boundedIntArray rlast_d(rlower, rupper-rlower+1);
+ boundedIntArray rtemp_d(rlower, rupper-rlower+1);
+#else
+ rlast_d = (int *)ckalloc((rupper-rlower+1)*sizeof(int)) - rlower;
+ rtemp_d = (int *)ckalloc((rupper-rlower+1)*sizeof(int)) - rlower;
+#endif
+
+ for (int k=rlower; k<=rupper; k++)
+ rlast_d[k] = i2+1;
+
+ rlast_d[rstart] = rsnake(rstart,i2,i1,j1,i2+1);
+
+ // Backward computation: the mirror image of the forward pass, rmidc rounds starting from (i2,j2) with rsnake()
+ for (int c=1; c<=rmidc; ++c) {
+ ll = MAX(rlower,rstart-c);
+ uu = MIN(rupper,rstart+c);
+ for (int k=ll; k<=uu; ++k) {
+ int row;
+
+ if (k == ll) {
+ // INSERT : left from (k+1,c-1)
+ row = rlast_d[k+1];
+ } else if (k == uu) {
+ // DELETE : up from (k-1,c-1)
+ row = rlast_d[k-1]-1;
+ } else if ((rlast_d[k]-1<=rlast_d[k+1]) &&
+ (rlast_d[k]-1<=rlast_d[k-1]-1)) {
+ // SUBSTITUTE
+ row = rlast_d[k]-1;
+ } else if ((rlast_d[k-1]-1<=rlast_d[k+1]) &&
+ (rlast_d[k-1]-1<=rlast_d[k]-1)) {
+ // DELETE
+ row = rlast_d[k-1]-1;
+ } else {
+ // INSERT
+ row = rlast_d[k+1];
+ }
+
+ rtemp_d[k] = rsnake(k,row,i1,j1,i2+1);
+ }
+ for (int k=ll; k<=uu; ++k)
+ rlast_d[k] = rtemp_d[k];
+ }
+
+ // Find (mi, mj) such that the distance from (i1, j1) to (mi, mj)
+ // is midc and the distance from (mi, mj) to (i2, j2) is rmidc.
+
+ int flag = 0; // set to 1 once a split point has been chosen
+ int mi = 0;
+ int mj = 0;
+
+ ll = MAX(lower,rlower); // only diagonals covered by both passes are candidates
+ uu = MIN(upper,rupper);
+
+ //fprintf(stderr, "ll=%d uu=%d\n", ll, uu);
+
+ for (int k=ll; k<=uu; ++k) {
+ //fprintf(stderr, "last_d[%d] = %d rlast_d[%d] = %d\n", k, last_d[k], k, rlast_d[k]);
+
+ if (last_d[k] >= rlast_d[k]) { // the forward row has reached the backward row on diagonal k
+ if (last_d[k] - i1 >= i2 - rlast_d[k]) { // split at the forward row if it is at least as far from i1 as the backward row is from i2
+ mi = last_d[k];
+ mj = k+mi;
+ } else {
+ mi = rlast_d[k];
+ mj = k+mi;
+ }
+
+ flag = 1;
+
+ break;
+ }
+ }
+ // The working vectors are not read again (the only later use sits inside '#if 0'), so free them before recursing.
+#ifndef CHECK_BOUNDS
+ ckfree(last_d + lower);
+ ckfree(rlast_d + rlower);
+ ckfree(temp_d + lower);
+ ckfree(rtemp_d + rlower);
+#endif
+
+ //fprintf(stderr, "flag=%d mi=%d mj=%d\n", flag, mi, mj);
+
+ if (flag == 0) { // no split point: give up and hand back an empty script
+ //fprintf(stderr, "Sim4::align_path()-- warning: something wrong when dividing\n");
+
+#if 0
+ // Pick the middle k, keep going.
+
+ int k= ll + (uu-ll) / 2;
+
+ if (last_d[k] - i1 >= i2 - rlast_d[k]) {
+ mi = last_d[k];
+ mj = k+mi;
+ } else {
+ mi = rlast_d[k];
+ mj = k+mi;
+ }
+
+#else
+ //kill(getpid(), SIGSEGV);
+ *head = 0L;
+ *tail = 0L;
+ return;
+#endif
+ }
+
+
+ // Find a path from (i1,j1) to (mi,mj)
+ align_path(i1,j1,mi,mj,midc,&head1,&tail1);
+
+ // Find a path from (mi,mj) to (i2,j2)
+ align_path(mi,mj,i2,j2,rmidc,&head2,&tail2);
+
+ // Join these two paths together; either half may be empty (NULL head)
+ if (head1)
+ tail1->next = head2;
+ else
+ head1 = head2;
+
+ *head = head1;
+
+ if (head2)
+ *tail = tail2;
+ else
+ *tail = tail1;
+}
+// path - build the edit script from (i1,j1) to (i2,j2) at cost 'dist'; type1 / type2 are the operation
+// types at the two ends.  The list comes back through *head / *tail; both are NULL if nothing was built.
+void
+Sim4::path(int i1, int j1, char type1, int i2, int j2, char type2, int dist, edit_script **head, edit_script **tail)
+{
+  int *SS, *DD, *II;        /* Forward vectors */
+  int *RS, *RD, *RI;        /* Backward vectors */
+
+  edit_script *head1 = NULL, *tail1 = NULL, *head2 = NULL, *tail2 = NULL;  /* all four are read at the end, even when no split point is found */
+  int midc, rmidc;
+  int start, lower, upper;
+  int rstart, rlower, rupper;
+  int c, k, t1, t2, t;
+  int maxint;
+  int mi, mj, mtype;        /* split point; only meaningful once flag is set */
+  char flag;
+
+/*
+printf("i1=%d,j1=%d,type1=%d,i2=%d,j2=%d,type2=%d,dist=%d\n",i1,j1,type1,i2,j2,type2,dist);
+*/
+
+  /* Boundary cases */
+  if (i1 == i2) {
+    if (j1 == j2) *head = *tail = NULL;   /* empty script: give the caller a defined tail as well */
+    else {
+      head1 = (edit_script *) ckalloc(sizeof(edit_script));
+      head1->op_type = INSERT;
+      head1->num = j2-j1;
+      head1->next = NULL;
+      *head = *tail = head1;
+    }
+    return;
+  }
+
+  if (j1 == j2) {
+    head1 = (edit_script *) ckalloc(sizeof(edit_script));
+    head1->op_type = DELETE;
+    head1->num = i2-i1;
+    head1->next = NULL;
+    *head = *tail = head1;
+    return;
+  }
+
+  if (dist <= 1) {   /* at most one edit: write the script out directly */
+    if (j2-i2 == j1-i1) {
+      head1 = (edit_script *) ckalloc(sizeof(edit_script));
+      head1->op_type = SUBSTITUTE;
+      head1->num = i2-i1;
+      head1->next = NULL;
+      *head = *tail = head1;
+    } else if (j2-i2 > j1-i1) {
+      if (type1 == INSERT) {   /* the INSERT goes next to the end that is itself an INSERT */
+        head1 = (edit_script *) ckalloc(sizeof(edit_script));
+        head1->op_type = INSERT;
+        head1->num = 1;
+        head2 = (edit_script *) ckalloc(sizeof(edit_script));
+        head2->op_type = SUBSTITUTE;
+        head2->num = i2-i1;
+      } else {
+        head1 = (edit_script *) ckalloc(sizeof(edit_script));
+        head1->op_type = SUBSTITUTE;
+        head1->num = i2-i1;
+        head2 = (edit_script *) ckalloc(sizeof(edit_script));
+        head2->op_type = INSERT;
+        head2->num = 1;
+      }
+      head1->next = head2;
+      head2->next = NULL;
+      *head = head1;
+      *tail = head2;
+    } else if (j2-i2 < j1-i1) {
+      if (type1 == DELETE) {   /* likewise for a DELETE */
+        head1 = (edit_script *) ckalloc(sizeof(edit_script));
+        head1->op_type = DELETE;
+        head1->num = 1;
+        head2 = (edit_script *) ckalloc(sizeof(edit_script));
+        head2->op_type = SUBSTITUTE;
+        head2->num = j2-j1;
+      } else {
+        head1 = (edit_script *) ckalloc(sizeof(edit_script));
+        head1->op_type = SUBSTITUTE;
+        head1->num = j2-j1;
+        head2 = (edit_script *) ckalloc(sizeof(edit_script));
+        head2->op_type = DELETE;
+        head2->num = 1;
+      }
+      head1->next = head2;
+      head2->next = NULL;
+      *head = head1;
+      *tail = head2;
+    }
+    return;
+  }
+
+  /* Divide the problem at the middle cost */
+  midc = dist/2;
+  rmidc = dist - midc;
+
+  /* Compute the boundary diagonals */
+  start = j1 - i1;
+  lower = MAX(j1-i2, start-midc);
+  upper = MIN(j2-i1, start+midc);
+  rstart = j2-i2;
+  rlower = MAX(j1-i2, rstart-rmidc);
+  rupper = MIN(j2-i1, rstart+rmidc);
+
+  /* Allocate space for forward vectors; II is also indexed at lower-1, DD at upper+1 */
+  SS = (int *)ckalloc((upper-lower+1)*sizeof(int)) - lower;
+  DD = (int *)ckalloc((upper-lower+2)*sizeof(int)) - lower;
+  II = (int *)ckalloc((upper-lower+2)*sizeof(int)) - lower + 1;
+
+  /* Forward computation */
+  for (k=lower; k<=upper; ++k) SS[k] = -99999;
+  for (k=lower; k<=upper+1; ++k) DD[k] = -99999;
+  for (k=lower-1; k<=upper; ++k) II[k] = -99999;
+  if (type1 == SUBSTITUTE) SS[start] = snake(start, i1, i2, j2);
+  else if (type1 == DELETE) {
+    DD[start] = i1;
+    SS[start] = snake(start,i1,i2,j2);
+  } else {
+    II[start] = i1;
+    SS[start] = snake(start,i1,i2,j2);
+  }
+
+  for (c=1; c<=midc; ++c) {
+    t = MAX(lower, start-c);
+    t1 = II[t-1];
+    for (k=t; k<=MIN(upper, start+c); ++k) {
+      t2 = II[k];
+      II[k] = MAX(t1, SS[k]);
+      t1 = t2;
+      DD[k] = MAX(DD[k+1]+1, SS[k]);
+      SS[k] = snake(k, MIN(j2-k,MAX(MAX(SS[k]+1, II[k]), DD[k])), i2, j2);
+    }
+  }
+
+  /* Allocate space for backward vectors; RD is also indexed at rlower-1, RI at rupper+1 */
+  RS = (int *)ckalloc((rupper-rlower+1)*sizeof(int)) - rlower;
+  RD = (int *)ckalloc((rupper-rlower+2)*sizeof(int)) - rlower + 1;
+  RI = (int *)ckalloc((rupper-rlower+2)*sizeof(int)) - rlower;
+
+  /* Backward computation */
+  maxint = i2 + dist + _estLen;
+  for (k=rlower; k<=rupper; ++k) RS[k] = maxint;
+  for (k=rlower-1; k<=rupper; ++k) RD[k] = maxint;
+  for (k=rlower; k<=rupper+1; ++k) RI[k] = maxint;
+  if (type2 == SUBSTITUTE)
+    RI[rstart] = RD[rstart] = RS[rstart] = rsnake(rstart, i2, i1, j1, i2+1);
+  else if (type2 == DELETE) RD[rstart] = i2;
+  else RI[rstart] = i2;
+
+  for (c=1; c<=rmidc; ++c) {
+    t = MAX(rlower, rstart-c);
+    t1 = RD[t-1];
+    for (k=t; k<=MIN(rupper, rstart+c); ++k) {
+#if 0
+      int x = MIN(MIN(RS[k]-1,RD[k]),RI[k]);
+      printf("<<<%d>>>", x);
+      assert(0<=x);
+      assert (x<=_estLen);
+      printf("%d", x);
+#endif
+      RS[k] = rsnake(k, MAX(j1-k, MIN(MIN(RS[k]-1,RD[k]),RI[k])),i1,j1,i2+1);
+      t2 = RD[k];
+      RD[k] = MIN(t1-1, RS[k]);
+      t1 = t2;
+      RI[k] = MIN(RI[k+1], RS[k]);
+    }
+  }
+
+  /* Find (mi, mj, mtype) such that
+     the distance from (i1, j1, type1) to (mi, mj, mtype) is midc
+     and the distance from (mi, mj, mtype) to (i2, j2, type2) is rmidc.
+  */
+
+  flag = 0;
+  for (k=MAX(lower,rlower); k<=MIN(upper,rupper);++k) {
+
+/*
+printf("k=%d, SS=%d, RS=%d, DD=%d, RD=%d, II=%d, RI=%d\n",k,SS[k],RS[k],DD[k],RD[k],II[k],RI[k]);
+*/
+
+    if (SS[k]>=RS[k] || DD[k]>=RD[k] || II[k]>=RI[k]) {
+      if (DD[k]>=RD[k]) {
+        mi = DD[k];
+        mj = k+mi;
+        mtype = DELETE;
+      } else if (II[k] >= RI[k]) {
+        mi = II[k];
+        mj = k+mi;
+        mtype = INSERT;
+      } else {
+        mi = SS[k];
+        mj = k+mi;
+        mtype = SUBSTITUTE;
+      }
+
+/*
+      printf("mi=%d, mj=%d, mtype=%d\n", mi, mj, mtype);
+*/
+      flag = 1;
+      break;
+    }
+  }
+
+  /* Free working vectors, undoing the bias applied when each one was allocated */
+  ckfree(SS+lower);
+  ckfree(DD+lower);
+  ckfree(II+lower-1);
+  ckfree(RS+rlower);
+  ckfree(RD+rlower-1);
+  ckfree(RI+rlower);
+
+  if (flag) {
+    /* Find a path from (i1,j1,type1) to (mi,mj,mtype) */
+    path(i1,j1,type1,mi,mj,mtype,midc,&head1,&tail1);
+
+    /* Find a path from (mi,mj,mtype) to (i2,j2,type2) */
+    path(mi,mj,mtype,i2,j2,type2,rmidc,&head2,&tail2);
+
+    /* Join these two paths together */
+    if (head1) tail1->next = head2;
+    else head1 = head2;
+  } else {
+    printf("Something wrong when dividing\n");
+    head1 = NULL;
+  }
+  *head = head1;
+  if (head2) *tail = tail2;
+  else *tail = tail1;
+}
+
+
+
+// Condense_script - fold each run of adjacent same-type operations into its
+// first node (summing 'num') and release the absorbed nodes with ckfree().
+void
+Sim4::Condense_script(edit_script *head)
+{
+  for (edit_script *cur = head; cur != NULL; cur = cur->next) {
+    edit_script *nxt = cur->next;
+
+    while ((nxt != NULL) && (nxt->op_type == cur->op_type)) {
+      cur->num  += nxt->num;
+      cur->next  = nxt->next;
+      ckfree(nxt);
+      nxt = cur->next;
+    }
+  }
+}
+
+// Flip_script - reverse the script list in place; *script is updated to
+// point at what used to be the last node (or NULL for an empty list).
+void
+Sim4::Flip_script(struct edit_script **script)
+{
+  struct edit_script *reversed = NULL;
+  struct edit_script *rest     = *script;
+
+  while (rest != NULL) {
+    struct edit_script *node = rest;
+    rest       = node->next;
+    node->next = reversed;
+    reversed   = node;
+  }
+  *script = reversed;
+}
+
+
+
+void
+Sim4::align_reverse(int *S)
+{
+  // Reverse S[0 .. len-1] in place; the length is read from S[-1].
+  int lo = 0;
+  int hi = S[-1];
+  while (lo < hi) {
+    --hi;
+    int saved = S[lo];
+    S[lo] = S[hi];
+    S[hi] = saved;
+    ++lo;
+  }
+}
+
+
+
+void
+Sim4::Free_script(edit_script *head)
+{
+  // Release every node of the list with ckfree(); the 'next' link is
+  // read before its node is freed.
+  for (edit_script *node = head; node != NULL; ) {
+    edit_script *following = node->next;
+
+    ckfree(node);
+    node = following;
+  }
+}
+
diff --git a/libsim4/sim4core/exon.H b/libsim4/sim4core/exon.H
new file mode 100644
index 0000000..720f85f
--- /dev/null
+++ b/libsim4/sim4core/exon.H
@@ -0,0 +1,178 @@
+#ifndef EXON_H
+#define EXON_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Define this only if Liliana is watching you.
+//#define SPLSCORE
+
+// Exon - one node of a singly linked list of exons: genomic and cDNA
+// coordinates plus per-exon alignment statistics.  Construction,
+// initialisation and destruction are private; exonManager (a friend) is
+// the only code that may perform them.
+class Exon {
+private:
+  Exon() {
+    // Identical to init() with every argument zero and no successor:
+    // a 'len' of 0 is not negative, so 'length' becomes 0 as well.
+    init(0, 0, 0, 0, 0, 0, 0, 0L);
+  };
+
+  ~Exon() {};
+
+  // Reset every field.
+  //   f1, t1  -- genomic from/to (frGEN, toGEN)
+  //   f2, t2  -- cDNA from/to    (frEST, toEST)
+  //   len     -- exon length; a negative value means "use t2-f2+1"
+  //   edost   -- stored in 'edist'
+  //   flog    -- stored in 'flag'
+  //   next    -- successor in the list
+  //
+  // The orientation is set to 'U' and all statistics are zeroed.
+  //
+  void init(int f1, int f2, int t1, int t2, int len, int edost, int flog, Exon *next) {
+    next_exon = next;
+
+    frGEN = f1;   toGEN = t1;
+    frEST = f2;   toEST = t2;
+
+    ori = 'U';
+
+    if (len < 0)
+      length = t2 - f2 + 1;
+    else
+      length = len;
+
+    edist = edost;
+    flag  = flog;
+
+    percentID       = 0;
+    alignmentLength = 0;
+    numMatches      = 0;
+    numNs           = 0;
+    numInDel        = 0;
+    numEdits        = 0;
+
+#ifdef SPLSCORE
+    splScore = -999999;
+#endif
+  };
+
+  friend class exonManager;
+
+public:
+  // Print 'label', then one line per exon from this one to the end of the
+  // list, then a separator line; all on stdout, which is flushed.
+  void printList(char *label) {
+    fprintf(stdout, "%s", label);
+
+    for (Exon *l = this; l; l = l->next_exon)
+      fprintf(stdout, "GEN f=%8d t=%8d EST f=%8d t=%8d flag=%d\n",
+              l->frGEN,
+              l->toGEN,
+              l->frEST,
+              l->toEST,
+              l->flag);
+
+    fprintf(stdout, "----------------------------------------\n");
+    fflush(stdout);
+  };
+
+  Exon *next_exon;       // successor in the list, or null
+
+  int frGEN;             // Genomic coords
+  int toGEN;
+  int frEST;             // cDNA coords
+  int toEST;
+
+  int ori;               // 'U' after init()
+  int length;
+
+  // - 'flag' controls whether the second blast pass should be run
+  //   or not on the adjacent (unmatched) fragment.
+  // - 'edist' is an approximation for the error rate within the exon
+  //
+  int flag;
+  int edist;
+
+  int percentID;
+  int alignmentLength;
+  int numMatches;
+  int numNs;
+  int numInDel;
+  int numEdits;
+#ifdef SPLSCORE
+  double splScore;
+#endif
+};
+
+
+
+
+
+
+// Just for allocating exons. It takes care of deleting exons. You
+// are not allowed to delete them.
+//
+class exonManager {
+public:
+  exonManager() {
+    _listLen = 1;
+    _listMax = 8;
+    _list    = new Exon* [_listMax];
+
+    _curLen  = 0;
+    _curMax  = 256;
+    _list[0] = new Exon [_curMax];
+  };
+  ~exonManager() {
+    for (uint32 i=0; i<_listLen; i++)
+      delete [] _list[i];
+    delete [] _list;
+  };
+
+  // Hand out the next unused Exon, initialised with the given values.
+  // The storage is released only by the destructor.
+  Exon *newExon(int f1, int f2, int t1, int t2, int len, int edist, int flag, Exon *next) {
+    // If the current array is full, start a new one -- after growing
+    // the list of arrays, should that be full too.
+    if (_curLen >= _curMax) {
+      if (_listLen >= _listMax) {
+        _listMax *= 2;
+        Exon **l = new Exon* [_listMax];
+        memcpy(l, _list, sizeof(Exon *) * _listLen);
+        delete [] _list;
+        _list = l;
+      }
+
+      _list[_listLen++] = new Exon [_curMax];
+      _curLen = 0;
+    }
+
+    Exon *e = _list[_listLen - 1] + _curLen;
+    _curLen++;
+    e->init(f1, f2, t1, t2, len, edist, flag, next);
+
+    return(e);
+  };
+
+private:
+  // Owns raw arrays: a copy would delete them twice, so copying is
+  // declared here but deliberately never defined.
+  exonManager(const exonManager &);
+  exonManager &operator=(const exonManager &);
+
+  // Exon pointers are valid throughout the lifetime of execution,
+  // so we can't use realloc here.  Instead, we keep a list of arrays
+  // of exons.
+  uint32 _listLen;
+  uint32 _listMax;
+  Exon **_list;
+  uint32 _curLen;
+  uint32 _curMax;
+};
+
+
+#endif // EXON_H
diff --git a/libsim4/sim4core/exon_cores.C b/libsim4/sim4core/exon_cores.C
new file mode 100644
index 0000000..4698cb1
--- /dev/null
+++ b/libsim4/sim4core/exon_cores.C
@@ -0,0 +1,134 @@
+#include "sim4.H"
+#include <math.h>
+// exon_cores - reset the MSP manager, run bld_table() on s2 and search() on s1, then store the result of doLinking() in exon_list.
+// exon_cores() must have seq-1 passed in. search() offsets this.
+// K and globalParams->_interspecies set the score threshold; offset1, offset2 and flag are passed through to doLinking().
+void
+Sim4::exon_cores(char *s1,
+ char *s2,
+ int l1,
+ int l2,
+ int offset1,
+ int offset2,
+ int flag,
+ mss_t MSS,
+ int K,
+ int type) {
+
+ _mspManager.clear();
+ _mspManager.clearDiagonal(l1, l2);
+ _mspManager.setScoreThreshold(K, globalParams->_interspecies);
+
+//mss_t MSS = masks_shifts(seed); LLL DELETE
+
+ bld_table(s2,l2,MSS,type); // presumably fills 'hashtable', which search() reads -- confirm in table.C
+ search(s1,s2,l1,l2,MSS);
+
+ // Cleaning up after the bld_table() is done at the next call, or
+ // in the destructor.
+ //
+ hashtable = 0L; // only the pointer is dropped here; nothing is freed in this function
+
+ exon_list = _mspManager.doLinking(DEFAULT_WEIGHT, DEFAULT_DRANGE, offset1, offset2, flag, false, s1, s2);
+}
+// search - scan the l1 characters of s1 and report every seed match found through 'hashtable' to _mspManager.addHit().
+// 'ecode' is a rolling code: each valid character shifts it left two bits and ORs in its encoding[] value.
+// A character whose encoding[] entry is negative restarts the count of valid characters, so no seed spans it.
+// NOTE(review): encoding[] is indexed with (int)t[i]; if char is signed, bytes >= 0x80 give a negative index -- confirm the table allows for that.
+void
+Sim4::search(char *s1, char *s2, int l1, int l2, mss_t MSS) {
+ struct hash_node *h;
+ char *t;
+ uint64 ecode;
+ int masked_ecode;
+ int i, p, j;
+
+ // Too short? Abort!
+ //
+ if (l1 < MSS.seedLength)
+ return;
+
+ t = s1+1; // s1 arrives as seq-1 (see the note above exon_cores), so t is the first real character
+ i = 0;
+
+ int validEncoding = 1 - MSS.seedLength; // turns positive once seedLength valid characters in a row have been seen
+ int pos1;
+
+ ecode = uint64ZERO;
+
+ // 5% win (tested on on small examples) if we use t[] instead of *t below.
+
+ // Scan from low to high position in the genomic sequence
+ //
+ if (MSS.type == CONTINUOUS_SEED) {
+ for (i=0; i < l1; i++) {
+ pos1 = (int)(t-s1) + i; // i+1: the position of t[i] counted from s1
+
+ if (encoding[(int)t[i]] >= 0) {
+ validEncoding++;
+
+ ecode &= mask; // note: the member 'mask' here, MSS.mask in the spaced-seed loop below
+ ecode <<= 2;
+ ecode |= encoding[(int)t[i]];
+ masked_ecode = (int)ecode;
+
+ if (validEncoding > 0) {
+ for (h = hashtable->table[masked_ecode & HASH_SIZE]; h; h = h->link) {
+ if (h->ecode == masked_ecode) {
+
+ // These positions are from high to low (see table.C)
+ //
+ for (p = h->pos; p >= 0; p = hashtable->nextPos[p])
+ _mspManager.addHit(s1, s2,
+ l1, l2,
+ pos1, p,
+ MSS);
+ break; // the first node carrying this code is the only one used
+ }
+ }
+ }
+ } else {
+ validEncoding = 1 - MSS.seedLength;
+ }
+ }
+ } else {
+ /* SPACED_SEED */
+ for (i=0; i < l1; i++) {
+ pos1 = (int)(t-s1) + i;
+
+ if (encoding[(int)t[i]] >= 0) {
+ validEncoding++;
+
+ ecode &= MSS.mask;
+ ecode <<= 2;
+ ecode |= encoding[(int)t[i]];
+
+#if 0
+ masked_ecode = mask_shift(ecode,MSS);
+#else
+ // 40% cheaper for cross-species, 53% cheaper for same species
+ for (j=masked_ecode=0; j<MSS.masknum; j++)
+ masked_ecode += (ecode & MSS.masks[j]) >> MSS.shifts[j];
+#endif
+
+ if (validEncoding > 0) {
+ for (h = hashtable->table[masked_ecode & HASH_SIZE]; h; h = h->link) {
+ if (h->ecode == masked_ecode) {
+
+ // These positions are from high to low (see table.C)
+ //
+ for (p = h->pos; p >= 0; p = hashtable->nextPos[p])
+ _mspManager.addHit(s1, s2,
+ l1, l2,
+ pos1, p,
+ MSS);
+ break; // as in the continuous-seed loop: stop at the first matching node
+ }
+ }
+ }
+ } else {
+ validEncoding = 1 - MSS.seedLength;
+ }
+ }
+ }
+}
diff --git a/libsim4/sim4core/extend.C b/libsim4/sim4core/extend.C
new file mode 100644
index 0000000..28610dc
--- /dev/null
+++ b/libsim4/sim4core/extend.C
@@ -0,0 +1,331 @@
+#include "sim4.H"
+
+// Extend an alignment backward: starting at the ends s1[m-1] and s2[n-1], walk
+// toward the starts of both strings while allowing more and more differences.
+// Stores the stopping row/column (plus offset1/offset2) in *line1/*line2 and returns d.
+int
+Sim4::extend_bw(char *s1, char *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
+{
+ int col, /* column number */
+ row, /* row number */
+ max_d, /* bound on the length of the edit script
+ */
+ d, /* current compressed distance */
+ k, /* current diagonal */
+ DELTA, /* n-m */
+ ORIGIN, /* offset added to diagonal numbers to index the arrays; set to m */
+ lower, /* lowest diagonal index visited in the current round */
+ upper, /* highest diagonal index visited in the current round */
+ magic_d; /* row gain a step must bring to survive the final back-off */
+ int *last_d, *temp_d; /* row reached on each diagonal: last_d after round d-1, temp_d in round d */
+ int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
+ /* reached for cost d=0, ... m. */
+ DELTA = n-m;
+ max_d = m+1;
+
+ ORIGIN = m;
+ for (row=m, col=n; row>0 && col>0 && (s1[row-1]==s2[col-1]); row--,col--) /* skip the exactly matching tails */
+ /*LINTED empty loop body*/;
+ // One of the strings is already used up: nothing left to extend, zero differences.
+ if ((row == 0) || (col == 0)) {
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return 0;
+ }
+ // A single allocation, carved into the four work arrays below.
+ int *allocdSpace = (int *)ckalloc((m+n+1+m+n+1+m+1+m+1) * sizeof(int));
+
+ last_d = allocdSpace; // m+n+1
+ temp_d = last_d + m+n+1; // m+n+1
+ min_row = temp_d + m+n+1; // m+1
+ min_diag = min_row + m+1; // m+1
+
+ for (k=0; k<=m+n; ++k)
+ last_d[k]=m+1; // larger than any real row, which is at most m
+ last_d[ORIGIN+DELTA] = row; // the starting diagonal begins where the matching tail ended
+
+ lower = ORIGIN + DELTA - 1;
+ upper = ORIGIN + DELTA + 1;
+
+ for (d=1; d<=m; d++)
+ min_row[d] = m+1;
+
+ min_row[0] = last_d[ORIGIN+DELTA];
+ min_diag[0] = ORIGIN + DELTA;
+
+ d = 0;
+ while ((++d<=max_d) &&
+ ((d-1<=good_ratio(m-min_row[d-1])) ||
+ ((d>=2) && (d-2<=good_ratio(m-min_row[d-2]))))) {
+ // NOTE(review): good_ratio() is defined elsewhere; it appears to bound d by the rows covered so far -- confirm.
+ /* for each relevant diagonal ... */
+ for (k = lower; k <= upper; k++) {
+
+ /* find a d on diagonal k */
+ if (k==-d+DELTA+ORIGIN) {
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1];
+ /* op = INSERT; */
+
+ } else if (k==d+DELTA+ORIGIN) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1]-1;
+ /* op = DELETE; */
+
+ } else if ((last_d[k]-1<=last_d[k+1]) &&
+ (last_d[k]-1<=last_d[k-1]-1)) {
+ /* substitution */
+ row = last_d[k]-1;
+ /* op = SUBSTITUTE; */
+
+ } else if ((last_d[k-1]-1<=last_d[k+1]) &&
+ (last_d[k-1]-1<=last_d[k]-1)) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1]-1;
+ /* op = DELETE; */
+
+ } else {
+ /* move left from the last d-1 on diagonal k+1 */
+ row = last_d[k+1];
+ /* op = INSERT; */
+
+ }
+
+ /* code common to the three cases */
+ /* slide down the diagonal */
+
+ col = row+k-ORIGIN;
+
+ while ((row > 0) && (col > 0) && (s1[row-1]==s2[col-1]))
+ { row--; col--; }
+
+ temp_d[k] = row;
+
+ if ((row == 0) && (col == 0)) {
+ /* reached the start of both strings; have the answer */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+ if (row == 0) {
+ /* hit first row; don't look further */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+
+ if (col == 0) {
+ /* hit first column; don't look further */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+ }
+ // NOTE(review): min_row/min_diag hold m+1 entries but d may reach max_d = m+1 here; presumably good_ratio() prevents that -- confirm.
+ min_row[d] = last_d[ORIGIN+DELTA];
+ min_diag[d] = ORIGIN+DELTA;
+ for (k=lower; k<=upper; ++k)
+ if (temp_d[k]<min_row[d]) {
+ min_row[d] = temp_d[k];
+ min_diag[d] = k;
+ }
+
+ for (k=lower; k<=upper; k++) {
+ last_d[k] = temp_d[k];
+ }
+
+ --lower; // the next round looks one diagonal further on each side
+ ++upper;
+ }
+
+ /* report here the previous maximal match, stored in min_diag and min_row */
+ magic_d = (globalParams->_interspecies ? 2 : 3);
+ while ((d>0) && (min_row[d-1]-min_row[d]<magic_d)) // drop trailing steps that gained fewer than magic_d rows
+ d--;
+
+ *line1 = min_row[d]+offset1;
+ *line2 = min_row[d]+min_diag[d]-ORIGIN+offset2; // column = row + diagonal - ORIGIN
+
+ ckfree(allocdSpace);
+
+ return d;
+}
+// Extend an alignment forward: starting at s1[0] and s2[0], walk toward the ends (m and n) while allowing more and more
+// differences.  Stores the stopping row/column (plus offset1/offset2) in *line1/*line2 and returns d.
+int
+Sim4::extend_fw(char *s1, char *s2, int m, int n, int offset1, int offset2, int *line1, int *line2)
+{
+ int col, /* column number */
+ row, /* row number */
+ max_d, /* bound on the length of the edit script
+ */
+ d, /* current compressed distance */
+ k, /* current diagonal */
+ ORIGIN, /* offset added to diagonal numbers to index the arrays; set to m */
+ lower, /* lowest diagonal index visited in the current round */
+ upper, /* highest diagonal index visited in the current round */
+ magic_d; /* row gain a step must bring to survive the final back-off */
+ int *last_d, *temp_d; /* row reached on each diagonal: last_d after round d-1, temp_d in round d */
+ int *max_row, *max_diag; /* min (b)/ max (f) row (and diagonal) */
+ /* reached for cost d=0, ... m. */
+ max_d = m+1;
+
+ ORIGIN = m;
+ for (row=0, col=0; col<n && row<m && (s1[row]==s2[col]); row++, col++) /* skip the exactly matching heads */
+ /*LINTED empty loop body*/;
+ // One of the strings is already used up: nothing left to extend, zero differences.
+ if (row == m) {
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return 0;
+ }
+ if (col == n) {
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ return 0;
+ }
+ // A single allocation, carved into the four work arrays below.
+ int *allocdSpace = (int *)ckalloc((m+n+1+m+n+1+m+1+m+1) * sizeof(int));
+
+ last_d = allocdSpace; // m+n+1
+ temp_d = last_d + m+n+1; // m+n+1
+ max_row = temp_d + m+n+1; // m+1
+ max_diag = max_row + m+1; // m+1
+
+ for (k=0; k<=m+n; ++k) last_d[k]=-1; // -1 is smaller than any real row
+ last_d[ORIGIN] = row; // the starting diagonal begins where the matching head ended
+
+ lower = ORIGIN - 1;
+ upper = ORIGIN + 1;
+
+ for (d=1; d<=m; d++)
+ max_row[d] = -1;
+
+ max_row[0] = last_d[ORIGIN];
+ max_diag[0] = ORIGIN;
+
+ d = 0;
+ while ((++d<=max_d) &&
+ ((d-1<=good_ratio(max_row[d-1])) ||
+ ((d>=2) && (d-2<=good_ratio(max_row[d-2]))))) {
+ // NOTE(review): good_ratio() is defined elsewhere; it appears to bound d by the rows covered so far -- confirm.
+ /* for each relevant diagonal ... */
+ for (k = lower; k <= upper; k++) {
+
+ /* find a d on diagonal k */
+ if (k==-d+ORIGIN) {
+
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1]+1;
+ /* op = DELETE; */
+ } else if (k==d+ORIGIN) {
+
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1];
+ /* op = INSERT; */
+ } else if ((last_d[k]>=last_d[k+1]) &&
+ (last_d[k]+1>=last_d[k-1])) {
+
+ /* substitution */
+ row = last_d[k]+1;
+ /* op = SUBSTITUTE; */
+ } else if ((last_d[k+1]+1>=last_d[k-1]) &&
+ (last_d[k+1]>=last_d[k])) {
+
+ /* move down from the last d-1 on diagonal k+1 */
+ row = last_d[k+1]+1;
+ /* op = DELETE; */
+ } else {
+
+ /* move right from the last d-1 on diagonal k-1 */
+ row = last_d[k-1];
+ /* op = INSERT; */
+ }
+
+ /* code common to the three cases */
+ /* slide down the diagonal */
+
+ col = row+k-ORIGIN;
+
+ if (row>=0) /* a negative row comes from a diagonal that was never reached */
+ while ((row < m) && (col < n) && (s1[row]==s2[col]))
+ { row++; col++; }
+
+ temp_d[k] = row;
+
+ if ((row == m) && (col == n)) {
+ /* hit southeast corner; have the answer */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+ if (row == m) {
+ /* hit last row; don't look further */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+
+ if (col == n) {
+ /* hit last column; don't look further */
+
+ *line1 = row+offset1;
+ *line2 = col+offset2;
+
+ ckfree(allocdSpace);
+
+ return d;
+ }
+ }
+ max_row[d] = last_d[ORIGIN]; // NOTE(review): max_row/max_diag hold m+1 entries but d may reach m+1 here -- confirm good_ratio() prevents it
+ max_diag[d] = ORIGIN;
+ for (k=lower; k<=upper; ++k)
+ if (temp_d[k]>max_row[d]) {
+ max_row[d] = temp_d[k];
+ max_diag[d] = k;
+ }
+
+ for (k=lower; k<=upper; k++) {
+ last_d[k] = temp_d[k];
+ }
+
+ --lower; // the next round looks one diagonal further on each side
+ ++upper;
+ }
+
+ /* report here the previous maximal match, stored in max_diag and max_row */
+
+ magic_d = (globalParams->_interspecies ? 2 : 3);
+ while ((d>0) && (max_row[d]-max_row[d-1]<magic_d)) // drop trailing steps that gained fewer than magic_d rows
+ d--;
+
+ *line1 = max_row[d]+offset1;
+ *line2 = max_row[d]+max_diag[d]-ORIGIN+offset2; // column = row + diagonal - ORIGIN
+
+ ckfree(allocdSpace);
+
+ return d;
+}
diff --git a/libsim4/sim4core/glimmerSplice.C b/libsim4/sim4core/glimmerSplice.C
new file mode 100644
index 0000000..201e184
--- /dev/null
+++ b/libsim4/sim4core/glimmerSplice.C
@@ -0,0 +1,491 @@
+#include "sim4.H"
+
+
+const char ALPHA_STRING [] = "acgt"; // base alphabet; Subscript() returns an index into this string
+const int DEFAULT_PERIODICITY = 3; // not referenced anywhere else in this file
+const int DEFAULT_MODEL_DEPTH = 7; // not referenced anywhere else in this file
+
+const int DEFAULT_MODEL_LEN = 12; // not referenced anywhere else in this file
+const int ALPHABETSIZE = 4; // children per tree node, and probabilities stored per node
+const int MAX_ERROR_MSG_LEN = 1000; // size of the message buffers in the Safe_* allocators
+const int ICM_VERSION_ID = 200; // value required in param[0] of a model file
+const unsigned int NUM_FIXED_LENGTH_PARAMS = 6; // number of ints read after each text header
+
+const int ID_STRING_LEN = 400; // bytes of text header in a model file; also required in param[1]
+//const unsigned NUM_FIXED_LENGTH_PARAMS = 6;
+
+#define PARENT(x) ((int) ((x) - 1) / ALPHABETSIZE) // parent of tree node x; the children of node i start at i*ALPHABETSIZE+1
+
+int Filter(char Ch);
+int Subscript(char ch);
+void Permute_String(char * s, int * perm, int n);
+void *Safe_malloc(size_t len, const char * src_fname, size_t line_num);
+void *Safe_realloc(void * q, size_t len, const char * src_fname, size_t line_num);
+void *Safe_calloc(size_t n, size_t len, const char * src_fname, size_t line_num);
+
+int Int_Power(int a, int b);
+void Input(struct ICM_t *p, FILE *fp, int model_len,int model_depth, int periodicity);
+int Get_Model_Depth (struct ICM_t p) { return p.model_depth; } // plain field accessors; the struct is passed by value
+int Get_Model_Len (struct ICM_t p) { return p.model_len; }
+int Get_Periodicity (struct ICM_t p) { return p.periodicity; }
+double Full_Window_Prob(struct ICM_t p, const char * string, int frame);
+
+int Get_Length(struct Fixed_Length_ICM_t fixed){ return fixed.length; } // same value as getModelLength() below
+double Score_Window (struct Fixed_Length_ICM_t fixed, char * w, int left);
+int getModelLength(struct Fixed_Length_ICM_t fixed) { return fixed.length;}
+int getModelType(struct Fixed_Length_ICM_t fixed) { return fixed.model_type;}
+int getSpecialPosition(struct Fixed_Length_ICM_t fixed){ return fixed.special_position; }
+
+void readModel(struct Fixed_Length_ICM_t *fixed, const char *path);
+
+// Read the fixed-length ICM stored in the binary file `path` into *fixed.
+// Every open/read/format failure prints a message to stderr and exits.
+// On return fixed->permutation and fixed->sub_model are heap arrays of
+// fixed->length entries, and the file has been closed again.
+void readModel(struct Fixed_Length_ICM_t *fixed, const char *path)
+{
+  FILE *fp;
+  char line [ID_STRING_LEN];
+  int param [NUM_FIXED_LENGTH_PARAMS];
+  int i;
+
+  // The file holds raw ints and floats, so open it in binary mode.
+  if ((fp = fopen (path, "rb"))==NULL) {
+    fprintf(stderr, "Error: Could not open Glimmer model file for reading (%s).\n", path);
+    exit(1);
+  }
+
+  // Skip the text header, then read the six fixed parameters.  A short
+  // read of either one means the file is truncated.
+  if ((fread (line, sizeof (char), ID_STRING_LEN, fp) != (size_t) ID_STRING_LEN) ||
+      (fread (param, sizeof (int), NUM_FIXED_LENGTH_PARAMS, fp) != NUM_FIXED_LENGTH_PARAMS))
+    {
+      fprintf (stderr, "ERROR reading file \"%s\"\n", path);
+      exit (-1);
+    }
+
+  if (ICM_VERSION_ID != param [0])
+    {
+      fprintf (stderr, "Bad ICM version = %d should be %d\n",
+               param [0], ICM_VERSION_ID);
+      exit (-1);
+    }
+  if (ID_STRING_LEN != param [1])
+    {
+      fprintf (stderr, "Bad ID_STRING_LEN = %d should be %d\n",
+               param [1], ID_STRING_LEN);
+      exit (-1);
+    }
+
+  fixed->length = param [2];
+  fixed->max_depth = param [3];
+  fixed->special_position = param [4];
+  fixed->model_type = param [5];
+
+  // A negative length would turn into a huge allocation request below.
+  if (fixed->length < 0)
+    {
+      fprintf (stderr, "ERROR reading file \"%s\"\n", path);
+      exit (-1);
+    }
+
+  // The permutation is stored as one int per model position.
+  fixed->permutation = (int *) Safe_malloc(fixed->length*sizeof(int), __FILE__,__LINE__);
+  if (fread (fixed->permutation, sizeof (int), fixed->length, fp) != (size_t) fixed->length)
+    {
+      fprintf (stderr, "ERROR reading file \"%s\"\n", path);
+      exit (-1);
+    }
+
+  // One sub-model per position.  Input() allocates each score table itself,
+  // so nothing is pre-allocated here (the old placeholder table was simply
+  // overwritten by Input() and leaked).
+  fixed->sub_model = (struct ICM_t *) Safe_malloc
+    (fixed->length * sizeof (struct ICM_t ), __FILE__, __LINE__);
+  for (i = 0; i < fixed->length; i ++)
+    Input(&(fixed->sub_model[i]), fp,1,0,1);
+
+  fclose (fp);
+}
+
+// Input the contents of this model from fp , which has already been opened.
+
+// Read one ICM from the already-open stream fp into *p: a text header, six
+// int parameters, then node records until a negative id or end of file.
+// model_len and model_depth are provisional (the file's values win), while
+// periodicity is the period count the caller expects.  Errors are fatal.
+void Input(struct ICM_t *p, FILE *fp, int model_len,int model_depth, int periodicity)
+{
+  char line [ID_STRING_LEN];
+  int param [NUM_FIXED_LENGTH_PARAMS];
+  int node_id;
+  int prev_node;
+  int period;
+  int i,j;
+
+  (*p).model_len = model_len;
+  (*p).model_depth = model_depth;
+  (*p).periodicity = periodicity;
+  (*p).empty = 1;
+
+  // skip the text header line
+  if (fread (line, sizeof (char), ID_STRING_LEN, fp) != (unsigned) (ID_STRING_LEN)) {
+    fprintf (stderr, "ERROR reading ICM header\n");
+    exit (-1);
+  }
+
+  if (fread (param, sizeof (int), NUM_FIXED_LENGTH_PARAMS, fp) != NUM_FIXED_LENGTH_PARAMS) {
+    fprintf (stderr, "ERROR reading parameters\n");
+    exit (-1);
+  }
+
+  if (ICM_VERSION_ID != param [0]) {
+    fprintf (stderr, "Bad ICM version = %d should be %d\n", param [0], ICM_VERSION_ID);
+    exit (-1);
+  }
+  if (ID_STRING_LEN != param [1]) {
+    fprintf (stderr, "Bad ID_STRING_LEN = %d should be %d\n", param [1], ID_STRING_LEN);
+    exit (-1);
+  }
+
+  (*p).model_len = param [2];
+  (*p).model_depth = param [3];
+  (*p).periodicity = param [4];
+  (*p).num_nodes = param [5];
+
+  // score [period] [node]; every node owns ALPHABETSIZE probabilities.
+  (*p).score = (struct ICM_Score_Node_t **) Safe_malloc
+    ((*p).periodicity * sizeof (struct ICM_Score_Node_t *), __FILE__, __LINE__);
+  for (i = 0; i < (*p).periodicity; i ++) {
+    (*p).score [i] = (struct ICM_Score_Node_t *) Safe_calloc
+      ((*p).num_nodes, sizeof (struct ICM_Score_Node_t), __FILE__, __LINE__);
+    for(j=0;j<(*p).num_nodes;j++) {
+      (*p).score[i][j].prob = (float *) Safe_malloc(ALPHABETSIZE*sizeof(float), __FILE__, __LINE__);
+    }
+  }
+
+  period = -1;
+  prev_node = 0;
+  while (fread (& node_id, sizeof (int), 1, fp) != 0) {
+    if (node_id < 0) break;
+    if (node_id == 0) period++;
+
+    // The ids come straight from the file: refuse any record that would index
+    // outside the tables above (a first id other than 0 leaves period at -1).
+    if (period < 0 || period >= (*p).periodicity || node_id >= (*p).num_nodes) {
+      fprintf (stderr, "ERROR reading icm node = %d period = %d\n", node_id, period);
+      exit (-1);
+    }
+
+    // read in the probabilities
+    if (fread ((*p).score [period] [node_id] . prob, sizeof (float), ALPHABETSIZE, fp) !=
+        (unsigned) (ALPHABETSIZE)) {
+      fprintf (stderr, "ERROR reading icm node = %d period = %d\n", node_id, period);
+      exit (-1);
+    }
+
+    // read in the max mutual information position
+    if (fread (& ((*p).score [period] [node_id] . mut_info_pos), sizeof (short int), 1, fp) != 1) {
+      fprintf (stderr, "ERROR reading mut_info_pos for node = %d period = %d\n", node_id, period);
+      exit (-1);
+    }
+
+    // check for cut nodes: ids skipped within a period get mut_info_pos = -2
+    if (node_id != 0 && prev_node != node_id - 1)
+      for (i = prev_node + 1; i < node_id; i ++)
+        (*p).score [period] [i] . mut_info_pos = -2;
+
+    if (node_id == 0 && period > 0)
+      for (i = prev_node + 1; i < (*p).num_nodes; i ++)
+        (*p).score [period - 1] [i] . mut_info_pos = -2;
+
+    prev_node = node_id;
+  }
+
+  if (period != periodicity - 1) {
+    fprintf (stderr, "ERROR: Too few nodes for periodicity = %d\n", periodicity);
+    exit (-1);
+  }
+
+  // check for cut nodes in last period
+  if (prev_node != (*p).num_nodes - 1)
+    for (i = prev_node + 1; i < (*p).num_nodes; i ++)
+      (*p).score [period] [i] . mut_info_pos = -2;
+
+  (*p).empty = 0;
+}
+
+// Rearrange the characters in s according
+// to the permutation in perm .
+
+void Permute_String(char * s, int * perm, int n)
+{
+  // Scratch buffer shared by all calls; it is only ever grown.
+  static char * buff = NULL;
+  static int buff_len = 0;
+  int k = 0;
+  if (buff_len < n) {
+    buff = (char *) Safe_realloc (buff, n, __FILE__, __LINE__);
+    buff_len = n;
+  }
+
+  // Gather via the scratch copy so that s can be overwritten afterwards.
+  while (k < n) {
+    buff [k] = s [perm [k]];
+    k ++;
+  }
+  strncpy (s, buff, n);
+}
+
+// Return a single a, c, g or t for Ch .
+
+// Reduce Ch to one of the bases a, c, g, t.
+// A base is returned exactly as given (its case is kept); an ambiguity
+// code is replaced by one lower-case base out of the set it stands for,
+// and any other character becomes 'c'.
+int Filter(char Ch)
+{
+  // The <ctype.h> functions require a value representable as unsigned char
+  // (or EOF); a plain char can be negative, so convert before calling.
+  switch (tolower ((unsigned char) Ch)) {
+    case 'a' :
+    case 'c' :
+    case 'g' :
+    case 't' :
+      return Ch;
+
+    case 'r' :   // a or g
+    case 'd' :   // a, g or t
+      return 'g';
+
+    case 'w' :   // a or t
+    case 'k' :   // g or t
+      return 't';
+
+    case 'y' :   // c or t
+    case 's' :   // c or g
+    case 'm' :   // a or c
+    case 'b' :   // c, g or t
+    case 'h' :   // a, c or t
+    case 'v' :   // a, c or g
+    default :    // anything else
+      return 'c';
+  }
+}
+
+// Return the subscript equivalent (used in offsets of the
+// model) for character ch .
+
+int Subscript(char ch)
+{
+  // Filter() reduces ch to one of a/c/g/t (possibly upper case), so after
+  // folding to lower case the lookup in ALPHA_STRING is expected to succeed.
+  const int base = tolower (Filter (ch));
+  char * hit = strchr ((char *)ALPHA_STRING, base);
+
+  if (hit == NULL) {
+    fprintf (stderr, "ERROR: Bad character %c in subscript conversion", ch);
+    exit (-1);
+  }
+
+  return (int) (hit - ALPHA_STRING);
+}
+
+
+// Return the log-probability of the last character in the first
+// model_len bases of string conditioned on the preceding characters
+// using the entries in score [frame] .
+
+double Full_Window_Prob (struct ICM_t icm, const char * string, int frame)
+ {
+ double prob;
+ int num_node, i, pos, sub;
+ // Start at the root of the score[frame] tree and descend at most model_depth levels.
+ num_node = 0;
+
+ for (i = 0; i < icm.model_depth; i ++)
+ {
+ pos = icm.score [frame] [num_node] . mut_info_pos; // which character of string this node inspects
+
+ if (pos == -1) // -1: stop here and use this node's probabilities
+ break;
+
+ if (pos < -1) // No information here or below in tree, go back up
+ // Shouldn't happen
+ {
+ num_node = PARENT (num_node);
+ pos = icm.score [frame] [num_node] . mut_info_pos;
+ break;
+ }
+
+ sub = Subscript (string [pos]);
+
+ num_node = (num_node * ALPHABETSIZE) + sub + 1; // child selected by the base found at string[pos]
+ }
+ // The loop may also end on a node marked < -1 (Input() stores -2 for cut nodes): fall back to its parent.
+ pos = icm.score [frame] [num_node] . mut_info_pos;
+ if (pos < -1)
+ {
+ num_node = PARENT (num_node);
+ pos = icm.score [frame] [num_node] . mut_info_pos;
+ }
+
+ sub = Subscript (string [icm.model_len - 1]); // the character being scored is the last one of the window
+
+ prob = (double) icm.score [frame] [num_node] . prob [sub];
+
+ if (pos < -1) // even the parent carries no information: warn, but still return prob
+ {
+ fprintf (stderr, "WARNING: prob = %.4f pos = %d in Full_Window_Prob\n",
+ prob, pos);
+ fprintf (stderr, "num_node = %d\n",
+ num_node);
+ }
+
+ return prob;
+ }
+
+
+// Return the score of this model on string w
+// Score the first fixed.length characters of w against the model: the
+// window is copied, permuted when the model has a permutation, and the
+// Full_Window_Prob() values of all sub-models are summed.  left is unused.
+double Score_Window (struct Fixed_Length_ICM_t fixed, char * w, int left)
+{
+  // Scratch copy of the window, kept between calls and grown on demand.
+  static char * buff = NULL;
+  static int buff_len = 0;
+  const int width = fixed.length;
+  double total = 0.0;
+  int pos = 0;
+
+  if (buff_len < width) {
+    buff = (char *) Safe_realloc (buff, width+1, __FILE__, __LINE__);
+    buff_len = width;
+  }
+
+  strncpy (buff, w, width);
+  if (fixed.permutation != NULL)
+    Permute_String (buff, fixed.permutation, width);
+
+  while (pos < width) {
+    // strncpy pads a short w with NULs, so a NUL inside the window means
+    // the caller supplied fewer than fixed.length characters.
+    if (buff [pos] == '\0') {
+      fprintf (stderr, "ERROR: String \"%s\" too short in Score_Window\n", buff);
+      exit (-1);
+    }
+    total += Full_Window_Prob (fixed.sub_model[pos], buff, 0);
+    pos ++;
+  }
+
+  return total;
+}
+
+// Print msg, then the source location (when src_fname is not NULL and
+// line_num is not 0) and the errno value to stderr, and exit with an error.
+void Clean_Exit
+    (const char * msg, const char * src_fname, size_t line_num)
+{
+  // Take errno before printing anything: the fprintf calls below may change
+  // it, and the value worth reporting is the one the failed call left.
+  const int saved_errno = errno;
+
+  fprintf (stderr, "%s\n", msg);
+  if (src_fname != NULL)
+    fprintf (stderr, " in file %s", src_fname);
+  if (line_num != 0)
+    fprintf (stderr, " at line %lu", (long unsigned) (line_num));
+  fprintf (stderr, " errno = %d\n", saved_errno);
+
+  exit (-1);
+}
+
+// Return a zero-filled array of n entries of len bytes each.
+// A failed allocation is reported through Clean_Exit(), naming line
+// line_num of source file src_fname as the place the request came from,
+// so callers never have to test the result.
+void * Safe_calloc
+    (size_t n, size_t len, const char * src_fname, size_t line_num)
+{
+  void * mem = calloc (n, len);
+  char   msg [MAX_ERROR_MSG_LEN];
+
+  // Common case first: the allocation worked.
+  if (mem != NULL)
+    return mem;
+
+  // Allocation failed: describe the request that could not be met.
+  sprintf (msg,
+           "ERROR: calloc failed %lu x %lu",
+           (long unsigned) (n), (long unsigned) (len));
+  Clean_Exit (msg, src_fname, line_num);
+
+  // Not reached: Clean_Exit() ends the process.
+  return mem;
+}
+
+
+// Return len bytes of uninitialised heap memory.  A failed allocation is
+// reported through Clean_Exit(), naming line line_num of source file
+// src_fname as the place the request came from.
+void * Safe_malloc
+    (size_t len, const char * src_fname, size_t line_num)
+{
+  void * mem = malloc (len);
+  char   msg [MAX_ERROR_MSG_LEN];
+
+  // Common case first: the allocation worked.
+  if (mem != NULL)
+    return mem;
+
+  // Allocation failed: describe the request that could not be met.
+  sprintf (msg,
+           "ERROR: malloc failed %lu bytes",
+           (long unsigned) (len));
+  Clean_Exit (msg, src_fname, line_num);
+
+  // Not reached: Clean_Exit() ends the process.
+  return mem;
+}
+
+// Resize the block q to len bytes and return the (possibly moved) block.
+// A failed reallocation is reported through Clean_Exit(), naming line
+// line_num of source file src_fname as the place the request came from.
+void * Safe_realloc
+    (void * q, size_t len, const char * src_fname, size_t line_num)
+{
+  void * mem = realloc (q, len);
+  char   msg [MAX_ERROR_MSG_LEN];
+
+  // Common case first: the reallocation worked.
+  if (mem != NULL)
+    return mem;
+
+  // Reallocation failed: describe the request that could not be met.
+  sprintf (msg,
+           "ERROR: realloc failed %lu bytes",
+           (long unsigned) (len));
+  Clean_Exit (msg, src_fname, line_num);
+
+  // Not reached: Clean_Exit() ends the process.
+  return mem;
+}
+
+
+// Return a to the power b by repeated squaring (1 whenever b <= 0).
+int Int_Power(int a, int b)
+{
+  int result = 1;
+  int p = a;
+
+  while (b > 0) {
+    if (b & 1)
+      result *= p;
+    b >>= 1;
+    if (b > 0)       // the square after the last bit is never used and
+      p = p * p;     // could overflow int even though the result fits
+  }
+  return result;
+}
+
diff --git a/libsim4/sim4core/glimmerSplice.H b/libsim4/sim4core/glimmerSplice.H
new file mode 100644
index 0000000..d4a0a16
--- /dev/null
+++ b/libsim4/sim4core/glimmerSplice.H
@@ -0,0 +1,36 @@
+#ifndef GLIMMER_SPLICE_H
+#define GLIMMER_SPLICE_H
+
+struct ICM_Score_Node_t // one node of an ICM tree
+ {
+ short int mut_info_pos; // window position inspected at this node; -1 ends the descent, -2 marks a cut node
+ float mut_info; // not read or written by the code in glimmerSplice.C
+ float *prob; // was prob[ALPHABETSIZE]; now allocated separately per node by Input()
+ };
+
+struct ICM_t // one model as loaded by Input()
+{
+ int empty; // 1 while Input() is loading, 0 once it has finished
+ int model_len; // window length; the character at model_len-1 is the one scored
+ int model_depth; // maximum number of levels descended in Full_Window_Prob()
+ int periodicity; // number of node tables in score
+ int num_nodes; // number of nodes in each table
+ struct ICM_Score_Node_t **score; // indexed as score[period][node]
+};
+
+struct Fixed_Length_ICM_t // a sequence of `length` sub-models, as loaded by readModel()
+{
+ int length; // number of positions; size of permutation and sub_model
+ int max_depth; // read from the model file; not used further in glimmerSplice.C
+ int special_position; // read from the model file; exposed by getSpecialPosition()
+ int model_type; // read from the model file; exposed by getModelType()
+ int * permutation; // reordering applied to the window by Score_Window(); may be NULL
+ struct ICM_t *sub_model; // one ICM per position
+};
+
+extern void readModel(struct Fixed_Length_ICM_t *fixed, const char *path); // loads a model file; exits on any error
+extern double Score_Window (struct Fixed_Length_ICM_t fixed, char * w, int left); // sum of per-position scores; left is ignored
+extern int getModelLength(struct Fixed_Length_ICM_t fixed); // returns fixed.length
+
+#endif /* GLIMMER_SPLICE_H */
+
diff --git a/libsim4/sim4core/greedy.C b/libsim4/sim4core/greedy.C
new file mode 100644
index 0000000..ff1a2f8
--- /dev/null
+++ b/libsim4/sim4core/greedy.C
@@ -0,0 +1,358 @@
+#include "sim4.H"
+
+//#define ANNOUNCEEXIT(S) fprintf(stdout, S);
+#define ANNOUNCEEXIT(S)
+// Align s1[0..m) with s2[0..n0) by two greedy searches (one from each end); returns the cost found, or a value above MAX_D when none is.
+int
+Sim4::greedy(char *s1, char *s2, int m, int n0, int OFFSET1, int OFFSET2, Exon **lblock, Exon **rblock)
+{
+ int col, /* column number */
+ d, /* current distance */
+ k, /* current diagonal */
+ Cost,
+ blower,flower, /* boundaries for searching diagonals */
+ bupper,fupper,
+ row; /* row number */
+ int flip = 0; /* swap sequences for narrow gaps with interspecies */
+ int max_d; /* bound on size of edit script */
+ int back, forth; /* backward and forward limits at exit */
+
+ int *blast_d, *flast_d; /* rows containing the last d (at crt step, d-1) */
+ int *btemp_d, *ftemp_d; /* rows containing tmp values for the last d */
+ int *min_row, *min_diag; /* min (b)/ max (f) row (and diagonal) */
+ int *max_row, *max_diag; /* reached for cost d=0, ... m. */
+
+ const int MAX_D = max_d = MAX(wordSize,(int)(globalParams->_percentError * m + 1)); // most differences that will ever be tried
+
+ if (n0 < m) {
+ if (m < (int)MIN(wordSize, (1 + globalParams->_percentError) * n0)) {
+ *lblock = *rblock = _exonManager.newExon(OFFSET2+1,OFFSET1+1,OFFSET2+n0,OFFSET1+m,
+ m,n0-m+(int)(globalParams->_percentError * m + 1),0,NULL);
+ ANNOUNCEEXIT("greedy-1\n");
+ return(m-n0+(int)(globalParams->_percentError * n0 + 1));
+ } else if (m > (int)MIN(wordSize, (1 + globalParams->_percentError) * n0)) {
+ if (globalParams->_interspecies) {
+ /* flip coordinates */
+ d = m; m = n0; n0 = d;
+ d = OFFSET1; OFFSET1 = OFFSET2; OFFSET2 = d;
+ char *s = s1; s1 = s2; s2 = s;
+
+ flip = 1;
+ } else {
+ *lblock = *rblock = 0L;
+ ANNOUNCEEXIT("greedy-2\n");
+ return(MAX_D+1);
+ }
+ }
+ }
+
+ const int n1 = MIN(m+max_d+1, n0); // only this many characters of s2 are examined from either end
+ const int n2 = n1;
+ const int DELTA = n2-m;
+
+ const int l_offset1 = OFFSET1;
+ const int r_offset1 = OFFSET1;
+ const int l_offset2 = OFFSET2;
+ const int r_offset2 = OFFSET2 + n0 - n2;
+
+ const int L_ORIGIN = MAX_D;
+ const int R_ORIGIN = MAX_D - DELTA;
+
+ const char *l_s1 = s1;
+ const char *r_s1 = s1;
+ const char *l_s2 = s2;
+ const char *r_s2 = s2 + n0 - n2; // the last n2 characters of s2, used by the backward search
+
+ // Backward search setup: strip the exactly matching tails first.
+ for (row=m, col=n2; row>0 && col>0 && (r_s1[row-1]==r_s2[col-1]); row--,col--)
+ /*LINTED empty loop body*/;
+
+ if (row == 0) {
+ /* hit last row; stop search */
+ if (flip) {
+ d = m; m = n0; n0 = d;
+ d = OFFSET1; OFFSET1 = OFFSET2; OFFSET2 = d;
+ char *s = s1; s1 = s2; s2 = s;
+ }
+ *lblock = *rblock = _exonManager.newExon(r_offset2-m+n2+1,r_offset1+1,r_offset2+n2,
+ r_offset1+m,m,0,0,NULL);
+ ANNOUNCEEXIT("greedy-3\n");
+ return 0;
+ }
+
+
+ // Instead of doing eight calls to ckalloc, we do one, and dish out
+ // that in pieces.
+ //
+
+ int *allocdSpace = (int *)ckalloc((4*(MAX_D+n2+1) + 4*(MAX_D+1)) * sizeof(int));
+
+ blast_d = allocdSpace; // MAX_D+n2+1
+ btemp_d = blast_d + (MAX_D+n2+1); // MAX_D+n2+1
+ flast_d = btemp_d + (MAX_D+n2+1); // MAX_D+n2+1
+ ftemp_d = flast_d + (MAX_D+n2+1); // MAX_D+n2+1
+ max_row = ftemp_d + (MAX_D+n2+1); // MAX_D+1
+ min_row = max_row + (MAX_D+1); // MAX_D+1
+ max_diag = min_row + (MAX_D+1); // MAX_D+1
+ min_diag = max_diag + (MAX_D+1); // MAX_D+1
+
+ // Backward arrays start at m+1, which is larger than any real row.
+ for (k=0; k<=MAX_D+n2; ++k) {
+ blast_d[k] = m+1;
+ btemp_d[k] = m+1;
+ }
+
+ blast_d[R_ORIGIN+DELTA] = row;
+
+ blower = R_ORIGIN + DELTA - 1;
+ bupper = R_ORIGIN + DELTA + 1;
+
+ // Forward search setup: strip the exactly matching heads.
+ for (row=0; row<n1 && row<m && (l_s1[row]==l_s2[row]); row++)
+ /*LINTED empty loop body*/;
+
+ if (row == m) {
+ /* hit last row; stop search */
+ if (flip) {
+ d = m; m = n0; n0 = d;
+ d = OFFSET1; OFFSET1 = OFFSET2; OFFSET2 = d;
+ char *s = s1; s1 = s2; s2 = s;
+ }
+ *lblock = *rblock = _exonManager.newExon(l_offset2+1,l_offset1+1,l_offset2+m,
+ l_offset1+m,m,0,0,NULL);
+ ckfree(allocdSpace);
+
+ ANNOUNCEEXIT("greedy-4\n");
+ return 0;
+ }
+
+ for (k=0; k<=MAX_D+n1; ++k) {
+ flast_d[k]=-1;
+ ftemp_d[k]=-1;
+ }
+ flast_d[L_ORIGIN] = row;
+
+ flower = L_ORIGIN - 1;
+ fupper = L_ORIGIN + 1;
+
+ for (d=1; d<=MAX_D; d++) {
+ min_row[d] = m+1;
+ max_row[d] = -1;
+ }
+ min_row[0] = blast_d[R_ORIGIN+DELTA];
+ min_diag[0] = R_ORIGIN+DELTA;
+ max_row[0] = flast_d[L_ORIGIN];
+ max_diag[0] = L_ORIGIN;
+
+ back = forth = -1; // stay negative until the two searches are seen to overlap
+
+ d = 1;
+ while (d <= max_d) {
+
+ /* for each relevant diagonal ... */
+ for (k = blower; k <= bupper; k++) {
+ /* get space for the next edit instruction */
+
+ /* find a d on diagonal k */
+ if (k==-d+DELTA+R_ORIGIN) {
+
+ /* move left from the last d-1 on diagonal k+1 */
+ row = blast_d[k+1];
+ }
+ else if (k==d+DELTA+R_ORIGIN) {
+
+ /* move up from the last d-1 on diagonal k-1 */
+ row = blast_d[k-1]-1;
+ } else if ((blast_d[k]<=blast_d[k+1]) &&
+ (blast_d[k]-1<=blast_d[k-1])) {
+
+ /* substitution */
+ row = blast_d[k]-1;
+
+ } else if ((blast_d[k-1]<=blast_d[k+1]-1) &&
+ (blast_d[k-1]<=blast_d[k]-1)) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = blast_d[k-1]-1;
+ } else {
+ /* move left from the last d-1 on diagonal k+1 */
+ row = blast_d[k+1];
+ }
+ /* code common to the three cases */
+ col = row + k - R_ORIGIN;
+
+ /* slide up the diagonal */
+ while (row > 0 && col > 0 && (r_s1[row-1]==r_s2[col-1])) {
+ --row;
+ --col;
+ }
+ btemp_d[k] = row;
+
+#if 0
+ if (row == 0 || col == 0)
+ max_d = d;
+#endif
+ } /* for k */
+
+ min_row[d] = btemp_d[DELTA+R_ORIGIN];
+ min_diag[d] = DELTA+R_ORIGIN;
+ for (k=blower; k<=bupper; ++k) {
+ blast_d[k] = btemp_d[k]; btemp_d[k] = m+1;
+ if (blast_d[k]<min_row[d]) {
+ min_row[d] = blast_d[k];
+ min_diag[d] = k;
+ }
+ }
+
+ /* record cell, if paths overlap with minimum combined cost */
+ /* obs: it suffices to search up to Cost=MIN(d-1,(max_d-d)) */
+ for (Cost=0; Cost<d; Cost++) {
+ if ((min_row[d]<=max_row[Cost]) &&
+ ((max_d > d+Cost) || (max_d==d+Cost && (forth<0)))) {
+ max_d = d+Cost;
+ back = d;
+ forth = Cost;
+ break;
+ }
+ }
+
+ --blower; ++bupper;
+
+ /* for each relevant diagonal ... */
+ for (k = flower; k <= fupper; k++) {
+ /* get space for the next edit instruction */
+
+ /* find a d on diagonal k */
+ if (k==-d+L_ORIGIN) {
+ /* move down from the last d-1 on diagonal k+1 */
+ row = flast_d[k+1]+1;
+
+ } else if (k==d+L_ORIGIN) {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = flast_d[k-1];
+
+ } else if ((flast_d[k]>=flast_d[k+1]) &&
+ (flast_d[k]+1>=flast_d[k-1])) {
+
+ /* substitution */
+ row = flast_d[k]+1;
+
+ } else if ((flast_d[k+1]+1>=flast_d[k-1]) &&
+ (flast_d[k+1]>=flast_d[k])) {
+
+ /* move left from the last d-1 on diagonal k+1 */
+ row = flast_d[k+1]+1;
+ } else {
+ /* move right from the last d-1 on diagonal k-1 */
+ row = flast_d[k-1];
+ }
+ /* code common to the three cases */
+ col = row + k - L_ORIGIN;
+ /* slide down the diagonal */
+ if (row>=0)
+ while (row < m && col < n1 && (l_s1[row]==l_s2[col])) {
+ ++row;
+ ++col;
+ }
+ ftemp_d[k] = row;
+
+#if 0
+ if (row == m || col == n1)
+ max_d = d;
+#endif
+ } /* for k */
+
+ max_row[d] = ftemp_d[L_ORIGIN];
+ max_diag[d] = L_ORIGIN;
+ for (k=flower; k<=fupper; ++k) {
+ flast_d[k] = ftemp_d[k]; ftemp_d[k] = -1;
+ if (flast_d[k]>max_row[d]) {
+ max_row[d] = flast_d[k];
+ max_diag[d] = k;
+ }
+ }
+
+ /* record backward and forward limits, if minimum combined
+ * cost in overlapping. Note: it suffices to search up to
+ * Cost=MIN(d,(max_d-d)).
+ */
+ for (Cost=0; Cost<=d; Cost++) {
+ if ((min_row[Cost]<=max_row[d]) &&
+ ((max_d>d+Cost) || (max_d==d+Cost && (forth<0)))) {
+ max_d = d+Cost;
+ back = Cost;
+ forth = d;
+ break;
+ }
+ }
+ --flower;
+ ++fupper;
+
+ ++d; /* for d */
+ }
+
+ if (d>MAX_D) {
+ *lblock = *rblock = NULL;
+ ckfree(allocdSpace);
+ ANNOUNCEEXIT("greedy-5\n");
+ return d;
+ }
+
+ // XXX: Quick fix!  No overlap of the two searches was recorded, so give up,
+ // releasing the scratch space first (this exit used to return without freeing it).
+ if ((back < 0) || (forth < 0)) {
+ *rblock = *lblock = 0L;
+ fprintf(stdout, "Choke!\n");
+ ckfree(allocdSpace);
+ return(MAX_D+1);
+ }
+
+ if (flip) {
+ /* Cost is within allocated limit */
+ d = m; m = n0; n0 = d;
+ d = OFFSET1; OFFSET1 = OFFSET2; OFFSET2 = d;
+ char *s = s1; s1 = s2; s2 = s;
+ *lblock = *rblock = _exonManager.newExon(OFFSET2+1,OFFSET1+1,OFFSET2+n0,OFFSET1+m,m,back+forth,0,NULL);
+
+ ckfree(allocdSpace);
+ ANNOUNCEEXIT("greedy-6\n");
+
+ return back+forth;
+ }
+ if (m-min_row[back]>=max_row[forth]) {
+
+ if ((r_offset2+1+min_diag[back]-R_ORIGIN) <
+ (l_offset2+max_diag[forth]-L_ORIGIN)) {
+ *rblock = *lblock = _exonManager.newExon(l_offset2+1,l_offset1+1,
+ l_offset2+n0,l_offset1+m,
+ m,back+forth,0,NULL);
+ } else {
+ *rblock = _exonManager.newExon(r_offset2+1+min_row[back]+min_diag[back]-R_ORIGIN,
+ r_offset1+1+min_row[back],
+ r_offset2+n2,r_offset1+m,
+ m-min_row[back],back,0,NULL);
+ *lblock = _exonManager.newExon(l_offset2+1,l_offset1+1,
+ l_offset2+min_row[back]+max_diag[forth]-L_ORIGIN,
+ l_offset1+min_row[back],
+ min_row[back],forth,0,*rblock);
+ }
+ } else {
+ if ((r_offset2+1+min_diag[back]-R_ORIGIN) <
+ (l_offset2+max_diag[forth]-L_ORIGIN)) {
+ *rblock = *lblock = _exonManager.newExon(l_offset2+1,l_offset1+1,
+ l_offset2+n0,l_offset1+m,
+ m,back+forth,0,NULL);
+ } else {
+ *rblock = _exonManager.newExon(r_offset2+1+max_row[forth]+min_diag[back]-R_ORIGIN,
+ r_offset1+1+max_row[forth],
+ r_offset2+n2,r_offset1+m,m-max_row[forth],back,0,NULL);
+ *lblock = _exonManager.newExon(l_offset2+1,l_offset1+1,
+ l_offset2+max_row[forth]+max_diag[forth]-L_ORIGIN,
+ l_offset1+max_row[forth],max_row[forth],forth,0,*rblock);
+ }
+ }
+
+ ckfree(allocdSpace);
+
+ ANNOUNCEEXIT("greedy-7\n");
+ return back+forth;
+}
diff --git a/libsim4/sim4core/mspManager.C b/libsim4/sim4core/mspManager.C
new file mode 100644
index 0000000..aa6bf71
--- /dev/null
+++ b/libsim4/sim4core/mspManager.C
@@ -0,0 +1,628 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <math.h>
+#include <limits.h>
+#include "sim4.H"
+
+#define DEFAULT_L 8
+
+
+ // Build an empty manager.  Every scalar starts at zero/false except
+ // _sorted (an empty list is trivially sorted) and the initial pool
+ // size.  The scoring values (_match, _imatchdiff, _vmatchdiff,
+ // _percentError) are placeholders: they must be supplied through
+ // setParameters() before doLinking() is used, which aborts otherwise.
+ //
+ // _tooManyMSPs, _cDNALength, _mspLimitPercent and _mspLimitAbsolute
+ // exist to abort expensive polishes -- large chunks of genomic
+ // labeled as cDNA that produce signals across entire scaffolds.
+ //
+ mspManager::mspManager()
+   : _sorted(true),
+     _ESTlen(0),
+     _GENlen(0),
+     _allocMSPs(16384),
+     _numMSPs(0),
+     _allMSPs(0L),
+     _tooManyMSPs(false),
+     _cDNALength(0),
+     _mspLimitPercent(0.0),
+     _mspLimitAbsolute(0),
+     _match(0),
+     _imismatch(0),
+     _vmismatch(0),
+     _imatchdiff(0),
+     _vmatchdiff(0),
+     _percentError(0.0),
+     _wordExtAllow(0),
+     _exonManager(0L),
+     _minMSPScore(0),
+     _diagMax(0),
+     _diagExt(0L) {
+
+   // The pool is allocated here, after _allocMSPs is known to be set,
+   // so the allocation does not depend on member declaration order.
+   _allMSPs = new msp [_allocMSPs];
+ }
+
+
+ // Release the two arrays owned by the manager.  Both come from
+ // new[]; the order in which they are released does not matter.
+ mspManager::~mspManager() {
+   delete [] _diagExt;
+   delete [] _allMSPs;
+ }
+
+
+
+ // Count mismatching characters between seq1[f1+1 .. t1+1] and
+ // seq2[f2+1 .. t2+1], comparing position by position and stopping as
+ // soon as either range is exhausted.  The +1 is needed because, at
+ // this stage, MSP positions do not yet have the +1 added.
+ //
+ static
+ int
+ get_edist(int f1, int f2,
+           int t1, int t2,
+           char *seq1,
+           char *seq2) {
+   const char *cur1  = seq1 + f1 + 1;
+   const char *cur2  = seq2 + f2 + 1;
+   const char *last1 = seq1 + t1 + 1;
+   const char *last2 = seq2 + t2 + 1;
+
+   int mismatches = 0;
+
+   for (; (cur1 <= last1) && (cur2 <= last2); ++cur1, ++cur2)
+     if (*cur1 != *cur2)
+       mismatches++;
+
+   return mismatches;
+ }
+
+
+ // qsort() comparator for msp records: order by pos2 first, then by
+ // pos1.  Returns -1, 0 or 1.
+ //
+ static
+ int
+ mspManager_msp_compare(const void *A, const void *B) {
+   const msp *lhs = (const msp *)A;
+   const msp *rhs = (const msp *)B;
+
+   if (lhs->pos2 != rhs->pos2)
+     return((lhs->pos2 < rhs->pos2) ? -1 : 1);
+
+   if (lhs->pos1 != rhs->pos1)
+     return((lhs->pos1 < rhs->pos1) ? -1 : 1);
+
+   return(0);
+ }
+
+
+ // Binary search of the ascending table log4s[0..n-1] for 'len'.
+ //
+ // With a = n/2 as the probe of the current window:
+ //   - if len lies in [log4s[a-1], log4s[a]) (or a is the first entry)
+ //     the result is max(0, a-1) + offset;
+ //   - if len lies in [log4s[a], log4s[a+1]) (or a is the last entry)
+ //     the result is min(n-1, a+1) + offset;
+ //   - otherwise the search continues in the left or right part of the
+ //     window, with 'offset' tracking the start of the right part.
+ //
+ // 'offset' is the index of log4s[0] in the caller's full table; the
+ // top-level call passes 0.
+ //
+ // The tables contain repeated values.  Previously a probe landing on
+ // the first of two equal entries with len equal to that value matched
+ // no branch and returned -1 (e.g. a genomic length of exactly 2240),
+ // which the caller used as if it were a valid index.  Such a probe now
+ // continues to the right, giving the same answer as len+1.
+ //
+ static
+ int find_log_entry(const int *log4s, int n, int len, int offset)
+ {
+   for (;;) {
+     int a = n/2;
+
+     if (len < log4s[a]) {
+       if ((a == 0) || (len >= log4s[a-1]))
+         return ((a > 0) ? (a-1) : 0) + offset;
+
+       // len is below log4s[a-1]; keep only the entries before it.
+       n = a-1;
+     } else {
+       if ((a == n-1) || (len < log4s[a+1]))
+         return ((a+1 < n-1) ? (a+1) : (n-1)) + offset;
+
+       // len >= log4s[a+1] (this includes the duplicate-entry case);
+       // continue with the entries after the probe.
+       log4s  += a+1;
+       offset += a+1;
+       n      -= a+1;
+     }
+   }
+ }
+
+ Exon*
+ mspManager::doLinking(int weight,
+ int drange,
+ int offset1,
+ int offset2,
+ int flag,
+ int relinkFlag,
+ char *s1,
+ char *s2) {
+ /* Link the stored MSPs into the best-scoring chain and turn that
+  * chain into a list of exons.
+  *
+  *   weight      multiplier applied to an MSP's own score when it is
+  *               added to its chain score
+  *   drange      largest diagonal shift, in either direction, that is
+  *               accepted as a plain gap; a larger positive shift is
+  *               accepted only if it is at least MIN_INTRON
+  *   offset1     added to the GEN coordinates of every returned exon
+  *   offset2     added to the EST coordinates of every returned exon
+  *   flag        stored in every returned exon
+  *   relinkFlag  when nonzero, diagonal shifts above 100000 are
+  *               penalized logarithmically instead of linearly
+  *   s1, s2      sequences passed to get_edist() when merged MSPs
+  *               overlap; s1 is indexed with pos1 (GEN) coordinates,
+  *               s2 with pos2 (EST) coordinates
+  *
+  * Returns the head of the exon list.  Returns 0 if no MSP could be
+  * selected, or if both MSP limits are exceeded (in which case
+  * _tooManyMSPs is set).  Calls exit(1) when the scoring parameters
+  * all still hold their constructor defaults.
+  */
+ // Ensure the MSP's are sorted
+ //
+ if (_sorted == false)
+ qsort(_allMSPs, _numMSPs, sizeof(struct msp), mspManager_msp_compare);
+
+ _sorted = true;
+
+ //
+ // Assumes the exon list is cleared
+ //
+
+ // If this ever occurs, you (the programmer) forgot to call
+ // mspManager::setParameters() with the correct values. Unless the
+ // code was really hacked, this should never occur. See
+ // Sim4::Sim4().
+ //
+ if ((_match == 0) &&
+ (_imatchdiff == 0) &&
+ (_vmatchdiff == 0) &&
+ (_percentError == 0.0)) {
+ fprintf(stderr, "sim4::link()-- ERROR; mspManager parameters not set! This is an algorithm error.\n");
+ exit(1);
+ }
+
+ // Check if this match looks suspiciously expensive
+ //
+ if ((_cDNALength > 0) &&
+ (_mspLimitAbsolute > 0) && (_mspLimitAbsolute < _numMSPs) &&
+ (_mspLimitPercent > 0.0) && (_mspLimitPercent * _cDNALength < _numMSPs)) {
+ _tooManyMSPs = true;
+ return(0L);
+ }
+
+
+
+ int f1, f2, best, diag, diff_diag, best_sc, tryval;
+
+ best = -1;
+ best_sc = INT_MIN;
+
+#if 0
+ for (uint32 i = 0; i < _numMSPs; ++i) {
+ fprintf(stderr, "LINK MSP %d -- %d-%d %d-%d score=%d,%d\n",
+ i,
+ _allMSPs[i].pos1, _allMSPs[i].pos1 + _allMSPs[i].len,
+ _allMSPs[i].pos2, _allMSPs[i].pos2 + _allMSPs[i].len,
+ _allMSPs[i].score, _allMSPs[i].linkingScore);
+ }
+#endif
+
+ for (uint32 i = 0; i < _numMSPs; ++i) {
+ f1 = _allMSPs[i].pos1; /* start position in seq1 */
+ f2 = _allMSPs[i].pos2; /* start position in seq2 */
+ diag = f1 - f2;
+ _allMSPs[i].prev = -1;
+ _allMSPs[i].linkingScore = 0;
+
+#ifdef SHOW_LINKING
+ fprintf(stderr, "link %d\r", i);
+ fflush(stderr);
+#endif
+
+ for (uint32 j = 0; j < i; ++j) {
+
+ // 12 == default word size. A Magic Value.
+ int WS = 12;
+
+ int vL = DEFAULT_L; // allowed overlap between MSP j and MSP i; doubled below when i starts and ends well past j
+ if ((_allMSPs[i].pos2 + _allMSPs[i].len - _allMSPs[j].pos2 - _allMSPs[j].len > 2 * WS) &&
+ (_allMSPs[i].pos2 - _allMSPs[j].pos2 > 2 * WS))
+ vL *= 2;
+
+ diff_diag = diag - _allMSPs[j].pos1 + _allMSPs[j].pos2;
+
+ // Abort if the difference is too big
+ //
+ if ((diff_diag < -drange) ||
+ ((diff_diag > drange) && (diff_diag < MIN_INTRON)) ||
+ (_allMSPs[j].pos2 + _allMSPs[j].len - 1 - f2 > vL) ||
+ (_allMSPs[j].pos1 + _allMSPs[j].len - 1 - f1 > vL))
+ continue;
+
+ int n = abs(diff_diag);
+ tryval = _allMSPs[j].linkingScore - n;
+ if (relinkFlag)
+ tryval = _allMSPs[j].linkingScore - ((n <= 100000) ? n : (100000+(int)(10*log((double)(n-100000)))));
+
+ if (tryval > _allMSPs[i].linkingScore) {
+ _allMSPs[i].linkingScore = tryval;
+ _allMSPs[i].prev = j;
+ }
+ }
+ _allMSPs[i].linkingScore += (weight * _allMSPs[i].score);
+ if (_allMSPs[i].linkingScore > best_sc) {
+ best = i;
+ best_sc = _allMSPs[i].linkingScore;
+ }
+ }
+
+ if (best < 0)
+ return(0L);
+
+ int last_msp = best;
+ int diag_dist;
+ int diff;
+
+ msp *mp = _allMSPs + last_msp;
+ Exon *elist = _exonManager->newExon(mp->pos1,
+ mp->pos2,
+ mp->pos1+mp->len-1,
+ mp->pos2+mp->len-1,
+ -1,
+ (mp->len * _match - mp->score) / _vmatchdiff,
+ 0,
+ 0L);
+
+ last_msp = mp->prev;
+
+ while (last_msp >= 0) { // trace the chain back through the prev links
+ mp = _allMSPs + last_msp;
+
+ int l1 = elist->frEST - elist->frGEN;
+ int l2 = mp->pos2 - mp->pos1;
+
+ if (l1 > l2)
+ diag_dist = l1 - l2;
+ else
+ diag_dist = l2 - l1;
+
+ if ((diag_dist <= DEFAULT_L) &&
+ (elist->frEST - (mp->pos2 + mp->len - 1)) < MAX_INTERNAL_GAP) {
+ /* merge with previous exon */
+ elist->edist += diag_dist;
+ elist->edist += (mp->len * _match - mp->score) / _vmatchdiff;
+ if ((diff=mp->pos2+mp->len-elist->frEST)>0) { /* overlap */
+ int dist1, dist2;
+ dist1 = get_edist(elist->frGEN,mp->pos2+mp->len-diff,
+ elist->frGEN+diff-1,mp->pos2+mp->len-1,s1,s2);
+ dist2 = get_edist(mp->pos1+mp->len-diff,mp->pos2+mp->len-diff,
+ mp->pos1+mp->len-1,mp->pos2+mp->len-1,s1,s2);
+ elist->edist -= MAX(dist1,dist2);
+ } else if (diff<0) { /* gap */
+ elist->edist += (int)(0.5 * _percentError * (-1) * diff);
+ }
+ elist->toGEN = MAX(elist->toGEN,mp->pos1+mp->len-1);
+ elist->toEST = MAX(elist->toEST,mp->pos2+mp->len-1);
+ elist->frGEN = MIN(elist->frGEN,mp->pos1);
+ elist->frEST = MIN(elist->frEST,mp->pos2);
+ } else {
+ elist = _exonManager->newExon(mp->pos1,
+ mp->pos2,
+ mp->pos1+mp->len-1,
+ mp->pos2+mp->len-1,
+ -1,
+ (mp->len * _match - mp->score) / _vmatchdiff,
+ 0,
+ elist);
+ }
+
+ last_msp = mp->prev;
+ }
+
+
+
+
+ // Finalize every exon in the list: set its EST length, shift the
+ // coordinates by offset1 (GEN) / offset2 (EST) and store the flag.
+ Exon *tmp_block = elist;
+ while (tmp_block != 0L) {
+ tmp_block->length = tmp_block->toEST-tmp_block->frEST+1;
+ tmp_block->toGEN += offset1;
+ tmp_block->frGEN += offset1;
+ tmp_block->toEST += offset2;
+ tmp_block->frEST += offset2;
+ tmp_block->flag = flag;
+
+ tmp_block = tmp_block->next_exon;
+ }
+
+
+ return(elist);
+}
+
+
+
+
+
+
+
+
+// The log4 arrays were computed to mimic the behaviour of the log formula
+// for computing the msp threshold in exon_cores(). For genomic_log4s,
+// entry i stores the value for the length of a genomic sequence
+// for which the contribution to the msp threshold is i/2, i.e.:
+// 1.4*log_4(3/4*len1) = i/2;
+//
+// Similarly, cDNA_log4s entries store lengths of the cDNA sequence for which
+// the contribution to the msp threshold is i/2, i.e.:
+// 1.4*log_4(len2) = i/2;
+//
+// Both arrays are sorted in increasing order, and can be searched with
+// binary search.
+// NOTE(review): genomic_log4s repeats 1365 and 2240, and cDNA_log4s repeats 1 -- confirm the repeats are intended.
+#define GEN_LOG4_ENTRIES 45
+#define CDNA_LOG4_ENTRIES 25
+
+const int
+genomic_log4s[GEN_LOG4_ENTRIES]= {1, 2, 3, 5, 9, 15, 26, 42, 70, 114,
+ 188, 309, 507, 832, 1365, 1365, 2240, 2240, 3675, 6029,
+ 9892, 16231, 26629, 43690, 71681,
+ 117606, 192953, 316573, 519392, 852152,
+ 1398101, 2293823, 3763409, 6174516, 10130347,
+ 16620564, 27268873, 44739242, 73402365, 120429110,
+ 197584514, 324171126, 531858072, 872603963, 1431655765 };
+
+const int
+cDNA_log4s[CDNA_LOG4_ENTRIES]= {1, 1, 2, 4, 7, 11, 19, 32, 52, 86,
+ 141, 231, 380, 624, 1024, 1680, 2756, 4522, 7419, 12173,
+ 19972, 32768, 53761, 88204, 144715 };
+
+#if 0
+// The original used a binary search but with so few entries brute
+// force works better.
+// LLL 4/9/2009: does not return the same result as the original,
+// and gives false positive matches for interspecies comparisons;
+// restored original version
+// This linear-scan variant is compiled out by the #if 0 and kept for reference only.
+int
+get_msp_threshold(int len1, int len2) {
+ int i, j;
+
+ // Find the index of the largest value smaller than our lengths.
+ //
+ i = 0;
+ while (i<GEN_LOG4_ENTRIES) {
+ if (genomic_log4s[i] > len1)
+ break;
+ i++;
+ }
+ i--;
+
+ j = 0;
+ while (j<CDNA_LOG4_ENTRIES) {
+ if (cDNA_log4s[j] > len2)
+ break;
+ j++;
+ }
+ j--;
+
+ //
+ // XXX: This looks suspicious!
+ //
+
+ if ((i % 2) == 0)
+ return(i/2+j/2);
+
+ if ((j % 2) == 0)
+ return(i/2+j/2);
+
+ return(i/2+j/2+1);
+}
+#endif
+
+
+ // Combine the table positions of the genomic length (len1) and the
+ // cDNA length (len2) into an MSP score threshold.  Each table index
+ // stands for half a unit, so the threshold is the sum of the two
+ // halved indices; when both indices are odd the two discarded halves
+ // add one more unit.
+ //
+ int get_msp_threshold(int len1, int len2)
+ {
+   const int genIdx  = find_log_entry(genomic_log4s, GEN_LOG4_ENTRIES, len1, 0);
+   const int cdnaIdx = find_log_entry(cDNA_log4s, CDNA_LOG4_ENTRIES, len2, 0);
+
+   int threshold = genIdx/2 + cdnaIdx/2;
+
+   if ((genIdx % 2) && (cdnaIdx % 2))
+     threshold++;
+
+   return threshold;
+ }
+
+
+ // Set the minimum score an extended hit needs to be kept as an MSP.
+ //
+ // A positive K is used as given.  Otherwise the threshold is derived
+ // from the current genomic and EST lengths; for same-species runs
+ // (interspecies == 0) a non-negative result is lowered by one to
+ // compensate for the rounding in the log formula.
+ //
+ void
+ mspManager::setScoreThreshold(int K, int interspecies) {
+
+   if (K > 0) {
+     _minMSPScore = K;
+     return;
+   }
+
+   _minMSPScore = get_msp_threshold(_GENlen, _ESTlen);
+
+   if (!interspecies && (_minMSPScore >= 0))
+     _minMSPScore--;
+ }
+
+ // Extend a seed hit in both directions along its diagonal, score the
+ // result, and store it as an MSP if the score reaches _minMSPScore.
+ //
+ //   genSeq, estSeq   the two sequences; position p is read at seq+1+p
+ //   genLen, estLen   their lengths; estLen also enters the diagonal index
+ //   genPos, estPos   position just past the seed in each sequence (the
+ //                    seed occupies the MSS.seedLength bases before it)
+ //   MSS              seed description; only seedLength is used outside
+ //                    the debug sections
+ //
+ // Extension in either direction stops once the running score falls
+ // more than _wordExtAllow below the best score seen so far.  The end
+ // of the extension is recorded in _diagExt so that addHit() can skip
+ // later seeds on the same diagonal that were already covered.
+ //
+ // Changes from the previous revision: the right extension now tests
+ // the sequence bounds before dereferencing the cursors, and the
+ // debug-only sections compile again (stray parenthesis, undeclared
+ // 'W' and 's').
+ //
+ void
+ mspManager::addHit_(char *genSeq, char *estSeq,
+                     int genLen, int estLen,
+                     int genPos, int estPos,
+                     mss_t &MSS) {
+   char *genBeg = 0L;
+   char *estBeg = 0L;
+   char *genEnd = 0L;
+   char *genTmp = 0L;
+   char *estTmp = 0L;
+   int right_sum = 0;
+   int middle_sum = 0;
+   int left_sum = 0;
+   int sum = 0;
+   int score = 0;
+
+#ifdef DEBUG_EXTENSION
+   fprintf(stderr, "mspManager::addHit()-- extending hit from GEN %d to %d and EST %d to %d (length = %d)\n",
+           genPos-MSS.seedLength, genPos, estPos-MSS.seedLength, estPos, MSS.seedLength);
+#endif
+
+#ifdef DEBUG_EXTENSION
+   {
+     char L[41], M[41], R[41];
+     int x;
+
+     if (genPos-MSS.seedLength > 20) genTmp = genSeq + 1 + genPos - MSS.seedLength - 20;
+     else genTmp = genSeq + 1;
+
+     x=0;
+     while (genTmp < genSeq + 1 + genPos - MSS.seedLength)
+       L[x++] = *genTmp++;
+     L[x] = 0;
+     x=0;
+     while (genTmp < genSeq + 1 + genPos)
+       M[x++] = *genTmp++;
+     M[x] = 0;
+     x=0;
+     while (genTmp < genSeq + 1 + genPos + 20)
+       R[x++] = *genTmp++;
+     R[x] = 0;
+     fprintf(stderr, "GEN=%8d %s:%s:%s\n", genPos, L, M, R);
+
+     if (estPos-MSS.seedLength > 20) estTmp = estSeq + 1 + estPos - MSS.seedLength - 20;
+     else estTmp = estSeq + 1;
+
+     x=0;
+     while (estTmp < estSeq + 1 + estPos - MSS.seedLength)
+       L[x++] = *estTmp++;
+     L[x] = 0;
+     x=0;
+     while (estTmp < estSeq + 1 + estPos)
+       M[x++] = *estTmp++;
+     M[x] = 0;
+     x=0;
+     while (estTmp < estSeq + 1 + estPos + 20)
+       R[x++] = *estTmp++;
+     R[x] = 0;
+     fprintf(stderr, "EST=%8d %s:%s:%s\n", estPos, L, M, R);
+   }
+#endif
+
+   // We use diagonals directly -- original version offset the array of
+   // diagonal positions by the constant value included below.
+
+   // Extend to the right.  (The best score of this extension is kept
+   // in left_sum; the naming is historical.)
+   //
+   left_sum = 0;
+   sum = 0;
+   genTmp = genSeq + 1 + genPos;
+   estTmp = estSeq + 1 + estPos;
+   genEnd = genTmp;
+
+   // Bounds are tested first so a cursor that has run past its
+   // sequence is never dereferenced.
+   while ((estTmp <= estSeq + estLen) &&
+          (genTmp <= genSeq + genLen) &&
+          (*genTmp) &&
+          (*estTmp) &&
+          (sum >= left_sum - _wordExtAllow)) {
+
+     sum += _match;
+     if (*estTmp != *genTmp)
+       sum -= (transitionFunction(*estTmp, *genTmp) ? _imatchdiff : _vmatchdiff);
+
+     estTmp++;
+     genTmp++;
+     if (sum > left_sum) {
+       left_sum = sum;
+       genEnd = genTmp;
+     }
+   }
+
+#ifdef TEST_SEEDS_IN_EXTENSION
+   // Check the bases that the seed supposedly matched
+   //
+   middle_sum = 0;
+   sum = 0;
+   genTmp = genSeq + 1 + genPos - 1;
+   estTmp = estSeq + 1 + estPos - 1;
+
+   for (int x=0; x<MSS.seedLength; x++) {
+     middle_sum += _match;
+     if (*genTmp != *estTmp)
+       middle_sum -= (transitionFunction(*estTmp, *genTmp) ? _imatchdiff : _vmatchdiff);
+
+     //fprintf(stderr, "%c %c\n", *genTmp, *estTmp);
+
+     estTmp--;
+     genTmp--;
+   }
+
+   // NOTE(review): the member was spelled 'matchesLegth' here; it is
+   // now 'matchedLength' to agree with the DEBUG_MSPS section below --
+   // confirm against the mss_t declaration.
+   if (middle_sum != (MSS.matchedLength/2)) {
+     fprintf(stderr, "mspManager::addHit()-- ERROR: i didn't find an exact match for the seed you supplied!\n");
+     fprintf(stderr, "mspManager::addHit()-- ERROR: GEN=%40.40s\n", genTmp);
+     fprintf(stderr, "mspManager::addHit()-- ERROR: EST=%40.40s\n", estTmp);
+     exit(1);
+   }
+#endif
+
+   // Calculate the score of the seed match: _match for every seed
+   // position where the two sequences agree.
+   //
+   middle_sum = 0;
+   sum = 0;
+   genTmp = genSeq + 1 + genPos - 1;
+   estTmp = estSeq + 1 + estPos - 1;
+
+   for (int x=0; x<MSS.seedLength; x++) {
+     if (*genTmp == *estTmp) middle_sum += _match;
+
+     estTmp--;
+     genTmp--;
+   }
+
+
+   // Extend to the left.  (Best score kept in right_sum.)
+   //
+   right_sum = 0;
+   sum = 0;
+   genTmp = genSeq + 1 + genPos - MSS.seedLength;
+   estTmp = estSeq + 1 + estPos - MSS.seedLength;
+   genBeg = genTmp;
+   estBeg = estTmp;
+
+   while ((estTmp > estSeq + 1) &&
+          (genTmp > genSeq + 1) &&
+          (sum >= right_sum - _wordExtAllow)) {
+
+     estTmp--;
+     genTmp--;
+     sum += _match;
+     if (*estTmp != *genTmp)
+       sum -= (transitionFunction(*estTmp, *genTmp) ? _imatchdiff : _vmatchdiff);
+
+     if (sum > right_sum) {
+       right_sum = sum;
+       estBeg = estTmp;
+       genBeg = genTmp;
+     }
+   }
+
+   score = middle_sum + left_sum + right_sum;
+
+#ifdef DEBUG_MSPS
+
+   printf("TESTMSP: p1 = %7d p2 = %7d l = %7d sc = %7d (%d-%d-%d) ",
+          (int)(genBeg - (genSeq + 1)), (int)(estBeg - (estSeq + 1)), (int)(genEnd - genBeg),
+          score, left_sum, middle_sum, right_sum);
+   printf("g: ");
+   for (char *s=genBeg; s<genEnd; s++) printf("%c", *s);
+   printf(" c: ");
+   for (char *s=estBeg; s<estBeg+(int)(genEnd-genBeg); s++) printf("%c", *s);
+   printf(" S: %7d W: %7d cutoff: %d", MSS.seedLength, (int)(MSS.matchedLength/2), _minMSPScore);
+   printf("\n");
+
+#endif
+
+   // If this hit is significant, save it
+   //
+   if (score >= _minMSPScore)
+     addMSP((int)(genEnd - genBeg),
+            (int)(genBeg - (genSeq + 1)),
+            (int)(estBeg - (estSeq + 1)),
+            score);
+
+#ifdef DEBUG_EXTENSION
+   fprintf(stderr, "mspManager::addHit()-- added from GEN %d to %d and EST %d to ? (length = %d) with score %d (needed %d) l,m,r sums %d %d %d\n",
+           (int)(genBeg - (genSeq + 1)), (int)(genEnd - (genSeq + 1)) + MSS.seedLength,
+           (int)(estBeg - (estSeq + 1)),
+           MSS.seedLength,
+           score, _minMSPScore, left_sum, middle_sum, right_sum);
+#endif
+
+   // Remember the highest point that this diagonal has been extended
+   // to. We use this to short circuit useless mer extensions (if
+   // we've already extended through it).
+   //
+   _diagExt[estLen + genPos - estPos - 1] = (int)(genEnd - genSeq - 1 + MSS.seedLength);
+ }
diff --git a/libsim4/sim4core/mspManager.H b/libsim4/sim4core/mspManager.H
new file mode 100644
index 0000000..e954c6e
--- /dev/null
+++ b/libsim4/sim4core/mspManager.H
@@ -0,0 +1,237 @@
+#ifndef MSP_MANAGER_H
+#define MSP_MANAGER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include "util++.H"
+#include "exon.H"
+#include "sim4b1_s.H"
+
+ struct msp { // one maximal segment pair: an ungapped match between the two sequences
+ int len; // number of bases covered; the match spans pos .. pos+len-1 in both sequences
+ int pos1; // start position in seq1 (the genomic sequence)
+ int pos2; // start position in seq2 (the EST / cDNA sequence)
+ int score; // extension score supplied when the MSP was added
+ int linkingScore; // best chain score ending at this MSP; computed by doLinking()
+ int prev; // index of the preceding MSP in that chain; doLinking() resets it to -1 (none)
+ };
+
+
+//
+// How to handle memory allocation?
+//
+// Just use an array of msp objects, reallocate
+// when needed. Allocate a large number of these
+// initially.
+//
+ // Collects seed hits, extends them into MSPs, and links the MSPs
+ // into exons.  Owns two heap arrays: the MSP pool (_allMSPs) and the
+ // per-diagonal extension record (_diagExt).
+ //
+ class mspManager {
+ public:
+   mspManager();
+   ~mspManager();
+
+
+   // Returns true if x=a and y=g   x=g and y=a
+   //                 x=c and y=t   x=t and y=c
+   // (either case), i.e. when the two bases differ by a transition.
+   //
+   // This used to be an array of size [256][256], that was initialized on each construction of this
+   // object. That was killing performance in snapper.
+   //
+   int transitionFunction(int x, int y) {
+     int xa = ((x == 'a') || (x == 'A'));
+     int xc = ((x == 'c') || (x == 'C'));
+     int xg = ((x == 'g') || (x == 'G'));
+     int xt = ((x == 't') || (x == 'T'));
+
+     int ya = ((y == 'a') || (y == 'A'));
+     int yc = ((y == 'c') || (y == 'C'));
+     int yg = ((y == 'g') || (y == 'G'));
+     int yt = ((y == 't') || (y == 'T'));
+
+     return((xa && yg) ||
+            (xg && ya) ||
+            (xc && yt) ||
+            (xt && yc));
+   };
+
+   // Store the scoring scheme.  The two "matchdiff" values are the
+   // score lost when a match is replaced by a transition (i) or by any
+   // other mismatch (v).
+   //
+   void setParameters(int match, int imismatch, int vmismatch, double percenterror, int wordextallow) {
+     _imismatch = imismatch;
+     _vmismatch = vmismatch;
+     _match = match;
+     _imatchdiff = match - imismatch;
+     _vmatchdiff = match - vmismatch;
+     _percentError = percenterror;
+     _wordExtAllow = wordextallow;
+   };
+
+   // The exonManager that doLinking() obtains new exons from.
+   //
+   void setExonSource(exonManager *em) {
+     _exonManager = em;
+   };
+
+   // Limits on the number of MSPs: an absolute count and a multiple of
+   // the cDNA length.  doLinking() gives up only when both are exceeded.
+   //
+   void setLimits(uint32 a, double p) {
+     _mspLimitAbsolute = a;
+     _mspLimitPercent = p;
+   };
+
+   void setScoreThreshold(int K, int interspecies);
+
+   bool tooManyMSPs(void) { return(_tooManyMSPs); };
+   uint32 numberOfMSPs(void) { return(_numMSPs); };
+
+   // cDNA length used together with the percent limit above.
+   //
+   void setLength(int l) {
+     _cDNALength = l;
+   };
+
+   // Forget all stored MSPs; the pool itself is kept.
+   //
+   void clear(void) { _numMSPs = 0; };
+
+   // Record the sequence lengths and reset the per-diagonal extension
+   // record, growing it first if it is too small.
+   //
+   void clearDiagonal(int genlen, int estlen) {
+
+     // XXX: These aren't always the real EST and GENOMIC lengths. If
+     // we are working in a subsequence of the whole sequence they
+     // will be the length of the subsequence.
+     //
+     _GENlen = genlen;
+     _ESTlen = estlen;
+
+     // Allocate more space, if needed.
+     //
+     if (_GENlen + _ESTlen + 1 > _diagMax) {
+
+       // _diagExt comes from new[], so it must be released with
+       // delete[] (as the destructor does).
+       delete [] _diagExt;
+
+       // Allocate space for the list of extension ends. Each diagonal
+       // remembers the lowest position that it has been extended to. This
+       // lets us throw out new hits without extending or merging in new
+       // extensions. Assumes that hits are added in order.
+       //
+       _diagMax = _GENlen + _ESTlen + 1;
+       _diagExt = new int [_diagMax];
+     }
+
+     // Using the obvious for loop for this hurts. Don't do it.
+     //
+     bzero(_diagExt, sizeof(int) * (_GENlen + _ESTlen + 1));
+   };
+
+   // add an extended MSP to the list
+   //
+   void addMSP(int l, int p1, int p2, int sc);
+
+   // add a single unextended hit to the list.
+   // this will do extensions if we haven't already extended through it.
+   //
+   void addHit(char *genSeq, char *estSeq,
+               int genLen, int estLen,
+               int genPos, int estPos,
+               mss_t &MSS) {
+
+#if 0
+     // We'd like to tie this into DEBUG_EXTENSION, but I want
+     // to keep those defines in the source file. Oh well.
+     //
+     fprintf(stderr, "mspManager::addHit()-- adding hit from GEN %d to %d and EST %d to %d (length = %d) diag=%d lim=%d have %d\n",
+             genPos-MSS.seedLength, genPos, estPos-MSS.seedLength, estPos, MSS.seedLength,
+             estLen + genPos - estPos - 1,
+             _diagExt[estLen + genPos - estPos - 1],
+             genPos);
+
+#endif
+
+     // Only extend if this diagonal has not already been extended
+     // past the hit.
+     if (_diagExt[estLen + genPos - estPos - 1] <= genPos)
+       addHit_(genSeq, estSeq, genLen, estLen, genPos, estPos, MSS);
+   };
+
+
+   Exon *doLinking(int weight,
+                   int drange,
+                   int offset1,
+                   int offset2,
+                   int flag,
+                   int relinkFlag,
+                   char *s1,
+                   char *s2);
+
+ private:
+   void addHit_(char *genSeq, char *estSeq,
+                int genLen, int estLen,
+                int genPos, int estPos,
+                mss_t &MSS);
+
+   bool _sorted;               // false once an MSP is added; doLinking() sorts on demand
+
+   int _ESTlen;
+   int _GENlen;
+
+   uint32 _allocMSPs;          // capacity of _allMSPs
+   uint32 _numMSPs;            // entries of _allMSPs in use
+   msp *_allMSPs;
+
+   bool _tooManyMSPs;
+   int _cDNALength;
+
+   double _mspLimitPercent;
+   uint32 _mspLimitAbsolute;
+
+   int _match;
+   int _imismatch;
+   int _vmismatch;
+   int _imatchdiff;
+   int _vmatchdiff;
+
+   double _percentError;
+   int _wordExtAllow;
+
+   exonManager *_exonManager;
+
+   int _minMSPScore;
+
+   int _diagMax;               // capacity of _diagExt
+   int *_diagExt;
+ };
+
+
+
+
+ // Append one extended MSP (length l, start p1 in the genomic
+ // sequence, start p2 in the EST, score sc) to the pool and mark the
+ // pool as unsorted.  The pool doubles in size when it is full.
+ //
+ inline
+ void
+ mspManager::addMSP(int l, int p1, int p2, int sc) {
+
+   // Allocate more MSPs, if we need to.
+   //
+   if (_numMSPs >= _allocMSPs) {
+     _allocMSPs *= 2;
+
+     msp *n = new msp [_allocMSPs];
+
+     for (uint32 i=0; i<_numMSPs; i++)
+       n[i] = _allMSPs[i];
+
+     // The old pool came from new[], so it must go through delete[].
+     delete [] _allMSPs;
+     _allMSPs = n;
+   }
+
+#ifdef DEBUG_MSPS
+   fprintf(stdout, "ADDMSP: p1=%8d p2=%8d l=%8d sc=%8d\n",
+           p1, p2, l, sc);
+#endif
+
+   msp &slot = _allMSPs[_numMSPs];
+
+   slot.len = l;
+   slot.pos1 = p1;
+   slot.pos2 = p2;
+   slot.score = sc;
+   slot.linkingScore = 0;
+   slot.prev = 0;
+
+   _numMSPs++;
+
+   _sorted = false;
+ }
+
+
+#endif // MSP_MANAGER_H
diff --git a/libsim4/sim4core/pluri_align.C b/libsim4/sim4core/pluri_align.C
new file mode 100644
index 0000000..89fdf45
--- /dev/null
+++ b/libsim4/sim4core/pluri_align.C
@@ -0,0 +1,324 @@
+#include <math.h>
+#include "sim4.H"
+
+
+// Condense_both_Ends -- merge contiguous operations of the same type
+// together; return both new ends of the chain.
+//
+ // Merge every run of adjacent operations of the same type into one
+ // node (summing the counts and freeing the absorbed nodes).  On
+ // return *tail is the last node of the chain and *prev is the node
+ // before it, or NULL when the chain has at most one node.
+ //
+ void
+ Sim4::Condense_both_Ends(edit_script **head,
+                          edit_script **tail,
+                          edit_script **prev) {
+   edit_script *cur = *head;
+
+   *prev = NULL;
+
+   while (cur != NULL) {
+     edit_script *nxt = cur->next;
+
+     // Absorb followers of the same operation type.
+     while ((nxt != NULL) && (cur->op_type == nxt->op_type)) {
+       cur->num = cur->num + nxt->num;
+       cur->next = nxt->next;
+       ckfree(nxt);
+       nxt = cur->next;
+     }
+
+     if (cur->next != NULL)
+       *prev = cur;
+     else
+       *tail = cur;
+
+     cur = cur->next;
+   }
+ }
+
+
+
+
+ void
+ Sim4::pluri_align(int *dist_ptr,
+ Exon *theExons,
+ struct edit_script_list **Aligns,
+ sim4_stats_t *st) {
+ int i, end1, end2, diff, ali_dist;
+ char *a, *b;
+ /* Align every exon that follows theExons and collect statistics.
+  *
+  * The walk starts at theExons->next_exon and stops at the first exon
+  * whose toGEN is zero; theExons itself is never aligned (presumably
+  * it is a list sentinel -- confirm against the caller).  For each
+  * exon an edit distance and an edit script are computed, the script
+  * is condensed, and the per-exon match / N / indel / edit counts and
+  * percent identity are filled in.
+  *
+  *   dist_ptr  receives the sum of the edit distances
+  *   Aligns    receives a list of edit_script_list entries
+  *   st        receives the total matches, total Ns and the overall
+  *             percent identity
+  *
+  * If an alignment fails, the totals are zeroed, st->percentID is set
+  * to -1, *Aligns is set to 0 and the function returns early.
+  */
+ Exon *thisExon = theExons;
+ Exon *nextExon;
+
+ int EditDistance = 0; // Sum of all tmpi, previously known as TMPI
+ int AlignmentLength = 0;
+
+ struct edit_script_list *enew;
+ struct edit_script *head;
+ struct edit_script *tmp_script;
+ struct edit_script *left;
+ struct edit_script *right;
+ struct edit_script *prev;
+
+ st->numberOfMatches = 0;
+ st->numberOfNs = 0;
+
+ head = 0L;
+ *Aligns = 0L;
+ *dist_ptr = ali_dist = 0;
+
+ end1 = _genLen;
+ end2 = _estLen;
+
+ nextExon = thisExon->next_exon;
+
+ while (nextExon && nextExon->toGEN) {
+ diff = thisExon->frEST - nextExon->toEST - 1;
+
+ if (diff != 0) {
+ if (thisExon->toGEN) {
+ enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
+ enew->next_script = *Aligns;
+ *Aligns = enew;
+ (*Aligns)->script = head;
+ (*Aligns)->offset1 = thisExon->frGEN;
+ (*Aligns)->offset2 = thisExon->frEST;
+ (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
+ (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
+ (*Aligns)->score = ali_dist;
+ ali_dist = 0;
+ head = NULL;
+ }
+ end1 = nextExon->toGEN;
+ end2 = nextExon->toEST;
+ } else {
+ diff = thisExon->frGEN - nextExon->toGEN - 1;
+ if (diff != 0) {
+ if (thisExon->toGEN) {
+ struct edit_script *newthing;
+
+ newthing = (edit_script *) ckalloc(sizeof(edit_script));
+ newthing->op_type = DELETE;
+ newthing->num = diff;
+ newthing->next = head;
+ head = newthing;
+ } else {
+ end1 = nextExon->toGEN;
+ }
+ }
+ }
+
+ if (globalParams->_interspecies) {
+ diff = get_dist(nextExon->frGEN-1, nextExon->frEST-1,
+ nextExon->toGEN, nextExon->toEST,
+ MAX(1000, (int)(globalParams->_percentError*(nextExon->toEST - nextExon->frEST + 1))));
+ } else { // original
+ diff = align_get_dist(nextExon->frGEN-1, nextExon->frEST-1,
+ nextExon->toGEN, nextExon->toEST,
+ MAX(1000, (int)(.2*(nextExon->toEST - nextExon->frEST + 1))));
+ }
+
+
+ // Return if the alignment fails.
+ //
+ if (diff < 0) {
+ st->numberOfMatches = 0;
+ st->numberOfNs = 0;
+ st->percentID = -1;
+ // NOTE(review): entries already linked into *Aligns and the pending 'head' script are dropped here without being freed -- confirm.
+ *Aligns = 0L;
+
+ return;
+ }
+
+#ifdef STATS
+ if (diff > P * (nextExon->toEST - nextExon->frEST + 1))
+ (void)printf("Warning: Distance threshold on segment exceeded.\n");
+#endif
+
+ if (globalParams->_interspecies) {
+ path(nextExon->frGEN-1, nextExon->frEST-1, SUBSTITUTE,
+ nextExon->toGEN, nextExon->toEST, SUBSTITUTE,
+ diff, &left, &right);
+ } else { // original
+ align_path(nextExon->frGEN-1, nextExon->frEST-1,
+ nextExon->toGEN, nextExon->toEST, diff, &left, &right);
+ }
+
+ // Return if the alignment fails -- this occurred once aligning
+ // dros frags to dros using snapper. Snapper was giving the wrong
+ // sequence for the seeds it also supplied.
+ //
+ if ((left == 0L) || (right == 0L)) {
+ st->numberOfMatches = 0;
+ st->numberOfNs = 0;
+ st->percentID = -1;
+ *Aligns = 0L;
+ return;
+ }
+
+ Condense_both_Ends(&left, &right, &prev);
+ // NOTE(review): Condense_both_Ends() leaves prev NULL for a one-node script; the branch below would then dereference NULL -- confirm that cannot occur.
+ if (!thisExon->toGEN && right->op_type == DELETE) {
+ /* remove gaps at end of alignment */
+ diff -= 0+right->num; /* subtract GAP_OPEN = 0 */
+ nextExon->toGEN -= right->num;
+ end1 -= right->num;
+ if (head && (head->op_type == DELETE))
+ head->num += right->num;
+ ckfree(right);
+ prev->next = NULL;
+ right = prev;
+ }
+
+ if ((!nextExon->next_exon || !nextExon->next_exon->toGEN) &&
+ left && (left->op_type == DELETE)) {
+ diff -= 0+left->num; /* subtract GAP_OPEN = 0 */
+ nextExon->frGEN += left->num;
+
+ tmp_script = left->next;
+ if (right == left)
+ right = tmp_script;
+ ckfree(left);
+ left = tmp_script;
+ }
+
+ *dist_ptr += diff;
+ ali_dist += diff;
+
+ a = _genSeq + nextExon->frGEN - 1;
+ b = _estSeq + nextExon->frEST - 1;
+
+ nextExon->numMatches = 0;
+ nextExon->numNs = 0;
+ nextExon->numInDel = 0;
+ nextExon->numEdits = 0;
+
+ tmp_script = left;
+
+ // These are used during SUBSTITUTE below to tell if the base at
+ // a (b) is N (upper or lower case).
+ //
+ bool an = false;
+ bool bn = false;
+
+ while (tmp_script) {
+ switch (tmp_script->op_type) {
+ case DELETE:
+ nextExon->numInDel += tmp_script->num;
+ nextExon->numEdits += tmp_script->num;
+ a += tmp_script->num;
+ break;
+ case INSERT:
+ nextExon->numInDel += tmp_script->num;
+ nextExon->numEdits += tmp_script->num;
+ b += tmp_script->num;
+ break;
+ case SUBSTITUTE:
+
+ // Count the number of matches and edits.
+ //
+ // An edit is a true substitute -- a base for a different base,
+ // not a base for an 'n'.
+ //
+ for (i=0; i<tmp_script->num; ++i, ++a, ++b) {
+
+ an = (*a == 'N') || (*a == 'n');
+ bn = (*b == 'N') || (*b == 'n');
+
+
+ if (an && bn) {
+ // Both are N. It isn't a match and it isn't an edit.
+ //
+ nextExon->numNs++;
+ } else if (an || bn) {
+ // One is an N. Someone has low quality sequence, and we
+ // should penalize. We need to special case this because
+ // IUPACidentity[][] claims N matches all.
+ //
+ nextExon->numEdits++;
+ } else if (IUPACidentity[(int)*a][(int)*b]) {
+ // Got a match.
+ nextExon->numMatches++;
+ } else {
+ // Got a substitution
+ nextExon->numEdits++;
+ }
+ }
+ break;
+ }
+ tmp_script = tmp_script->next;
+ }
+
+ nextExon->alignmentLength = (nextExon->toGEN - nextExon->frGEN + 1 +
+ nextExon->toEST - nextExon->frEST + 1 +
+ nextExon->numInDel);
+ nextExon->percentID = computePercentIdentity(nextExon->numEdits,
+ nextExon->alignmentLength);
+
+
+ st->numberOfMatches += nextExon->numMatches;
+ st->numberOfNs += nextExon->numNs;
+
+ EditDistance += nextExon->numEdits;
+ AlignmentLength += (nextExon->toGEN - nextExon->frGEN + 1 +
+ nextExon->toEST - nextExon->frEST + 1 +
+ nextExon->numInDel);
+ // Prepend this exon's script [left .. right] to the pending chain.
+ right->next = head;
+ head = left;
+
+ thisExon = nextExon;
+ nextExon = thisExon->next_exon;
+ }
+
+
+ /* at the beginning of the sequences */
+ if (nextExon!=NULL) {
+
+ if ((diff=thisExon->frEST-nextExon->toEST-1)!=0 && (diff != _estLen)) {
+ enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
+ enew->next_script = *Aligns;
+ *Aligns = enew;
+ (*Aligns)->offset1 = thisExon->frGEN;
+ (*Aligns)->offset2 = thisExon->frEST;
+ (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
+ (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
+ (*Aligns)->script = head;
+ (*Aligns)->score = ali_dist;
+
+ } else if (diff != _estLen) {
+
+ /* modified to cut introns at the beginning of the sequence */
+ enew = (edit_script_list *)ckalloc(sizeof(edit_script_list));
+ enew->next_script = *Aligns;
+ *Aligns = enew;
+ (*Aligns)->offset1 = thisExon->frGEN;
+ (*Aligns)->offset2 = 1;
+ (*Aligns)->len1 = end1-(*Aligns)->offset1+1;
+ (*Aligns)->len2 = end2-(*Aligns)->offset2+1;
+ (*Aligns)->script = head;
+ (*Aligns)->score = ali_dist;
+ }
+ }
+
+ st->percentID = computePercentIdentity(EditDistance, AlignmentLength);
+}
+
+
+
+
+ // Recompute the summary statistics in 'st' from counts already
+ // stored on the exons.  The first exon passed in is skipped; the walk
+ // ends at the end of the list or at the first exon whose toGEN is
+ // zero.
+ //
+ void
+ Sim4::updateStatistics(Exon *theExon,
+                        sim4_stats_t *st) {
+   Exon *cur = theExon->next_exon;
+
+   int totalEdits = 0;
+   int totalLength = 0;
+
+   st->numberOfMatches = 0;
+   st->numberOfNs = 0;
+
+   for (; cur && cur->toGEN; cur = cur->next_exon) {
+     st->numberOfMatches += cur->numMatches;
+     st->numberOfNs += cur->numNs;
+
+     totalEdits += cur->numEdits;
+     totalLength += (cur->toGEN - cur->frGEN + 1 +
+                     cur->toEST - cur->frEST + 1 +
+                     cur->numInDel);
+   }
+
+   st->percentID = computePercentIdentity(totalEdits, totalLength);
+ }
diff --git a/libsim4/sim4core/poly.C b/libsim4/sim4core/poly.C
new file mode 100644
index 0000000..76547a2
--- /dev/null
+++ b/libsim4/sim4core/poly.C
@@ -0,0 +1,571 @@
+#include <math.h>
+#include "sim4.H"
+
+#define MIN_EXON 12
+// Find a poly-T run at the start and/or a poly-A run at the end of seq[0..len-1]; results are written to *pT and *pA.
+void
+Sim4::get_polyAT(char *seq, int len, int *pT, int *pA, int flag)
+{
+ register int i, sum10, sum20; // sum10/sum20: running sums of table values over the most recent 10 / up to 20 characters
+ register char *s, *t, *v; // s: scan position; t and v: trailing positions whose values leave sum10 / sum20
+ int last10; // 1-based position of the innermost character accepted into the run so far
+ // Limits tested against sum10 and sum20 while scanning.
+ int MAX10 = 2;
+ int MAX20 = 5;
+ // Lookup tables: 0 for the run's base and for 'X' and 'N', 1 for every other character.
+ char encodingA[128];
+ char encodingT[128];
+ // NOTE(review): both tables hold 128 entries and are indexed with (int)*s, so a
+ // character outside 0..127 would index out of bounds -- confirm callers pass plain ASCII.
+ if (flag!=T_ONLY) {
+ memset(encodingA, (char)1, 128);
+ encodingA[(int)'A'] = encodingA[(int)'X'] = encodingA[(int)'N'] = 0;
+ // Prime sum10 with up to 10 characters, walking backwards from the end of seq.
+ for (i=0, s=seq+len, sum10=0, last10=len+1; i<10 && s>seq && sum10<=MAX20; i++) {
+ sum10 += encodingA[(int)*(--s)];
+ /* if (!encodingA[*s] && sum10<=MAX10) last10 = s-seq+1; */
+ }
+ // Keep moving towards the start of seq while either running sum is within its limit.
+ t = v = seq+len;
+ sum20 = sum10;
+ for ( ; s>=seq && (sum10<=MAX10 || sum20<=MAX20); ) {
+ if (!encodingA[(int)*s] && sum10<=MAX10 && (seq+len>=s+20 || sum20<MAX10))
+ last10 = (int)(s-seq+1);
+ if (--s>seq) {
+ sum10 += encodingA[(int)*s] - encodingA[(int)*(--t)];
+ sum20 += encodingA[(int)*s] -(((seq+len)-s>20) ? encodingA[(int)*(--v)] : 0);
+ }
+ }
+ // *pA temporarily holds a 1-based start position; len+1 stands for "no poly-A found".
+ if (last10>len-10) *pA = len+1;
+ else {
+ s = seq+last10+8;
+ while (s >= seq && !encodingA[(int)*s]) s--;
+ if ((s-seq+1)-last10+1<=5)
+ *pA = (int)(s-seq+2);
+ else
+ *pA = last10;
+ }
+ } else *pA = len+1;
+ *pA = len-(*pA)+1; // convert the position into a length counted from the end of seq; 0 means no poly-A
+ // Poly-T detection mirrors the above, scanning forwards from the start of seq.
+ if (flag!=A_ONLY) {
+
+ memset(encodingT, (char)1, 128);
+ encodingT[(int)'T'] = encodingT[(int)'X'] = encodingT[(int)'N'] = 0;
+ // Prime sum10 with up to 10 characters from the start of seq.
+ for (i=0, s=seq-1, sum10=0, last10=0; i<10 && i<len-1 && sum10<=MAX20; i++) {
+ sum10 += encodingT[(int)*(++s)];
+ /* if (!encodingT[*s] && sum10<=MAX10) last10 = s-seq+1; */
+ }
+ // Keep moving towards the end of seq while either running sum is within its limit.
+ t = v = seq-1;
+ sum20 = sum10;
+ for ( ; s<seq+len && (sum10<=MAX10 || sum20<=MAX20); ) {
+ if (!encodingT[(int)*s] && sum10<=MAX10 && (s-seq>=19 || sum20<MAX10))
+ last10 = (int)(s-seq+1);
+ if (++s<seq+len) {
+ sum10 += encodingT[(int)*s] - encodingT[(int)*(++t)];
+ sum20 += encodingT[(int)*s] - ((s-seq>=20) ? encodingT[(int)*(++v)] : 0);
+ }
+ }
+ // *pT is the 1-based end position of the poly-T run; 0 means no poly-T.
+ if (last10<=10) *pT = 0;
+ else {
+ s = seq+last10-10;
+ while (s < seq+len && !encodingT[(int)*s]) s++;
+ if (last10-(s-seq)+1<=5)
+ *pT = (int)(s-seq);
+ else
+ *pT = last10;
+ }
+ } else *pT = 0;
+}
+// Trim the alignment at the head of *Sptr, and the exon *exons, so that nothing at cDNA position bc or beyond remains; *pA is set to the resulting cut position.
+void
+Sim4::trim_polyA_align(struct edit_script_list **Sptr, Exon *lblock, Exon **exons, const int bc, int *pA, char *s1,char *s2)
+{
+ edit_script_list *head = *Sptr;
+ edit_script *tp;
+ int tmpi = 0, num, idents = 0, identsN = 0; // tmpi: edits removed, subtracted from head->score below; idents/identsN are counted but not read in this function
+ char *a, *b;
+ Exon *prev;
+ // s2 is addressed with the cDNA position i and s1 with the genomic position j (as s2+i-1 and s1+j-1).
+ int i, j; /* i index in the cDNA */
+ // Case 1: the cut point lies beyond the end of this script; nothing is trimmed.
+ if (bc>head->offset2+head->len2-1) {
+ *pA = bc;
+ return;
+ }
+ // Case 2: the cut point is the first cDNA position of the script; drop the script and unlink the exons starting at or after bc.
+ if (bc==head->offset2) {
+ /* cDNA gap: remove the entire script; is this properly sorted? LLL */
+ *Sptr = head->next_script;
+ Free_script(head->script);
+ ckfree(head);
+ while ((*exons)->frEST>=bc) {
+ prev = find_previous(lblock,*exons);
+
+ if (prev == 0L) {
+ fprintf(stderr, "trim_polyA_align(): Corrupted exon list, cannot find the previous exon (remove entire script).\n");
+ for (; lblock; lblock = lblock->next_exon)
+ fprintf(stderr, " GEN f=%8d t=%8d EST f=%8d t=%8d flag=%d\n",
+ lblock->frGEN, lblock->toGEN, lblock->frEST, lblock->toEST, lblock->flag);
+ kill(getpid(), SIGKILL);
+ }
+
+ prev->next_exon = (*exons)->next_exon;
+ //freeExon(*exons); garbage collected
+ *exons = prev;
+ }
+ *pA = bc;
+ return;
+ }
+ // Case 3: cut inside the script. Flip it so operations are consumed from the far end, starting at the last cDNA/genomic positions.
+ Flip_script(&(head->script));
+ i = head->offset2 + head->len2 -1;
+ j = head->offset1 + head->len1 -1;
+ tp = head->script;
+ // Consume operations until the cDNA position drops below bc, undoing their contribution to the exon's counters.
+ while (i>=bc && tp) {
+ num = tp->num;
+ switch (tp->op_type) {
+ case INSERT:
+ if (i>=bc && bc>i-num+1) { // the cut falls inside this operation: shorten it
+ (*exons)->numInDel -= i - bc + 1;
+ (*exons)->numEdits -= i - bc + 1;
+ tmpi += i-bc+1;
+ tp->num -= i-bc+1;
+ i = bc-1;
+ } else { // the whole operation is removed
+ (*exons)->numInDel -= num;
+ (*exons)->numEdits -= num;
+ tmpi += num;
+ i -= num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ break;
+ case DELETE: // advances only the genomic position; always removed whole
+ (*exons)->numInDel -= num;
+ (*exons)->numEdits -= num;
+ j -= num;
+ tmpi += num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ break;
+ case SUBSTITUTE:
+ if (i>=bc && bc>i-num+1) { // the cut falls inside this run: re-compare only the removed columns
+ a = s2+i-1; b = s1+j-1;
+ while (a>=s2+bc-1) {
+ if (*a != *b) {
+ (*exons)->numEdits--;
+ tmpi++;
+ } else {
+ if (*a == 'N') {
+ (*exons)->numNs--;
+ identsN++;
+ } else {
+ (*exons)->numMatches--;
+ idents++;
+ }
+ }
+ a--;
+ b--;
+ }
+ j -= i-bc+1; tp->num -= i-bc+1; i = bc-1;
+ } else {
+ /* at most 1 nt remaining */
+ a = s2+i-1; b = s1+j-1;
+ while (a>=s2+i-num) {
+ if (*a != *b) {
+ (*exons)->numEdits--;
+ tmpi++;
+ } else {
+ if (*a == 'N') {
+ (*exons)->numNs--;
+ identsN++;
+ } else {
+ (*exons)->numMatches--;
+ idents++;
+ }
+ }
+ a--;
+ b--;
+ }
+ // The whole run has been re-compared; drop the operation.
+ i -= num; j -= num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ break;
+#if 0
+ default:
+ fatalf("Unrecognized opcode %d.\n",tp->op_type);
+#endif
+ }
+ /* indel walk */
+ }
+ assert(i==bc-1);
+ // Also drop any indel operations now sitting at the new end of the script, while j+1 has not moved below the exon's genomic start.
+ while ((tp != 0L) &&
+ (tp->op_type != SUBSTITUTE) && (j+1 >= (*exons)->frGEN)) {
+ if (tp->op_type==INSERT) {
+ i -= tp->num;
+ tmpi += tp->num;
+ (*exons)->numInDel -= tp->num;
+ (*exons)->numEdits -= tp->num;
+ } else if (j<(*exons)->frGEN && i<(*exons)->frEST) { // already before the exon on both sequences: move j without touching the counters
+ j -= tp->num;
+ } else {
+ j -= tp->num;
+ tmpi += tp->num;
+ (*exons)->numInDel -= tp->num;
+ (*exons)->numEdits -= tp->num;
+ }
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ // Either the script is now empty and is unlinked, or its lengths and score are updated and its order restored.
+ if (head->script==NULL) {
+ *Sptr = head->next_script;
+ ckfree(head);
+ } else {
+ head->len1 = j-head->offset1+1;
+ head->len2 = i-head->offset2+1;
+ head->score -= tmpi;
+ Flip_script(&(head->script));
+ }
+ // If the exon starts after the new end it was trimmed away completely and is unlinked; otherwise its end and derived fields are updated.
+ if ((*exons)->frEST>i) {
+ prev = find_previous(lblock,*exons);
+
+ if (prev == 0L) {
+ fprintf(stderr, "trim_polyA_align(): Corrupted exon list, cannot find the previous exon (frEST).\n");
+ for (; lblock; lblock = lblock->next_exon)
+ fprintf(stderr, " GEN f=%8d t=%8d EST f=%8d t=%8d flag=%d\n",
+ lblock->frGEN, lblock->toGEN, lblock->frEST, lblock->toEST, lblock->flag);
+ kill(getpid(), SIGKILL);
+ }
+
+ prev->next_exon = (*exons)->next_exon;
+ //freeExon(*exons); garbage collected
+ *exons = prev;
+ } else {
+ (*exons)->toEST = i;
+ (*exons)->toGEN = j;
+ (*exons)->length = (*exons)->toEST-(*exons)->frEST+1;
+
+ (*exons)->alignmentLength = ((*exons)->toGEN - (*exons)->frGEN + 1 +
+ (*exons)->toEST - (*exons)->frEST + 1 +
+ (*exons)->numInDel);
+ (*exons)->percentID = computePercentIdentity((*exons)->numEdits,
+ (*exons)->alignmentLength);
+ }
+ *pA = i+1; // first cDNA position after the retained alignment
+
+ return;
+}
+// Remove a poly-A tail from the last exon(s) and their alignment scripts.
+// s2 is scanned for 'A' characters; l2 only seeds the initial value of *lastA.
+// On return *lastA holds cutAmount: the last exon's original toEST minus the final cut position, plus one (0 if nothing was trimmed).
+void
+Sim4::remove_polyA_back(struct edit_script_list **Sptr, Exon *Exons,
+ char *s1, char *s2,
+ int l2, int *lastA) {
+ Exon *t;
+ Exon *exons_tail;
+ char *b, *end;
+ int numA, pA, dummy, trim_p, reverse_script=0;
+ int startPos=0, cutAmount=0;
+ // Nothing to do when the list holds no exon with a nonzero toGEN after its first node.
+ *lastA = l2+1; pA = 0;
+ if (!Exons || ! Exons->next_exon || ! Exons->next_exon->toGEN) return;
+ // Reverse the script list when its first script has the smaller offset1; the reversal is undone before returning.
+ if ((*Sptr)->next_script &&
+ (*Sptr)->offset1<(*Sptr)->next_script->offset1) {
+ reverse_script = 1;
+ script_flip_list(Sptr);
+ }
+ // Walk to the last exon that is followed by no node with a nonzero toGEN.
+ exons_tail = Exons->next_exon;
+ while (exons_tail->next_exon && exons_tail->next_exon->toGEN)
+ exons_tail=exons_tail->next_exon;
+
+ trim_p = 1;
+ // NOTE(review): exons_tail was already dereferenced above, so this test cannot be false here.
+ if (exons_tail) {
+ startPos = exons_tail->toEST;
+ // Each pass handles the current last exon; trim_polyA_align may move exons_tail to the previous exon.
+ while ((t=exons_tail)!=NULL && t->toGEN && trim_p) {
+ /* compute the 'A' contents of the exon */
+ b = s2 + t->toEST-1; end = s2+t->frEST-1; numA = 0;
+ while (b>=end && numA+(b-end)>=globalParams->_polyTailPercent*t->length) { // stop early once the threshold can no longer be reached
+ numA += (*b--=='A');
+ }
+
+ // Determine how much of the cut stuff was actually
+ // poly-containing. The first method below returns the number of
+ // bases cut from the end of the est, while the second return the
+ // number of bases cut from the end of the alignment.
+ //
+ //cutAmount = l2 - *lastA + 1;
+
+ if (numA>=globalParams->_polyTailPercent*t->length) {
+ /* remove the entire exon */
+ trim_polyA_align(Sptr,Exons,&exons_tail,t->frEST,lastA,s1,s2);
+ cutAmount = startPos - *lastA + 1;
+ } else {
+ get_polyAT(s2+(*Sptr)->offset2-1,(*Sptr)->len2,&dummy,&pA,A_ONLY);
+ if (pA) {
+ int ct_pA;
+ /* first position to be removed */
+ ct_pA = t->toEST-pA+1;
+ ct_pA = (ct_pA-t->frEST>MIN_EXON) ? ct_pA : t->frEST; // remove the whole exon unless more than MIN_EXON bases would remain
+ /* note: pA is the last (innermost) position in the tail */
+ trim_polyA_align(Sptr,Exons,&exons_tail,ct_pA,lastA,s1,s2);
+ cutAmount = startPos - *lastA + 1;
+ }
+ if (t==exons_tail) trim_p = 0; // the exon survived the trim, so stop
+ }
+ }
+ }
+ // The caller receives the amount cut, not the position written by trim_polyA_align.
+ *lastA = cutAmount;
+
+ if (reverse_script) script_flip_list(Sptr);
+}
+
+// Trim the alignment at the head of *Sptr, and the exon *exons, so that nothing up to and including cDNA position ec remains.
+// *pT is set to the resulting cut position. This mirrors trim_polyA_align, working from the front of the script.
+/* s2 is the cdna */
+void
+Sim4::trim_polyT_align(struct edit_script_list **Sptr, Exon **exons, const int ec, int *pT, char *s1, char *s2)
+{
+ edit_script_list *head = *Sptr;
+ edit_script *tp;
+ int tmpi = 0, num, idents = 0, identsN = 0; // tmpi: edits removed, subtracted from head->score below; idents/identsN are counted but not read in this function
+ char *a, *b;
+ Exon *t;
+ // s2 is addressed with the cDNA position i and s1 with the genomic position j (as s2+i-1 and s1+j-1).
+ int i, j; /* i index in the cDNA */
+ // Case 1: the cut point lies before the start of this script; nothing is trimmed.
+ if (ec<head->offset2) {
+ *pT = ec;
+ return;
+ }
+ // Case 2: the cut point is the last cDNA position of the script; drop the script and advance past the exons starting before ec.
+ if (ec==head->offset2+head->len2-1) {
+ /* cDNA gap: remove the entire script */
+ *Sptr = head->next_script;
+ Free_script(head->script);
+ ckfree(head);
+ while ((*exons)->frEST<ec) {
+ t = *exons;
+ *exons = t->next_exon;
+ //freeExon(t); garbage collected
+ }
+ *pT = ec;
+ return;
+ }
+ // Case 3: cut inside the script; consume operations from its front.
+ i = head->offset2;
+ j = head->offset1;
+ tp = head->script;
+ // Consume operations until the cDNA position passes ec, undoing their contribution to the exon's counters.
+ while (i<=ec && tp) {
+ num = tp->num;
+ switch (tp->op_type) {
+ case INSERT:
+ if (i<=ec && ec<i+num-1) { // the cut falls inside this operation: shorten it
+ (*exons)->numInDel -= ec - i + 1;
+ (*exons)->numEdits -= ec - i + 1;
+ tmpi += ec-i+1;
+ tp->num -= ec-i+1;
+ i = ec+1;
+ } else { // the whole operation is removed
+ (*exons)->numInDel -= num;
+ (*exons)->numEdits -= num;
+ tmpi += num;
+ i += num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ break;
+ case DELETE: // advances only the genomic position; always removed whole
+ (*exons)->numInDel -= num;
+ (*exons)->numEdits -= num;
+ j += num;
+ tmpi += num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ break;
+ case SUBSTITUTE:
+ if (i<=ec && ec<i+num-1) { // the cut falls inside this run: re-compare only the removed columns
+ a = s2+i-1; b = s1+j-1;
+ while (a<s2+ec) {
+ if (*a != *b) {
+ (*exons)->numEdits--;
+ tmpi++;
+ } else {
+ if (*a == 'N') {
+ (*exons)->numNs--;
+ identsN++;
+ } else {
+ (*exons)->numMatches--;
+ idents++;
+ }
+ }
+ a++;
+ b++;
+ }
+ j += ec-i+1; tp->num -= ec-i+1; i = ec+1;
+ } else {
+ /* at most 1 nt remaining */
+ a = s2+i-1; b = s1+j-1;
+ while (a<s2+i+tp->num-1) {
+ if (*a != *b) {
+ (*exons)->numEdits--;
+ tmpi++;
+ } else {
+ if (*a == 'N') {
+ (*exons)->numNs--;
+ identsN++;
+ } else {
+ (*exons)->numMatches--;
+ idents++;
+ }
+ }
+ a++;
+ b++;
+ }
+ // The whole run has been re-compared; drop the operation.
+ i +=num; j += num;
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ break;
+ }
+ /* indel walk */
+ }
+ assert(i==ec+1);
+ // Also drop any indel operations now sitting at the new front of the script, while j-1 has not moved past the exon's genomic end.
+ while ((tp != 0L) &&
+ (tp->op_type!=SUBSTITUTE) && (j-1<=(*exons)->toGEN)) {
+ if (tp->op_type==INSERT) {
+ i += tp->num;
+ tmpi += tp->num;
+ (*exons)->numInDel -= tp->num;
+ (*exons)->numEdits -= tp->num;
+ } else if (j>=(*exons)->toGEN && i>=(*exons)->toEST) { // already at or past the exon end on both sequences: move j without touching the counters
+ j += tp->num;
+ } else {
+ j += tp->num;
+ tmpi += tp->num;
+ (*exons)->numInDel -= tp->num;
+ (*exons)->numEdits -= tp->num;
+ }
+ head->script = tp->next;
+ ckfree(tp);
+ tp = head->script;
+ }
+ // Either the script is now empty and is unlinked, or its offsets, lengths and score are updated.
+ if (head->script==NULL) {
+ *Sptr = head->next_script;
+ ckfree(head);
+ } else {
+ head->len1 -= j-head->offset1;
+ head->len2 -= i-head->offset2;
+ head->offset2 = i;
+ head->offset1 = j;
+ head->score -= tmpi;
+ }
+ // If the exon ends before the new start it was trimmed away completely and is skipped; otherwise its start and derived fields are updated.
+ if ((*exons)->toEST<i) {
+ t = *exons;
+ *exons = t->next_exon;
+ //freeExon(t); garbage collected
+ } else {
+ (*exons)->frEST = i;
+ (*exons)->frGEN = j;
+ (*exons)->length = (*exons)->toEST-(*exons)->frEST+1;
+
+ (*exons)->alignmentLength = ((*exons)->toGEN - (*exons)->frGEN + 1 +
+ (*exons)->toEST - (*exons)->frEST + 1 +
+ (*exons)->numInDel);
+ (*exons)->percentID = computePercentIdentity((*exons)->numEdits,
+ (*exons)->alignmentLength);
+ }
+ *pT = i-1; // last cDNA position before the retained alignment
+ return;
+}
+// Remove a poly-T head from the first exon(s) and their alignment scripts.
+// s2 is scanned for 'T' characters. On return *lastT holds cutAmount: the final
+// cut position minus the first exon's original frEST, plus one (0 if nothing was trimmed).
+void
+Sim4::remove_polyT_front(struct edit_script_list **Sptr, Exon *Exons, char *s1, char *s2, int *lastT)
+{
+ Exon *t, *exons_head; /* start from Lblock */
+ char *b, *end;
+ int numT, dummy, trim_p, reverse_script=0, pT;
+ int startPos=0, cutAmount=0;
+ // Nothing to do when the list holds no exon with a nonzero toGEN after its first node.
+ *lastT = pT = 0;
+ if (!Exons || !Exons->next_exon || !Exons->next_exon->toGEN) return;
+ // Reverse the script list when its first script has the larger offset1; the reversal is undone before returning.
+ if ((*Sptr)->next_script &&
+ (*Sptr)->offset1>(*Sptr)->next_script->offset1) {
+ script_flip_list(Sptr);
+ reverse_script = 1;
+ }
+
+ exons_head = Exons->next_exon; trim_p = 1;
+ // NOTE(review): Exons->next_exon was already checked above, so this test cannot be false here.
+ if (exons_head) {
+ startPos = exons_head->frEST;
+ // Each pass handles the current first exon; trim_polyT_align may advance exons_head to the next exon.
+ while ((t=exons_head)!=NULL && t->toGEN && trim_p) {
+ /* compute the 'T' contents of the exon */
+ b = s2 + t->frEST-1; end = s2+t->toEST; numT = 0;
+ while (b<end && (numT+t->toEST-(b-s2)>=globalParams->_polyTailPercent*t->length)) { // stop early once the threshold can no longer be reached
+ numT += (*b++=='T');
+ }
+
+ // Determine how much of the cut stuff was actually
+ // poly-containing. The first method below returns the number of
+ // bases cut from the end of the est, while the second return the
+ // number of bases cut from the end of the alignment.
+ //
+ //cutAmount = l2 - *lastT + 1;
+
+ if (numT>=globalParams->_polyTailPercent*t->length) {
+ /* remove the entire exon */
+ trim_polyT_align(Sptr,&exons_head,t->toEST,lastT,s1,s2);
+ cutAmount = *lastT - startPos + 1;
+ } else {
+ get_polyAT(s2+(*Sptr)->offset2-1,(*Sptr)->len2,&pT,&dummy,T_ONLY);
+ if (pT) {
+ int ct_pT;
+ ct_pT = pT + (*Sptr)->offset2-1; // pT is relative to the script's cDNA start; convert to a cDNA position
+ ct_pT = (t->toEST-ct_pT>MIN_EXON) ? ct_pT : t->toEST; // remove the whole exon unless more than MIN_EXON bases would remain
+ trim_polyT_align(Sptr,&exons_head,ct_pT,lastT,s1,s2);
+ cutAmount = *lastT - startPos + 1;
+ }
+ if (t==exons_head) trim_p = 0; // the exon survived the trim, so stop
+ }
+ }
+ }
+ // Relink the list head to the first surviving exon.
+ Exons->next_exon = exons_head;
+ // The caller receives the amount cut, not the position written by trim_polyT_align.
+ *lastT = cutAmount;
+
+ if (reverse_script) script_flip_list(Sptr);
+}
diff --git a/libsim4/sim4core/sim4.H b/libsim4/sim4core/sim4.H
new file mode 100644
index 0000000..b5776d4
--- /dev/null
+++ b/libsim4/sim4core/sim4.H
@@ -0,0 +1,652 @@
+#ifndef SIM4_H
+#define SIM4_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "util++.H"
+
+#include "sim4parameters.H"
+#include "sim4command.H"
+#include "exon.H"
+#include "mspManager.H"
+#include "sim4defines.H"
+#include "glimmerSplice.H"
+#include "sim4b1_s.H"
+
+#include "../sim4polish/sim4polish.H"
+#include "../sim4polish/sim4polishList.H"
+#include "../sim4polish/sim4polishBuilder.H"
+
+//
+// A wrapper for the sim4 subroutines
+//
+
+
+// Define this to use our own memory management -- one that doesn't bother to
+// release memory during the compute (but does release it all at the end).
+//
+
+// Define this to get a detailed report on hit extension (in mspManager.C)
+//
+//#define DEBUG_EXTENSION
+
+// Define this to test the seeds when doing extension -- it tests
+// if the seed is infact an exact match (in mspManager.C)
+//
+//#define TEST_SEEDS_IN_EXTENSION
+
+
+// Debug of sim4b1.C and related.
+//
+// SHOW_PROGRESS -- write the progress of Sim4::SIM4 to stderr
+// DEBUG_EXONS -- dump the exons at various places
+//
+//#define SHOW_PROGRESS
+//#define DEBUG_EXONS
+
+
+// Show what is being done for external seeding in sim4string.C
+//
+//#define SHOW_EXTERNAL_SEEDING
+
+enum { INIT = 0, PERM = 1, TEMP = 2};
+enum { FWD = 0, BWD = 1, BOTH = 2 };
+enum { OK = 0, FREE_START = 1, FREE_END = 2, FREE_BOTH_ENDS = 3};
+
+#define GEN_LOG4_ENTRIES 45
+#define CDNA_LOG4_ENTRIES 25
+
+#define HASH_SIZE 32767 /* 2**15 - 1 */
+// Sim4 holds the state and the subroutines of one sim4 alignment engine.
+// It is constructed from a sim4parameters object; the public entry point is run().
+class Sim4 {
+private:
+
+struct coords {
+ int pos1;
+ int pos2;
+};
+
+/* used only in the alignment stage */
+struct edit_script {
+ int op_type; /* SUB, INS, or DEL */
+ int num; /* Number of operations */
+ struct edit_script *next;
+};
+
+struct edit_script_list {
+ int offset1, offset2;
+ int len1, len2;
+ int score;
+ struct edit_script *script;
+ struct edit_script_list *next_script;
+};
+
+struct splice_t {
+ int xs;
+ int xe;
+ int ys;
+ int ye;
+ int score;
+ int type;
+ struct splice_t *next;
+};
+
+struct sim4_stats_t {
+ int internal;
+ int icoverage;
+ int numberOfMatches;
+ int numberOfNs;
+ int orientation;
+ int percentID;
+ bool tooManyMSPs;
+};
+
+struct hash_node {
+ int ecode; /* integer encoding of the word */
+ int pos; /* positions where word hits query sequence */
+ struct hash_node *link; /* next word with same last 7.5 letters */
+};
+
+struct hash_table {
+ struct hash_node *table[HASH_SIZE+1];
+ int *nextPos;
+ struct hash_node *nodes;
+ int nodesused;
+};
+
+
+private:
+ mspManager _mspManager;
+
+ exonManager _exonManager;
+
+ sim4parameters *globalParams;
+ // Handle of the allocation pool behind ckalloc(); released as a whole in the destructor.
+ void *_pallochandle;
+
+ char *_genSeq;
+ char *_estSeq;
+
+ int _genLen;
+ int _estLen;
+ // Base-to-code table filled by the constructor: A/a=0, C/c=1, G/g=2, T/t=3, everything else -1.
+ int encoding[256];
+
+ coords last_GT;
+ coords last_CT;
+ coords last_AG;
+ coords last_AC;
+
+ //sim4_args_t rs;
+
+ int wordExtensionAllowance;
+
+ int wordSize;
+ int wordSizeInt;
+ int wordSizeExt;
+
+ char *spacedSeed;
+ char *spacedSeedInt;
+ char *spacedSeedExt;
+
+ mss_t spacedSeedMSS;
+ mss_t spacedSeedIntMSS;
+ mss_t spacedSeedExtMSS;
+
+ int spliceModel;
+
+ int mspThreshold1;
+ int mspThreshold2;
+
+ int G_score;
+ int C_score;
+ Exon *exon_list;
+
+ int mask;
+
+ struct hash_table *hashtable;
+ struct hash_table phashtable;
+ struct hash_table thashtable;
+
+ // For high-accuracy switch
+ //
+ bool _accurateSequences;
+
+#ifdef __APPLE__
+ bool _pad[7];
+#endif
+
+
+
+ void adjustBoundariesOfMarginalExons(Exon *Lblock);
+ void findLastAGandAC(Exon *tmp_block1);
+ void findLastGTandCT(Exon *tmp_block);
+
+
+ // Functions from Xtend1.c
+ //
+ typedef struct ValNode {
+ void *data;
+ struct ValNode *next;
+ } *ValNodePtr;
+ // Passes each of the 'size' entries of val, then val itself, to ckfree().
+ void free_coords(coords ***val, int size) {
+ for(int i=0; i<size; ++i)
+ ckfree(val[i]);
+ ckfree(val);
+ };
+ // Dispatch to Xextend_bw() when _accurateSequences is set (with a warning on stderr), otherwise to extend_bw().
+ int EXTEND_BW(char *a, char *b, int c, int d, int e, int f, int *g, int *h) {
+ if (_accurateSequences) {
+ fprintf(stderr, "Xextend_bw -- Xextend_bw NOT OPTIMIZED\n");
+ return(Xextend_bw(a,b,c,d,e,f,g,h));
+ } else {
+ return(extend_bw(a,b,c,d,e,f,g,h));
+ }
+ };
+ // Forward counterpart of EXTEND_BW. NOTE(review): the message below still starts with "Xextend_bw" -- possibly a copy/paste slip; left as is.
+ int EXTEND_FW(char *a, char *b, int c, int d, int e, int f, int *g, int *h) {
+ if (_accurateSequences) {
+ fprintf(stderr, "Xextend_bw -- Xextend_fw NOT OPTIMIZED\n");
+ return(Xextend_fw(a,b,c,d,e,f,g,h));
+ } else {
+ return(extend_fw(a,b,c,d,e,f,g,h));
+ }
+ };
+
+ void Xextend_link_to_data_list(void *data, ValNodePtr *head, ValNodePtr *prev);
+ void Xextend_ValNodeFreeData(ValNodePtr data_list);
+ int Xextend_bw(char *,char *,int,int,int,int,int *,int *);
+ int Xextend_fw(char *,char *,int,int,int,int,int *,int *);
+
+
+
+
+ // Functions from align.c
+ //
+ void path(int,int,char,int,int,char,int,edit_script**,edit_script**);
+ void align_path(int,int,int,int,int,edit_script**,edit_script**);
+ int get_dist(int, int, int, int, int);
+ int align_get_dist(int, int, int, int, int);
+ void Condense_script(edit_script *);
+ void Condense_both_Ends(edit_script **, edit_script **, edit_script **);
+ void Flip_script(struct edit_script **);
+#ifdef AUXUTILS
+ void Reverse_script(edit_script *);
+ void Print_script(edit_script *head, int M, int N);
+#endif
+ void S2A(edit_script *, int *);
+ void align_reverse(int *);
+ void Free_script(edit_script *);
+ // Starting at x on diagonal k (y = x+k), advance while _genSeq[x] equals _estSeq[y] and both stay below endx/endy; returns the final x. A negative x is returned unchanged.
+ int snake(int k, int x, int endx, int endy) {
+ if (x<0) return(x);
+ int y = x+k;
+ while ((x<endx) && (y<endy) && (_genSeq[x] == _estSeq[y])) {
+ ++x;
+ ++y;
+ }
+ return(x);
+ };
+
+ // Reverse counterpart of snake(): step back while the preceding characters match and x, y stay above startx/starty. An x greater than m is returned unchanged.
+ int rsnake(int k, int x, int startx, int starty, int m) {
+ if (x>m) return(x);
+ int y = x+k;
+ while ((x>startx) && (y>starty) && (_genSeq[x-1] == _estSeq[y-1])) {
+ --x;
+ --y;
+ }
+ return(x);
+ };
+
+
+ bool checkExonsForOverlaps(Exon *theExons);
+ void appendExons(sim4polishBuilder &B, Exon *theExon);
+ void maskExonsFromSeeds(sim4command *cmd, Exon *theExons);
+ void maskExonsFromGenomic(Exon *theExons, char *f, char *r, int l);
+
+ void IDISPLAY(sim4polishBuilder &builder,
+ char *aString,
+ char *bString,
+ char A[],
+ char B[],
+ int M,
+ int N,
+ int S[],
+ int AP,
+ int BP,
+ int est_strand,
+ Exon *exons);
+ void appendAlignments(sim4polishBuilder &builder,
+ char *seq1,
+ char *seq2,
+ int len1,
+ int len2,
+ edit_script_list *Aligns,
+ Exon *Exons,
+ int match_ori);
+
+
+
+
+
+
+
+
+ // Functions from sim4b1.h
+ //
+ int good_ratio(int);
+ void flip_list(Exon **,Exon **);
+ void free_align(edit_script_list *);
+ void complement_exons(Exon **,int,int);
+
+
+ void bld_table(char *,int,mss_t,int);
+
+ int SIM4_block1(Exon* &Lblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1);
+ int SIM4_block2(Exon* &tmp_Lblock,
+ Exon* &tmp_Rblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1);
+ int SIM4_block3(bool good_match,
+ Exon* &tmp_Lblock,
+ Exon* &tmp_Rblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1);
+ int SIM4_block4(bool good_match,
+ Exon* &tmp_Lblock,
+ Exon* &tmp_Rblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1);
+
+ struct edit_script_list *SIM4(int *,
+ Exon* *,
+ int *pA,
+ int *pT,
+ sim4_stats_t *);
+
+ void merge(Exon **,Exon **);
+ bool get_sync_flag(Exon *, Exon *, int);
+ void slide_intron(int w, Exon *, Exon *, int, sim4_stats_t *);
+ void sync_slide_intron(int w, Exon *, Exon *, int, sim4_stats_t *);
+ void filter(Exon **,Exon **);
+ void wobble(Exon *, Exon *, const char *, const char *, char *seq1);
+ Exon *bmatch(char *,char *,int,int,int,int);
+ Exon *fmatch(char *,char *,int,int,int,int);
+ void compact_list(Exon **Lblock, Exon **Rblock, int SI);
+ int resolve_overlap(Exon *,Exon *,char *);
+ int greedy(char *,char *,int,int,int,int,Exon **, Exon **);
+ int extend_bw(char *,char *,int,int,int,int,int *,int *);
+ int extend_fw(char *,char *,int,int,int,int,int *,int *);
+ // Dispatch to sync_slide_intron() when f is 1, otherwise to slide_intron().
+ void SLIDE_INTRON(int a, Exon *b, Exon *c, int d, sim4_stats_t *e, int f) {
+ if (f == 1) {
+ return(sync_slide_intron(a,b,c,d,e));
+ } else {
+ return(slide_intron(a,b,c,d,e));
+ }
+ };
+
+ void pluri_align(int *dist_ptr,
+ Exon *lblock,
+ struct edit_script_list **Aligns,
+ sim4_stats_t *st);
+ void updateStatistics(Exon *theExon,
+ sim4_stats_t *st);
+
+ void get_stats(Exon *,sim4_stats_t *);
+ int get_edist(int,int,int,int,char *,char *);
+
+ void add_word(int,int);
+ int extend_hit(int,int,const char *const,const char * const,int,int,int,int);
+ void search(char *,char *,int,int,mss_t);
+ void exon_cores(char*,char*,int,int,int,int,int,mss_t,int,int);
+ // Returns the node of the list starting at head whose next_exon is target, or NULL when there is none.
+ Exon *find_previous(Exon *head, Exon *target) {
+ while (head && (head->next_exon != target))
+ head = head->next_exon;
+ return(head);
+ };
+
+ bool get_match_quality(Exon *,Exon *,sim4_stats_t *,int);
+#if 0
+ // Dead code, 05 apr 2004, bpw
+ void check_consistency_intron_ori(Exon *,int,char *);
+#endif
+
+
+ // sim4b1_s.h
+ //
+ public:
+ mss_t masks_shifts(char *);
+ int mask_shift(uint64, mss_t);
+
+private:
+ // splice.h
+ //
+
+#define MAX_SPAN 80
+
+ void splice_donor(char *xseq, char *yseq, int M, int N, double *gt_score,
+ double *ct_score, double **max_Gf, double **max_Cf,
+ int **start_Gi, int **start_Ci);
+ void splice_donor_uni(char *xseq, char *yseq, int M, int N,
+ double *It_score, double **max_IF, int **end_Ii);
+ void splice_acceptor(char *xseq, char *yseq, int M, int N,
+ double *ag_score, double *ac_score, double **max_Gb,
+ double **max_Cb, int **end_Gi, int **end_Ci);
+ void splice_acceptor_uni(char *xseq, char *yseq, int M, int N,
+ double *aI_score, double **max_Ib, int **end_Ii);
+ void splice_init(int spl_model);
+ void splice_close();
+
+ void loadGeneSplicerModel (void);
+ void loadGlimmerModel (char *train_dir);
+ double ScoreDonor_Glimmer (char *asegment, char *train_dir);
+ double ScoreAcceptor_Glimmer (char *asegment, char *train_dir);
+
+ void splice_original(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR);
+ void splice_GeneSplicer(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR);
+ void splice_Glimmer(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR,
+ char *asegmentL, char *asegmentR);
+ // Step table: 9 for n<=4, 10 for n<=8, 12 for anything larger; a negative n is fatal.
+ int stepct(int n) {
+ if (n<0) fatal("splice.c: Negative value in stepct().");
+ if (n<=4) return 9; // return((int)1.6*8);
+ if (n<=8) return 10; // return((int)1.6*9);
+ if (n<=12) return 12; // return((int)1.6*10);
+ return 12; // return((int)1.6*11);
+ };
+
+ splice_t *new_splice(char,int,int,int,int,double,splice_t *);
+ void splice(char *in_seqx, int ls, int us, int le, int ue, char *in_seqy, int ys, int ye,
+ splice_t **gcell, splice_t **ccell, int ori, int spl_model);
+
+
+ // Functions and defines from GeneSplicer header files: sites.h
+ //
+
+#define ALPHABET_SIZE 4
+
+ void init_GeneSPlicer ();
+ double ScoreAcceptor_GeneSplicer (char *);
+ double ScoreDonor_GeneSplicer (char *);
+ void UnLoadSites_GeneSplicer ();
+
+ // ... sites_score.h
+
+#define NUM_VALUES_SCORES 2560
+
+ double score_ex_acc[NUM_VALUES_SCORES];
+ double score_in_acc[NUM_VALUES_SCORES];
+ double score_ex_don[NUM_VALUES_SCORES];
+ double score_in_don[NUM_VALUES_SCORES];
+
+ // ... sites_donor.h
+
+#define NUM_MODELS_DON 25
+#define NUM_VALUES_DON 928
+
+ double don[NUM_MODELS_DON][NUM_VALUES_DON];
+
+ // ... sites_acceptor.h
+
+#define NUM_MODELS_ACC 25
+#define NUM_VALUES_ACC 928
+
+ double acc[NUM_MODELS_ACC][NUM_VALUES_ACC];
+
+
+ // Functions from misc.h
+ // fatal() flushes stdout, prints msg to stderr and terminates the process with SIGKILL.
+ void fatal(const char *msg) {
+ fflush(stdout);
+ fprintf(stderr, "%s\n", msg);
+ fflush(stderr);
+ kill(getpid(), SIGKILL);
+ };
+
+
+ // Poly-A/T masking stuff
+ //
+#define T_ONLY 1
+#define A_ONLY 2
+#define BOTH_AT 3
+ // Parameter order matches the definition in poly.C and its callers: the poly-T result comes first, then poly-A.
+ void get_polyAT(char *seq,
+ int len,
+ int *pT,
+ int *pA,
+ int flag=BOTH_AT);
+
+ //void remove_poly(struct edit_script_list **,Exon *,char *,char *,int,int *,int *);
+
+
+ void remove_polyA_back(struct edit_script_list **,Exon *,char *,char*,int,int *);
+ void remove_polyT_front(struct edit_script_list **,Exon *,char *,char*,int *);
+ void trim_polyT_align(struct edit_script_list **,Exon **,const int,int *,char *,char *);
+ void trim_polyA_align(struct edit_script_list **,Exon *,Exon **,const int,int *,char *,char *);
+
+
+
+/* reverse a list of edit script chains */
+void
+script_flip_list(edit_script_list **left) {
+ edit_script_list *ep, *ahead, *behind;
+
+ ahead = *left;
+ ep = NULL;
+ while (ahead!=NULL) {
+ behind = ep; ep = ahead;
+ ahead = ahead->next_script;
+ ep->next_script = behind;
+ }
+ *left = ep;
+}
+ // Integer percent identity: 0 for an empty alignment, 100 only when there are no edits, otherwise round(100*(1 - 2*numEdits/alignLen)) capped at 99.
+ int computePercentIdentity(int numEdits, int alignLen) {
+ if (alignLen == 0)
+ return(0);
+ if (numEdits == 0)
+ return 100;
+ int pctId = (int)(round(100.0 * (1 - 2.0 * numEdits / alignLen)));
+
+ return ((pctId < 100) ? pctId : 99);
+ };
+
+
+public:
+ Sim4(sim4parameters *p) {
+ globalParams = p;
+
+ _pallochandle = pallochandle(64 * 1024);
+
+ _genSeq = 0L;
+ _estSeq = 0L;
+ _genLen = 0;
+ _estLen = 0;
+
+ for (uint32 i=256; i;)
+ encoding[--i] = -1;
+ encoding[(int)'A'] = encoding[(int)'a'] = 0;
+ encoding[(int)'C'] = encoding[(int)'c'] = 1;
+ encoding[(int)'G'] = encoding[(int)'g'] = 2;
+ encoding[(int)'T'] = encoding[(int)'t'] = 3;
+
+ last_GT.pos1 = last_GT.pos2 = 0;
+ last_CT.pos1 = last_CT.pos2 = 0;
+ last_AG.pos1 = last_AG.pos2 = 0;
+ last_AC.pos1 = last_AC.pos2 = 0;
+
+ wordExtensionAllowance = 12;
+
+ _mspManager.setLimits(globalParams->_mspLimitAbsolute, globalParams->_mspLimitPercent);
+ _mspManager.setParameters(globalParams->_match,
+ globalParams->_imismatch,
+ globalParams->_vmismatch,
+ globalParams->_percentError,
+ wordExtensionAllowance);
+ _mspManager.setExonSource(&_exonManager);
+
+ // wordSize -- for finding initial seeds
+ // wordSizeInt -- for extending seeds between seeds
+ // wordSizeExt -- for extending seeds on the ends
+ //
+ // If sim4parameters defined wordSizeInt or Ext use that,
+ // otherwise, use the original method.
+ //
+ wordSize = globalParams->_wordSize;
+ wordSizeInt = MIN(8, globalParams->_wordSize);
+ wordSizeExt = MIN(10, globalParams->_wordSize);
+
+ if (globalParams->_wordSizeInt) wordSizeInt = globalParams->_wordSizeInt;
+ if (globalParams->_wordSizeExt) wordSizeExt = globalParams->_wordSizeExt;
+
+ spacedSeed = globalParams->_spacedSeed;
+ spacedSeedInt = globalParams->_spacedSeedInt;
+ spacedSeedExt = globalParams->_spacedSeedExt;
+
+ spacedSeedMSS = mss_t(spacedSeed);
+ spacedSeedIntMSS = mss_t(spacedSeedInt);
+ spacedSeedExtMSS = mss_t(spacedSeedExt);
+
+ spliceModel = globalParams->_spliceModel;
+
+ if (!globalParams->_dontForceCanonicalSplicing)
+ splice_init(spliceModel);
+
+ mspThreshold1 = globalParams->_mspThresh1; //K;
+ mspThreshold2 = globalParams->_mspThresh2; //C;
+
+ G_score = 0;
+ C_score = 0;
+
+ exon_list = 0L;
+
+ mask = 0;
+
+ hashtable = 0L;
+
+ phashtable.nextPos = 0L;
+ phashtable.nodes = 0L;
+ phashtable.nodesused = 0;
+
+ thashtable.nextPos = 0L;
+ thashtable.nodes = 0L;
+ thashtable.nodesused = 0;
+
+ _accurateSequences = false;
+ };
+ // Releases the allocation pool and the arrays of both hash tables.
+ ~Sim4() {
+ //pdumppalloc(_pallochandle);
+ pfree2(_pallochandle);
+ pfreehandle(_pallochandle);
+
+ delete [] phashtable.nextPos;
+ delete [] phashtable.nodes;
+
+ delete [] thashtable.nextPos;
+ delete [] thashtable.nodes;
+ };
+
+ sim4polishList *run(sim4command *cmd);
+ // Allocates from the pool behind _pallochandle.
+ void *ckalloc(size_t size) {
+ return(palloc2(size, _pallochandle));
+ };
+ // Intentionally a no-op: the pool is released as a whole in the destructor.
+ void ckfree(void *) {
+ };
+
+};
+
+
+
+
+#endif // SIM4_H
diff --git a/libsim4/sim4core/sim4b1-1.C b/libsim4/sim4core/sim4b1-1.C
new file mode 100644
index 0000000..54cd949
--- /dev/null
+++ b/libsim4/sim4core/sim4b1-1.C
@@ -0,0 +1,118 @@
+#include "sim4.H"
+// Resolve the cDNA overlap between two consecutive exons by picking the
+// split point whose flanking genomic bases best match GT..AG or CT..AC.
+// Returns u such that tmp_block should end at EST position u and tmp_block1
+// start at u+1; without an overlap this is simply tmp_block1->frEST-1.
+int
+Sim4::resolve_overlap(Exon *tmp_block, Exon *tmp_block1, char *seq) {
+  int   diff, best_u, l0, l1, u, cost;
+  int   GTAG_score, CTAC_score;
+  char *s1, *s2, *e1;
+
+  diff = tmp_block1->frEST-tmp_block->toEST-1;
+  if (diff>=0)
+    return (tmp_block1->frEST-1);
+
+  /* u-1 = actual position in the sequence */
+  // l0 grows and l1 shrinks by one for every step the split point advances.
+  l0 = tmp_block->length-diff;
+  l1 = tmp_block1->length;
+  // s1 addresses the two bases tested against GT/CT, s2 the two tested against AG/AC.
+  best_u = u = tmp_block1->frEST-1;
+  s1 = seq+tmp_block->toGEN-(tmp_block->toEST-u);
+  s2 = seq-2+tmp_block1->frGEN+u-tmp_block1->frEST;
+
+  cost = 0;   // best score so far, signed: positive for GT..AG, negative for CT..AC
+  e1 = seq+tmp_block->toGEN;
+  while (s1<=e1) {
+    GTAG_score = CTAC_score = 0;
+    GTAG_score += ((char)(*s1)=='G') ? 1 : 0;
+    GTAG_score += ((char)(*(s1+1))=='T') ? 1 : 0;
+    GTAG_score += ((char)(*s2)=='A') ? 1 : 0;
+    GTAG_score += ((char)(*(s2+1))=='G') ? 1 : 0;
+
+    if (GTAG_score > abs(cost) && ((l0>=8) || (l1>=8))) {
+      cost = GTAG_score;
+      best_u = u;
+      if (cost == 4) break;
+    }
+
+    CTAC_score += ((char)(*s1)=='C') ? 1 : 0;
+    CTAC_score += ((char)(*(s1+1))=='T') ? 1 : 0;
+    CTAC_score += ((char)(*s2)=='A') ? 1 : 0;
+    CTAC_score += ((char)(*(s2+1))=='C') ? 1 : 0;
+
+    if (CTAC_score > abs(cost)) {
+      cost = -CTAC_score;
+      best_u = u;
+      if (cost == -4) break;   // cost is negative here, so a perfect CT..AC is -4 (the old test against 4 never fired)
+    }
+
+    u++; s1++; s2++;
+    l0++; l1--;
+  }
+
+  return best_u;
+}
+// Split the overlap between tmp_block and tmp_block1 at the point chosen by
+// resolve_overlap(); an exon left shorter than 8 bases is unlinked.
+// Returns 1 when an exon was dropped (see the conditions below), else 0.
+int
+Sim4::SIM4_block1(Exon* &Lblock,
+                  Exon* &tmp_block,
+                  Exon* &tmp_block1) {
+  int        rolledBack = 0;
+  const int  splitAt    = resolve_overlap(tmp_block, tmp_block1, _genSeq);
+
+  // The right exon now starts one base past the split, on both sequences.
+  tmp_block1->frGEN += splitAt + 1 - tmp_block1->frEST;
+  tmp_block1->frEST  = splitAt + 1;
+
+  // Drop the right exon if either of its spans fell below 8 bases; the
+  // left exon inherits its successor and its flag.
+  if ((tmp_block1->toEST - tmp_block1->frEST < 7) ||
+      (tmp_block1->toGEN - tmp_block1->frGEN < 7)) {
+    tmp_block->next_exon = tmp_block1->next_exon;
+    tmp_block->flag      = tmp_block1->flag;
+    tmp_block1           = NULL;   // storage is garbage collected
+    rolledBack           = 1;
+  }
+
+  // The left exon now ends at the split, on both sequences.
+  tmp_block->toGEN -= tmp_block->toEST - splitAt;
+  tmp_block->toEST  = splitAt;
+
+  const int leftEstLen = tmp_block->toEST - tmp_block->frEST + 1;
+  const int leftGenLen = tmp_block->toGEN - tmp_block->frGEN + 1;
+
+  if ((leftEstLen < 8) || (leftGenLen < 8)) {
+    // The left exon is too short as well: unlink it from the list.
+    Exon *before = find_previous(Lblock, tmp_block);
+
+    if (before == 0L) {
+      fprintf(stderr, "SIM4_block1(): Corrupted exon list, cannot find the previous exon.\n");
+      for (; Lblock; Lblock = Lblock->next_exon)
+        fprintf(stderr,
+                (tmp_block == Lblock)
+                ? " GEN f=%8d t=%8d EST f=%8d t=%8d flag=%d <- tried to find previous of this one\n"
+                : " GEN f=%8d t=%8d EST f=%8d t=%8d flag=%d\n",
+                Lblock->frGEN, Lblock->toGEN, Lblock->frEST, Lblock->toEST, Lblock->flag);
+      kill(getpid(), SIGKILL);
+    }
+
+    before->next_exon = tmp_block->next_exon;
+    before->flag      = tmp_block->flag;
+    if (leftEstLen > 0)
+      rolledBack = 1;
+    tmp_block = before;   // storage of the old exon is garbage collected
+  }
+
+  // Refresh the cached lengths of whichever exons survived; nodes with
+  // toGEN == 0 are left untouched.
+  if (tmp_block->toGEN)
+    tmp_block->length = tmp_block->toEST - tmp_block->frEST + 1;
+  if (tmp_block1 && tmp_block1->toGEN)
+    tmp_block1->length = tmp_block1->toEST - tmp_block1->frEST + 1;
+
+  return(rolledBack);
+}
diff --git a/libsim4/sim4core/sim4b1-2.C b/libsim4/sim4core/sim4b1-2.C
new file mode 100644
index 0000000..fe51fd2
--- /dev/null
+++ b/libsim4/sim4core/sim4b1-2.C
@@ -0,0 +1,84 @@
+#include "sim4.H"
+
+// Fill an interior EST gap, i.e. the unaligned EST stretch between two
+// existing exons tmp_block and tmp_block1.
+//
+// A gap of at most MAX_GRINIT bases is first handed to greedy(), which
+// receives &tmp_Lblock/&tmp_Rblock.  If greedy() is too costly (or the gap
+// was too long to try), and neither neighbour is flagged, the gap is searched
+// again with exon_cores(); the resulting exon_list is returned through
+// tmp_Lblock (head) and tmp_Rblock (tail).
+//
+// Returns 1 when exon_cores()/relinking produced at least one exon for the
+// gap, 0 otherwise.
+//
+int
+Sim4::SIM4_block2(Exon* &tmp_Lblock,
+                  Exon* &tmp_Rblock,
+                  Exon* &tmp_block,
+                  Exon* &tmp_block1) {
+
+  // Number of EST bases between the end of tmp_block and the start of tmp_block1.
+  int   diff      = (int)(tmp_block1->frEST - tmp_block->toEST - 1);
+
+  // A gap longer than MAX_GRINIT is never given to greedy(); it is treated
+  // as having exceeded the cost limit.
+  bool  tooCostly = true;
+
+  //fprintf(stderr, "Called SIM4_block2()\n");
+
+  if (diff <= MAX_GRINIT) {
+    int cost = greedy(_estSeq + tmp_block->toEST,
+                      _genSeq + tmp_block->toGEN,
+                      diff,
+                      tmp_block1->frGEN-tmp_block->toGEN-1,
+                      tmp_block->toEST,tmp_block->toGEN,
+                      &tmp_Lblock, &tmp_Rblock);
+#if 0
+    printf("greedy returned cost %d (limit:%d)\n", cost, MAX(wordSize,(int)(globalParams->_percentError * diff + 1)));
+#endif
+    tooCostly = (cost > MAX(wordSize,(int)(globalParams->_percentError * diff + 1)));
+  }
+
+  //PRINTEXONS("greedy\n", tmp_Lblock);
+
+  // Cheap enough: whatever greedy() left in tmp_Lblock/tmp_Rblock stands.
+  if (tooCostly == false)
+    return(0);
+
+  // One of the neighbours is flagged: nothing is inserted.
+  if (tmp_block->flag || tmp_block1->flag) {
+    tmp_Lblock = tmp_Rblock = NULL;
+    return(0);
+  }
+
+  exon_cores(_genSeq+tmp_block->toGEN-1,
+             _estSeq+tmp_block->toEST-1,
+             tmp_block1->frGEN-tmp_block->toGEN-1,
+             diff,
+             tmp_block->toGEN+1,
+             tmp_block->toEST+1,
+             1,
+             spacedSeedIntMSS,
+             mspThreshold2,
+             TEMP);
+
+  //PRINTEXONS("1\n", exon_list);
+
+  // Head and tail of the list found for the gap.
+  tmp_Lblock = exon_list;
+  for (tmp_Rblock = exon_list; (tmp_Rblock != NULL) && (tmp_Rblock->next_exon != NULL); )
+    tmp_Rblock = tmp_Rblock->next_exon;
+
+  // Decide whether the gap looks like it hides a large intron.
+  bool  relink = false;
+
+  if (tmp_Lblock == NULL) {
+    relink = (tmp_block1->frGEN-tmp_block->toGEN>50000);
+  } else {
+    bool  bigLeadingGap  = ((tmp_Lblock->frEST-tmp_block->toEST>100) &&
+                            (tmp_Lblock->frGEN-tmp_block->frGEN>50000));
+    bool  bigTrailingGap = ((tmp_block1->frEST-tmp_Rblock->toEST>100) &&
+                            (tmp_block1->frGEN-tmp_Rblock->frGEN>50000));
+
+    relink = (bigLeadingGap || bigTrailingGap);
+  }
+
+  if (relink) {
+    /* possible large intron; increase the score weight */
+    //freeExonList(tmp_Lblock); garbage collected
+
+    exon_list = _mspManager.doLinking(globalParams->_relinkWeight,
+                                      DEFAULT_DRANGE,
+                                      tmp_block->toGEN + 1,
+                                      tmp_block->toEST + 1,
+                                      1,
+                                      true,
+                                      _genSeq, _estSeq);
+
+    //PRINTEXONS("1a\n", exon_list);
+
+    tmp_Lblock = exon_list;
+    for (tmp_Rblock = exon_list; (tmp_Rblock != NULL) && (tmp_Rblock->next_exon != NULL); )
+      tmp_Rblock = tmp_Rblock->next_exon;
+  }
+
+  _mspManager.clear();
+
+  // Tell the caller to stay on tmp_block only if something was found.
+  return((tmp_Lblock != NULL) ? 1 : 0);
+}
diff --git a/libsim4/sim4core/sim4b1-3.C b/libsim4/sim4core/sim4b1-3.C
new file mode 100644
index 0000000..8abadbd
--- /dev/null
+++ b/libsim4/sim4core/sim4b1-3.C
@@ -0,0 +1,116 @@
+#include "sim4.H"
+/*
+ * Handle the EST gap between tmp_block and tmp_block1 for the case SIM4()
+ * classifies as "the first gap" (tmp_block is presumably the leading bracket
+ * exon -- confirm against the caller).
+ *
+ * The start of tmp_block1 is extended backwards with EXTEND_BW(); if a gap
+ * remains and tmp_block is not flagged, the gap is searched with exon_cores()
+ * and, when it looks like a large intron, relinked with
+ * globalParams->_relinkWeight.  Any exons found are returned through
+ * tmp_Lblock (head) and tmp_Rblock (tail); both are NULL otherwise.
+ *
+ * Returns 1 when exons were found for the gap, else 0.
+ */
+int
+Sim4::SIM4_block3(bool good_match,
+ Exon* &tmp_Lblock,
+ Exon* &tmp_Rblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1) {
+ int I, J; // outputs of EXTEND_BW; I is used as an EST position, J as a genomic position
+ int rollbflag = 0;
+ int cost;
+
+ //fprintf(stderr, "Called SIM4_block3()\n");
+
+ /* start of seq; find last_AG, last_AC */
+ if (_accurateSequences)
+ findLastAGandAC(tmp_block1);
+
+ // These two blocks should do the same thing. The first one isn't readable.
+ // (Only the #else branch is compiled.)
+
+#if 0
+ int diff = (int)(tmp_block1->frEST - tmp_block->toEST - 1);
+ diff = (int)(MIN(diff,(int)(MAX_GRINIT/2)));
+
+ int u = MIN(4*diff,tmp_block1->frGEN-tmp_block->toGEN-1);
+ cost = EXTEND_BW(_estSeq+tmp_block->toEST+
+ (tmp_block1->frEST-tmp_block->toEST-1)-diff,
+ _genSeq+tmp_block->toGEN+
+ (tmp_block1->frGEN-tmp_block->toGEN-1)-u,
+ (int)diff, u,
+ tmp_block->toEST+
+ (tmp_block1->frEST-tmp_block->toEST-1)-diff,
+ tmp_block->toGEN+
+ (tmp_block1->frGEN-tmp_block->toGEN-1)-u,
+ &I, &J);
+#else
+ int diff = MIN(tmp_block1->frEST - tmp_block->toEST - 1, MAX_GRINIT/2); // EST bases to extend over, capped at MAX_GRINIT/2
+ int u = MIN(4*diff, tmp_block1->frGEN - tmp_block->toGEN - 1); // genomic bases to extend over, capped at the genomic gap
+
+ cost = EXTEND_BW(_estSeq + tmp_block1->frEST - 1 - diff,
+ _genSeq + tmp_block1->frGEN - 1 - u,
+ diff,
+ u,
+ tmp_block1->frEST - 1 - diff,
+ tmp_block1->frGEN - 1 - u,
+ &I,
+ &J);
+#endif
+
+
+ if ((good_match==0) || tmp_block->flag || (J==0) || (I==0)) { // accept the extension now
+ tmp_block1->frEST = I+1;
+ tmp_block1->frGEN = J+1;
+ tmp_block1->edist += cost;
+ tmp_block1->length = tmp_block1->toEST-tmp_block1->frEST+1;
+ }
+
+ /* use blast if marginal gap still exists, and this is first scan */
+ if (!(diff=(int)(tmp_block1->frEST-tmp_block->toEST-1)) || // diff is recomputed here from the (possibly moved) frEST
+ tmp_block->flag) {
+ /* blast-treated region or no gap */
+ tmp_Rblock = tmp_Lblock = NULL;
+ } else {
+ exon_cores(_genSeq+tmp_block->toGEN-1,
+ _estSeq+tmp_block->toEST-1,
+ tmp_block1->frGEN-tmp_block->toGEN-1,
+ diff,
+ tmp_block->toGEN+1,
+ tmp_block->toEST+1,
+ 1,
+ spacedSeedExtMSS,
+ mspThreshold2,
+ TEMP);
+
+ //PRINTEXONS("2\n", exon_list);
+
+ tmp_block -> flag = 1; // mark tmp_block so this gap is not searched again
+ tmp_Lblock = tmp_Rblock = exon_list;
+ while (tmp_Rblock && tmp_Rblock->next_exon) // advance tmp_Rblock to the last exon found
+ tmp_Rblock = tmp_Rblock->next_exon;
+
+ if ((!tmp_Lblock && tmp_block1->frGEN-tmp_block->toGEN>50000) ||
+ (tmp_Lblock && (tmp_Lblock->frEST-tmp_block->toEST>100) &&
+ (tmp_Lblock->frGEN-tmp_block->frGEN>50000)) ||
+ (tmp_Lblock && (tmp_block1->frEST-tmp_Rblock->toEST>100) &&
+ (tmp_block1->frGEN-tmp_Rblock->frGEN>50000))) {
+ /* possible large intron; increase the score weight */
+ //freeExonList(tmp_Lblock); garbage collected
+
+ exon_list = _mspManager.doLinking(globalParams->_relinkWeight,
+ DEFAULT_DRANGE,
+ tmp_block->toGEN + 1,
+ tmp_block->toEST + 1,
+ 1,
+ true,
+ _genSeq, _estSeq);
+
+ //PRINTEXONS("2a\n", exon_list);
+
+ tmp_Lblock = tmp_Rblock = exon_list;
+ while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
+ tmp_Rblock = tmp_Rblock->next_exon;
+ }
+ _mspManager.clear();
+
+ if (tmp_Lblock) {
+ rollbflag = 1; // new exons found; the caller stays on tmp_block
+ } else {
+ // Nothing found: fall back to the EXTEND_BW result.
+ // NOTE(review): if the first if-block above also ran, cost is added to edist twice -- confirm this is intended.
+ tmp_block1->frEST = I+1;
+ tmp_block1->frGEN = J+1;
+ tmp_block1->edist += cost;
+ tmp_block1->length = tmp_block1->toEST-tmp_block1->frEST+1;
+ }
+ }
+ return(rollbflag);
+}
diff --git a/libsim4/sim4core/sim4b1-4.C b/libsim4/sim4core/sim4b1-4.C
new file mode 100644
index 0000000..aca9a6a
--- /dev/null
+++ b/libsim4/sim4core/sim4b1-4.C
@@ -0,0 +1,121 @@
+#include "sim4.H"
+/*
+ * Handle the EST gap between tmp_block and tmp_block1 for the case SIM4()
+ * treats as "the last gap" (tmp_block1 is presumably the trailing bracket
+ * exon -- confirm against the caller).
+ *
+ * The end of tmp_block is extended forwards with EXTEND_FW(); if a gap
+ * remains and tmp_block1 is not flagged, the gap is searched with
+ * exon_cores() and, when it looks like a large intron, relinked with
+ * globalParams->_relinkWeight.  Any exons found are returned through
+ * tmp_Lblock (head) and tmp_Rblock (tail); both are NULL otherwise.
+ *
+ * Returns 1 when exons were found for the gap, else 0.
+ */
+int
+Sim4::SIM4_block4(bool good_match,
+ Exon* &tmp_Lblock,
+ Exon* &tmp_Rblock,
+ Exon* &tmp_block,
+ Exon* &tmp_block1) {
+ int I, J; // outputs of EXTEND_FW; below, I is stored into toEST and J into toGEN
+ int rollbflag = 0;
+ int cost;
+
+ //fprintf(stderr, "Called SIM4_block4()\n");
+
+ if (_accurateSequences)
+ findLastGTandCT(tmp_block);
+
+ // These two blocks should do the same thing. The first one isn't readable.
+ // (Only the #else branch is compiled.)
+
+#if 0
+ int diff = (int)(tmp_block1->frEST - tmp_block->toEST - 1);
+ diff = (int)(MIN(diff,(int)(MAX_GRINIT/2)));
+
+ cost = EXTEND_FW(_estSeq+tmp_block->toEST,
+ _genSeq+tmp_block->toGEN,
+ diff,
+ MIN(4*diff,tmp_block1->frGEN-tmp_block->toGEN-1),
+ tmp_block->toEST,tmp_block->toGEN,
+ &I, &J);
+#else
+ int diff = MIN(tmp_block1->frEST - tmp_block->toEST - 1, MAX_GRINIT/2); // EST bases to extend over, capped at MAX_GRINIT/2
+ int u = MIN(4*diff, tmp_block1->frGEN - tmp_block->toGEN - 1); // genomic bases to extend over, capped at the genomic gap
+
+ cost = EXTEND_FW(_estSeq + tmp_block->toEST,
+ _genSeq + tmp_block->toGEN,
+ diff,
+ u,
+ tmp_block->toEST,
+ tmp_block->toGEN,
+ &I,
+ &J);
+#endif
+
+ // NOTE(review): I is stored into toEST and J into toGEN below, yet here I is compared with _genLen and J with _estLen -- the two lengths look swapped; confirm.
+ if ((good_match==0) || tmp_block1->flag || (I==_genLen) || (J==_estLen)) {
+ if (tmp_block->toGEN) { // tmp_block is a real exon: accept the extension
+ tmp_block->toEST = I;
+ tmp_block->toGEN = J;
+ tmp_block->edist += cost;
+ tmp_block->length = tmp_block->toEST-tmp_block->frEST+1;
+ tmp_Rblock = tmp_Lblock = NULL;
+ } else
+ /* special case: no initial exon */
+ tmp_Lblock = tmp_Rblock = NULL;
+ }
+ //PRINTEXONS("tmp_block after if\n", tmp_block);
+ /* use blast if marginal gap still exists, and this is first scan */
+ if (!(diff=(int)(tmp_block1->frEST-tmp_block->toEST-1)) || // diff is recomputed here from the (possibly moved) toEST
+ tmp_block1->flag) {
+ /* blast-treated region or no gap */
+ tmp_Rblock = tmp_Lblock = NULL;
+ } else {
+ //PRINTEXONS("tmp_block\n", tmp_block);
+ //PRINTEXONS("tmp_block1\n", tmp_block);
+ exon_cores(_genSeq+tmp_block->toGEN-1,
+ _estSeq+tmp_block->toEST-1,
+ tmp_block1->frGEN-tmp_block->toGEN-1,
+ diff,
+ tmp_block->toGEN+1,
+ tmp_block->toEST+1,
+ 1,
+ spacedSeedExtMSS,
+ mspThreshold2,
+ TEMP);
+
+ //PRINTEXONS("3\n", exon_list);
+
+ tmp_Lblock = tmp_Rblock = exon_list;
+ while (tmp_Rblock && tmp_Rblock->next_exon) // advance tmp_Rblock to the last exon found
+ tmp_Rblock = tmp_Rblock->next_exon;
+
+ if ((!tmp_Lblock && tmp_block1->frGEN-tmp_block->toGEN>50000) ||
+ (tmp_Lblock && (tmp_Lblock->frEST-tmp_block->toEST>100) &&
+ (tmp_Lblock->frGEN-tmp_block->frGEN>50000)) ||
+ (tmp_Lblock && (tmp_block1->frEST-tmp_Rblock->toEST>100) &&
+ (tmp_block1->frGEN-tmp_Rblock->frGEN>50000))) {
+ /* possible large intron; increase the score weight */
+ //freeExonList(tmp_Lblock); garbage collected
+
+ exon_list = _mspManager.doLinking(globalParams->_relinkWeight,
+ DEFAULT_DRANGE,
+ tmp_block->toGEN + 1,
+ tmp_block->toEST + 1,
+ 1,
+ true,
+ _genSeq, _estSeq);
+
+ //PRINTEXONS("3a\n", exon_list);
+
+ tmp_Lblock = tmp_Rblock = exon_list;
+ while ((tmp_Rblock!=NULL) && (tmp_Rblock->next_exon!=NULL))
+ tmp_Rblock = tmp_Rblock->next_exon;
+ }
+ _mspManager.clear();
+
+ tmp_block1->flag = 1; // mark tmp_block1 so this gap is not searched again
+ if (tmp_Lblock) {
+ rollbflag = 1; // new exons found; the caller stays on tmp_block
+ } else {
+ // Nothing found: fall back to the EXTEND_FW result.
+ // NOTE(review): if the first if-block above also ran, cost is added to edist twice -- confirm this is intended.
+ if (tmp_block->toGEN) {
+ tmp_block->toEST = I;
+ tmp_block->toGEN = J;
+ tmp_block->edist += cost;
+ tmp_block->length = tmp_block->toEST-tmp_block->frEST+1;
+ tmp_Rblock = tmp_Lblock = NULL;
+ } else
+ /* special case: no initial exon */
+ tmp_Lblock = tmp_Rblock = NULL;
+ }
+ }
+ return(rollbflag);
+}
diff --git a/libsim4/sim4core/sim4b1.C b/libsim4/sim4core/sim4b1.C
new file mode 100644
index 0000000..0300382
--- /dev/null
+++ b/libsim4/sim4core/sim4b1.C
@@ -0,0 +1,333 @@
+#include "sim4.H"
+/*
+ * PRINTEXONS(S, L): debugging aid.  With DEBUG_EXONS defined it calls
+ * printList(S) on the exon list L (L must then be non-NULL, since it is
+ * dereferenced); without it the macro expands to nothing.
+ */
+#ifdef DEBUG_EXONS
+#define PRINTEXONS(S, L) (L)->printList(S)
+#else
+#define PRINTEXONS(S, L)
+#endif
+/*
+ * Turn the initial exon list (exon_list) into the final exon set and build
+ * the edit script for it.
+ *
+ * Steps, in order:
+ *   1. give up if the MSP manager reports too many MSPs;
+ *   2. relink with globalParams->_relinkWeight when the first or last exon
+ *      leaves a large unaligned margin;
+ *   3. enclose the list in two bracket exons;
+ *   4. walk adjacent exon pairs, resolving EST overlaps (SIM4_block1) and
+ *      filling EST gaps (SIM4_block2 / SIM4_block3 / SIM4_block4);
+ *   5. compact (and, for interspecies, filter) the list, drop short exons at
+ *      both ends, adjust marginal boundaries, slide intron boundaries;
+ *   6. build the script with pluri_align() and optionally trim poly-A/T.
+ *
+ * Outputs: *dist_ptr, *Exons, *pA and *pT are reset on entry.  *Exons
+ * receives the list following the leading bracket exon when a script was
+ * produced, and stays 0L otherwise.  st->tooManyMSPs is always set.
+ *
+ * Returns the script head produced by pluri_align(), or 0L when there were
+ * too many MSPs.
+ */
+Sim4::edit_script_list *
+Sim4::SIM4(int *dist_ptr,
+ Exon **Exons,
+ int *pA,
+ int *pT,
+ sim4_stats_t *st) {
+
+ int rollbflag; // nonzero when a SIM4_blockN call asks the loop to stay on tmp_block
+ Exon *Lblock=0L, *tmp_Lblock=0L; // head of the whole list / head of a gap's new exons
+ Exon *Rblock=0L, *tmp_Rblock=0L; // tail of the whole list / tail of a gap's new exons
+ Exon *tmp_block=0L;
+ Exon *tmp_block1=0L;
+
+ *dist_ptr = 0;
+ *Exons = 0L;
+ *pA = 0;
+ *pT = 0;
+
+ //
+ // The call to exon_cores() that used to be here is now done in sim4string.
+ //
+
+ // See if there are too many MSPs found. If so, fail.
+ //
+ st->tooManyMSPs = false;
+ if (_mspManager.tooManyMSPs()) {
+ st->tooManyMSPs = true;
+ st->numberOfMatches = _mspManager.numberOfMSPs();
+ return(0L);
+ }
+
+ PRINTEXONS("initial exon set\n", exon_list);
+
+ tmp_block = Lblock = exon_list;
+ while (tmp_block) { // find Rblock, the last exon of the list
+ if (tmp_block->next_exon==NULL)
+ Rblock = tmp_block;
+ tmp_block = tmp_block->next_exon;
+ }
+
+ if (Lblock && // relink if more than 50000 genomic and 100 EST bases are unaligned at either end
+ ((Lblock->frGEN>50000 && Lblock->frEST>100) ||
+ ((_genLen - Rblock->toGEN > 50000) && (_estLen - Rblock->toEST > 100)))) {
+ //freeExonList(exon_list); garbage collected
+
+ exon_list = _mspManager.doLinking(globalParams->_relinkWeight,
+ DEFAULT_DRANGE,
+ 1,
+ 1,
+ 0,
+ true,
+ _genSeq, _estSeq);
+
+ PRINTEXONS("relink the initial stuff\n", exon_list);
+
+ tmp_block = Lblock = exon_list;
+ while (tmp_block) {
+ if (tmp_block->next_exon==NULL)
+ Rblock = tmp_block;
+ tmp_block = tmp_block->next_exon;
+ }
+ }
+ _mspManager.clear();
+
+ tmp_block = Lblock = exon_list; // recompute head and tail once more (same walk as above)
+ while (tmp_block) {
+ if (tmp_block->next_exon==NULL)
+ Rblock = tmp_block;
+ tmp_block = tmp_block->next_exon;
+ }
+
+ PRINTEXONS("initial exon set after possibly relinking\n", exon_list);
+
+ /* enclose the current path in the (0,0,0,0) and (M+1,N+1,0,0) brackets */
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "exon bracket at start\n");
+#endif
+ Lblock = _exonManager.newExon(0,0,0,0,0,0,0,Lblock); // leading bracket; the old head becomes its successor
+ if (Rblock == NULL) // empty list: the leading bracket is also the tail
+ Rblock = Lblock;
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "exon bracket at end; Lblock = 0x%08lx, Rblock = 0x%08lx\n", Lblock, Rblock);
+#endif
+ Rblock->next_exon = _exonManager.newExon(_genLen+1,_estLen+1,0,0,0,0,0,NULL); // trailing bracket; Rblock itself still names the last real exon
+
+ PRINTEXONS("initial exon set after inserting brackets\n", Lblock);
+
+ /* compute current statistics */
+ bool good_match = get_match_quality(Lblock, Rblock, st, _estLen);
+
+
+ PRINTEXONS("after get_match_quality\n", Lblock);
+
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "before big nasty while loop\n");
+#endif
+
+
+ tmp_block = Lblock;
+ while ((tmp_block1 = tmp_block->next_exon)!=NULL) { // examine each adjacent pair (tmp_block, tmp_block1)
+
+ PRINTEXONS("start of loop to fill in missing pieces\n", Lblock);
+
+ rollbflag = 0;
+
+ // This is the distance from this exon to the next exon
+ // in the EST
+ //
+ int diff = (int)(tmp_block1->frEST - tmp_block->toEST - 1);
+
+#ifdef SHOW_PROGRESS
+ fprintf(stdout, "tmp_block: %8d %8d %8d %8d %d diff=%d\n",
+ tmp_block->frGEN,
+ tmp_block->toGEN,
+ tmp_block->frEST,
+ tmp_block->toEST,
+ tmp_block->flag,
+ diff);
+#endif
+
+ if (diff) { // diff == 0 means the two exons abut in the EST; nothing to do
+
+ if (diff < 0) {
+ // If the diff is less than zero, then there is an overlap in
+ // the EST. Wobble the boundary using GTAG signals (so
+ // obviously, this won't work correctly if we are not cDNA).
+ //
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Called SIM4_block1() with diff=%d\n", diff);
+#endif
+ rollbflag = SIM4_block1(Lblock, tmp_block, tmp_block1);
+ } else {
+
+ // Otherwise, there is a gap in the EST, and we need to fill
+ // it in. This is done only if there is no overlap in the
+ // genomic.
+ //
+ if (tmp_block1->frGEN - tmp_block->toGEN - 1 > 0) {
+ if (tmp_block1->toEST &&
+ tmp_block->toEST) {
+ // We are not the first or last gap -- an interior gap
+ // between two exons.
+ //
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Called SIM4_block2()\n");
+#endif
+ rollbflag = SIM4_block2(tmp_Lblock,
+ tmp_Rblock,
+ tmp_block,
+ tmp_block1);
+ } else if (tmp_block1->toGEN) {
+ // Not the last gap, so must be the first gap.
+ //
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Called SIM4_block3()\n");
+#endif
+ rollbflag = SIM4_block3(good_match,
+ tmp_Lblock,
+ tmp_Rblock,
+ tmp_block,
+ tmp_block1);
+ } else {
+ // By default, the last gap.
+ //
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Called SIM4_block4()\n");
+#endif
+ rollbflag = SIM4_block4(good_match,
+ tmp_Lblock,
+ tmp_Rblock,
+ tmp_block,
+ tmp_block1);
+ }
+ } else {
+ // Overlapping genomic. What these do when set to
+ // NULL is unknown.
+ //
+ tmp_Rblock = tmp_Lblock = NULL;
+ }
+
+ // Merge block in the exon list; make connections to the
+ // previous list of blocks; maintain increasing order
+ //
+ if (tmp_Lblock) { // splice the new exons between tmp_block and tmp_block1
+ tmp_block->next_exon = tmp_Lblock;
+ tmp_Rblock->next_exon = tmp_block1;
+
+ PRINTEXONS("before merge tmp_block\n", tmp_block);
+ PRINTEXONS("before merge tmp_block1\n", tmp_block1);
+ PRINTEXONS("before merge tmp_Lblock\n", tmp_Lblock);
+ PRINTEXONS("before merge tmp_Rblock\n", tmp_Rblock);
+
+ merge(&tmp_block,&tmp_block1);
+ }
+ }
+ }
+
+ // If this exon block was not removed, move to the next. If it was removed,
+ // we're already there.
+ //
+ if (rollbflag == 0)
+ tmp_block = tmp_block1;
+ }
+
+
+ PRINTEXONS("all done -- final Lblock\n", Lblock);
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "sim4b1 -- before compact_list\n");
+#endif
+
+ /* compaction step; note: it resets the right end of the list to */
+ /* the last item in the block list */
+
+ compact_list(&(Lblock->next_exon), &Rblock, (globalParams->_interspecies ? SHORT_INTRON : wordSize));
+
+ if (globalParams->_interspecies)
+ filter(&Lblock, &Rblock);
+
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "sim4b1 -- before small block at start removal\n");
+#endif
+
+ /* eliminate marginal small blocks at the start of the sequence; */
+ /* resets the empty alignment to one block (Lblock) only */
+
+ tmp_block = Lblock->next_exon;
+
+ while ((tmp_block!=NULL) && (tmp_block->length<wordSize) && tmp_block->toGEN) { // skip leading exons shorter than wordSize
+ tmp_block1 = tmp_block;
+ tmp_block = tmp_block->next_exon;
+ //freeExon(tmp_block1); garbage collected
+ }
+ Lblock->next_exon = tmp_block;
+
+ PRINTEXONS("all done -- after removing small blocks at the start\n", Lblock);
+
+ // eliminate marginal small blocks at the end of the sequence
+ // XXX: Yes, there is a leak here. That's why we garbage collect!
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Rblock before end of list removal 0x%08lx\n", Rblock);
+#endif
+
+ Exon *last = Lblock->next_exon;
+ tmp_block = last;
+ while (tmp_block!=NULL) { // 'last' ends up as the final exon with length >= wordSize, if any
+ if (tmp_block->length>=wordSize)
+ last = tmp_block;
+ tmp_block = tmp_block->next_exon;
+ }
+ if (last && last->toGEN) // unlink the short exons after 'last' by pointing it at Rblock's successor
+ last->next_exon = Rblock->next_exon;
+ Rblock = last;
+
+#ifdef SHOW_PROGRESS
+ fprintf(stderr, "Rblock after end of list removal 0x%08lx\n", Rblock);
+#endif
+
+ PRINTEXONS("all done -- after removing small blocks at the end\n", Lblock);
+
+ /* if high accuracy requirement, adjust boundaries of marginal exons */
+ if (_accurateSequences)
+ adjustBoundariesOfMarginalExons(Lblock);
+
+ /* Slide exon boundaries for optimal intron signals */
+ if (globalParams->_slideIntrons) {
+ if (globalParams->_interspecies == 1) {
+ SLIDE_INTRON(MIN(15,MAX_SLIDE), Lblock->next_exon, Rblock, spliceModel, st, 1);
+ } else {
+ if (get_sync_flag(Lblock, Rblock, 6) == 1)
+ SLIDE_INTRON(6, Lblock->next_exon, Rblock, SPLICE_ORIGINAL, st, 1);
+ else
+ SLIDE_INTRON(6, Lblock->next_exon, Rblock, SPLICE_ORIGINAL, st, 0);
+ }
+ } else {
+ // Set orientation flag on introns to be unknown -- this has an
+ // undesired side effect of forcing the resulting match to have a
+ // strand orientation the same as the intron orientation (if one
+ // exon) instead of 'unknown'.
+ Exon *t0 = Lblock->next_exon;
+ Exon *t1 = NULL;
+
+ while (t0 && (t1=t0->next_exon) && t1->toGEN) { // every exon that is followed by another real exon
+ t0->ori = 'E';
+ t0 = t1;
+ }
+ }
+
+ /* decreasingly; script will be in reverse order */
+
+ struct edit_script_list *Shead = NULL;
+
+ flip_list(&Lblock, &Rblock);
+ pluri_align(dist_ptr, Lblock, &Shead, st);
+ flip_list(&Lblock, &Rblock); /* increasingly */
+
+ *pT = 0; // already zero from the top of the function
+ *pA = 0;
+ if (Shead) {
+ if (globalParams->_ignorePolyTails) {
+ remove_polyT_front(&Shead, Lblock, _genSeq, _estSeq, pT);
+ remove_polyA_back(&Shead, Lblock, _genSeq, _estSeq, _estLen, pA);
+
+ if (*pA || *pT)
+ updateStatistics(Lblock, st);
+ }
+
+ get_stats(Lblock, st);
+
+ *Exons = Lblock->next_exon; // hand back the list without the leading bracket exon
+ //freeExon(Lblock); garbage collected
+ } else {
+ *Exons = 0L;
+
+ //freeExonList(Lblock); garbage collected
+ }
+
+ // Memory leak when Script_head == 0L -- see pluri_align, too!
+ // ("Script_head" here presumably refers to Shead.)
+
+ return(Shead);
+}
diff --git a/libsim4/sim4core/sim4b1_s.C b/libsim4/sim4core/sim4b1_s.C
new file mode 100644
index 0000000..08861a2
--- /dev/null
+++ b/libsim4/sim4core/sim4b1_s.C
@@ -0,0 +1,100 @@
+#include "sim4.H"
+#include "sim4b1_s.H"
+
+// Build the mask/shift tables for a (spaced) seed.
+//
+// 'seed' is a NUL-terminated string of 1 to 32 characters over {'0','x','1'}.
+// Each seed character expands to two mask bits:
+//     '0' -> "00"      'x' -> "01"      '1' -> "11"
+// The expanded string is read with its LAST character as bit 0.  Every
+// maximal run of '1' bits produces one table entry, lowest-order run first:
+//     masks[i]   selects the run where it sits in the word, and
+//     shifts[i]  is the right shift that packs the run directly above the
+//                runs below it (this is what mask_shift() applies).
+//
+// On return masknum is the number of runs, matchedLength the number of '1'
+// bits, and type is CONTINUOUS_SEED when every bit is '1', else SPACED_SEED.
+//
+// Prints a message and exits with status 1 if the seed is NULL, empty, longer
+// than 32 characters, or contains any other character.  (An empty seed used
+// to shift by a negative count and read before the start of the expanded
+// string; a longer seed shifted by 64 or more.  Both are undefined.)
+//
+mss_t::mss_t(char seed[32]) {
+  const int  maxSeedLength = 32;         // 2 bits per character must fit in a uint64
+  char       seed_mask[2 * 32 + 1];      // expanded seed, two characters per seed character
+
+  type          = 0;
+  mask          = 0;
+  masknum       = 0;
+  seedLength    = 0;
+  matchedLength = 0;
+
+  size_t  len = (seed != 0L) ? strlen(seed) : 0;
+
+  if ((len < 1) || (len > (size_t)maxSeedLength)) {
+    printf("The seed must contain between 1 and %d characters\n", maxSeedLength);
+    exit(1);
+  }
+
+  seedLength = (int)len;
+
+  // Expand each seed character into its two mask bits.
+  for (int i=0; i<seedLength; i++) {
+    switch (seed[i]) {
+      case '0':
+        seed_mask[2*i]   = '0';
+        seed_mask[2*i+1] = '0';
+        break;
+      case 'x':
+        seed_mask[2*i]   = '0';
+        seed_mask[2*i+1] = '1';
+        break;
+      case '1':
+        seed_mask[2*i]   = '1';
+        seed_mask[2*i+1] = '1';
+        break;
+      default:
+        printf("The seed can only contain 0, 1, or x\n");
+        exit(1);
+        break;
+    }
+  }
+
+  int  maskSeedLength = 2 * seedLength;
+
+  seed_mask[maskSeedLength] = '\0';
+
+  // seedLength >= 1, so the shift count is in [0, 62].
+  mask = (uint64ONE << (seedLength+seedLength-2)) - 1;
+
+#ifdef DEBUG
+  printf(uint64HEX, mask);
+  printf("\n");
+#endif
+
+  // Scan bit positions from 0 (last character) upwards, emitting one
+  // mask/shift pair per run of '1' bits.  'packed' counts the '1' bits
+  // already emitted, i.e. where the next run must land after shifting.
+  int  packed = 0;
+  int  pos    = 0;
+
+  while (pos < maskSeedLength) {
+    if (seed_mask[maskSeedLength - 1 - pos] != '1') {
+      pos++;
+      continue;
+    }
+
+    int  begin = pos;
+
+    while ((pos < maskSeedLength) && (seed_mask[maskSeedLength - 1 - pos] == '1'))
+      pos++;
+
+    int  width = pos - begin;
+
+    // A 32-character all-'1' seed is one 64-bit run; shifting by 64 is undefined.
+    uint64  runBits = (width >= 64) ? ~((uint64)0) : ((uint64ONE << width) - uint64ONE);
+
+    assert(masknum < 64);   // capacity of masks[] and shifts[]
+
+    masks[masknum]  = runBits << begin;
+    shifts[masknum] = begin - packed;
+
+    packed += width;
+    masknum++;
+  }
+
+  matchedLength = packed;
+
+  type = ((2*seedLength == matchedLength) ? CONTINUOUS_SEED : SPACED_SEED);
+}
+
+
+// Pack the seed-selected bits of 'ecode' together: each of the masknum runs
+// is picked out with masks[] and moved right by shifts[], and the pieces are
+// summed into the result.
+//
+uint64
+mss_t::mask_shift(uint64 ecode) {
+  uint64  packed = 0;
+  int     run    = 0;
+
+  while (run < masknum) {
+    uint64  piece = ecode & masks[run];
+
+    packed = packed + (piece >> shifts[run]);
+    run++;
+  }
+
+  return(packed);
+}
+
diff --git a/libsim4/sim4core/sim4b1_s.H b/libsim4/sim4core/sim4b1_s.H
new file mode 100644
index 0000000..6654e12
--- /dev/null
+++ b/libsim4/sim4core/sim4b1_s.H
@@ -0,0 +1,32 @@
+#ifndef SIM4B1_S_H
+#define SIM4B1_S_H
+
+#define CONTINUOUS_SEED 10
+#define SPACED_SEED 11
+/*
+ * One run of consecutive '1' bits in an expanded seed mask, as filled in by
+ * the mss_t constructor.  Positions count from 0 at the last character of
+ * the expanded mask string.
+ */
+struct position_t {
+ int begin; // lowest position of the run
+ int end; // highest position of the run
+ int width; // end - begin + 1
+ int result_shifts; // begin minus the summed widths of the lower-indexed runs
+};
+
+// Mask/shift description of a seed.
+//
+// The constructor taking a seed string fills in every member.  The default
+// constructor produces an empty description (masknum == 0), for which
+// mask_shift() returns 0; it previously left the scalar members
+// uninitialized, so mask_shift() on such an object read an indeterminate
+// loop bound.
+//
+class mss_t {
+public:
+  mss_t() : type(0), mask(0), masknum(0), seedLength(0), matchedLength(0) {};
+  mss_t(char seed[32]);
+  ~mss_t() {};
+
+  int     type;            // CONTINUOUS_SEED or SPACED_SEED once built from a seed; 0 otherwise
+  uint64  mask;
+  int     masknum;         // number of valid entries in masks[] and shifts[]
+  int     seedLength;      // number of characters in the seed string
+  int     matchedLength;   // number of '1' bits in the expanded seed mask
+  uint64  masks[64];       // Fails assert in sim4b1_s.C if exceeded
+  int     shifts[64];      // right shift applied to the bits selected by masks[i]
+
+  uint64  mask_shift(uint64 ecode);
+};
+
+#endif
+
diff --git a/libsim4/sim4core/sim4b1a.C b/libsim4/sim4core/sim4b1a.C
new file mode 100644
index 0000000..e7ad7e5
--- /dev/null
+++ b/libsim4/sim4core/sim4b1a.C
@@ -0,0 +1,102 @@
+#include "sim4.H"
+
+/*
+ * Try to improve the outer boundaries of the first and last real exons in
+ * the list that follows Lblock.
+ *
+ * For each end, if the exon does not already sit on the expected splice
+ * signal or does not reach the end of the EST, and a candidate signal
+ * position was recorded (last_AG/last_AC for the start, last_GT/last_CT for
+ * the end) within 20 EST bases of that end, a short pattern is assembled in
+ * tmp[] and searched for with bmatch()/fmatch().  When a match is returned,
+ * the exon is trimmed to the signal position and the match is linked in as a
+ * new exon.
+ */
+void
+Sim4::adjustBoundariesOfMarginalExons(Exon *Lblock) {
+ coords *sig; // the recorded signal position to use, chosen by G_score versus abs(C_score)
+ char tmp[50]; // pattern buffer: END_SIG + EST bases + START_SIG
+ Exon *newthing;
+ Exon *tmp_block = Lblock->next_exon;
+
+ /* condition for non-signal */
+ // NOTE(review): _genSeq+frGEN-3 points before _genSeq when frGEN < 3 -- confirm frGEN is always >= 3 here.
+ if (tmp_block && tmp_block->toGEN &&
+ (strncmp((char *)(_genSeq+tmp_block->frGEN-3), END_SIG, (size_t)2) ||
+ (tmp_block->frEST!=1))) {
+ sig = (G_score>=abs(C_score)) ? &last_AG : &last_AC;
+ if (sig->pos1 && (sig->pos2<=20)) {
+ /* generated in extend_bw */
+ assert(sig->pos2 > 1);
+ (void)strcpy((char *)tmp,END_SIG); // tmp = END_SIG (2 characters)
+ (void)strncpy((char *)(tmp+2),(char *)_estSeq,(size_t)sig->pos2-1); // followed by the first pos2-1 EST bases
+ (void)strcpy((char *)(tmp+sig->pos2+1), START_SIG); // followed by START_SIG, which also terminates the string
+ newthing = bmatch(_genSeq,tmp,tmp_block->frGEN-3,sig->pos2+3,1,1);
+ if (newthing) { // trim the first exon to the signal and link the match in front of it
+ Lblock->next_exon->frGEN = sig->pos1;
+ Lblock->next_exon->frEST = sig->pos2;
+ Lblock->next_exon->length -= sig->pos2-1;
+ newthing->next_exon = Lblock->next_exon;
+ newthing->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
+ Lblock->next_exon = newthing;
+ }
+ }
+ }
+
+ while (tmp_block && tmp_block->next_exon && tmp_block->next_exon->toGEN) // advance to the last exon whose successor has toGEN == 0
+ tmp_block = tmp_block->next_exon;
+ if (tmp_block && tmp_block->toGEN &&
+ (strncmp((char *)(_genSeq+tmp_block->toGEN),START_SIG,(size_t)2) || (tmp_block->toEST!=_estLen))) {
+ sig = (G_score>=abs(C_score)) ? &last_GT : &last_CT;
+ if (sig->pos1 && (_estLen-sig->pos2<=20)) {
+ assert(_estLen-sig->pos2 >= 0);
+ (void)strcpy((char *)tmp,END_SIG); // tmp = END_SIG (2 characters)
+ (void)strncpy((char *)(tmp+2),(char *)(_estSeq+sig->pos2), // followed by the EST bases after pos2
+ (size_t)_estLen-sig->pos2);
+ (void)strcpy((char *)(tmp+_estLen-sig->pos2+2),START_SIG); // followed by START_SIG, which also terminates the string
+ newthing = fmatch(_genSeq+sig->pos1-1,tmp,
+ _genLen-sig->pos1+1,_estLen-sig->pos2+4,
+ sig->pos1-1,sig->pos2+1);
+ if (newthing) { // trim the last exon to the signal and link the match after it
+ tmp_block->toGEN = sig->pos1;
+ tmp_block->toEST = sig->pos2;
+ newthing->next_exon = tmp_block->next_exon;
+ tmp_block->next_exon = newthing;
+ tmp_block->ori = (G_score>=abs(C_score)) ? 'G' : 'C';
+ }
+ }
+ }
+}
+
+
+
+
+// Scan the genomic sequence forwards over tmp_block1, starting at
+// frGEN-1, and record in last_AG the first position v at which the two bases
+// at _genSeq+v-2 are "AG"; do the same for "AC" into last_AC.  A member is
+// left untouched when its dinucleotide is not found.
+//
+void
+Sim4::findLastAGandAC(Exon *tmp_block1) {
+  int v;
+
+  v = tmp_block1->frGEN-1;
+  while (v<=tmp_block1->toGEN-3) {
+    if (strncmp((char *)(_genSeq+v-2),"AG",(size_t)2) == 0) {
+      last_AG.pos1 = v+1;
+      last_AG.pos2 = tmp_block1->frEST + (v-tmp_block1->frGEN)+1;
+      break;
+    }
+    v++;
+  }
+
+  v = tmp_block1->frGEN-1;
+  while (v<=tmp_block1->toGEN-3) {
+    if (strncmp((char *)(_genSeq+v-2),"AC",(size_t)2) == 0) {
+      last_AC.pos1 = v+1;
+      last_AC.pos2 = tmp_block1->frEST + (v-tmp_block1->frGEN)+1;
+      break;
+    }
+    v++;
+  }
+}
+
+
+
+
+// Scan the genomic sequence backwards over tmp_block, from toGEN down to
+// frGEN, and record in last_GT the first position v at which the two bases
+// at _genSeq+v are "GT"; do the same for "CT" into last_CT.  A member is
+// left untouched when its dinucleotide is not found.
+//
+void
+Sim4::findLastGTandCT(Exon *tmp_block) {
+  int v;
+
+  v = tmp_block->toGEN;
+  while (v>=tmp_block->frGEN) {
+    if (strncmp((char *)(_genSeq+v),"GT",(size_t)2) == 0) {
+      last_GT.pos1 = v;
+      last_GT.pos2 = tmp_block->toEST-(tmp_block->toGEN-v);
+      break;
+    }
+    v--;
+  }
+
+  v = tmp_block->toGEN;
+  while (v>=tmp_block->frGEN) {
+    if (strncmp((char *)(_genSeq+v),"CT",(size_t)2) == 0) {
+      last_CT.pos1 = v;
+      last_CT.pos2 = tmp_block->toEST-(tmp_block->toGEN-v);
+      break;
+    }
+    v--;
+  }
+}
+
diff --git a/libsim4/sim4core/sim4command.C b/libsim4/sim4core/sim4command.C
new file mode 100644
index 0000000..6b2bb2a
--- /dev/null
+++ b/libsim4/sim4core/sim4command.C
@@ -0,0 +1,282 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sim4command.H"
+
+#include <algorithm>
+
+using namespace std;
+
+
+// Run a single EST against a genomic range
+//
+// XXX: We should pull out the EST and GEN from the seqCache,
+// and store them as the "two char*" method.
+//
+// Cache-backed form: nothing is loaded here; the sequences are fetched from
+// ESTs/GENs on first use.  Members are initialized in declaration order.
+sim4command::sim4command(uint32 ESTid,
+                         seqCache *ESTs,
+                         uint32 GENid,
+                         uint32 GENlo,
+                         uint32 GENhi,
+                         seqCache *GENs,
+                         bool doFor,
+                         bool doRev) :
+  _estIdx(ESTid),
+  _ESTs(ESTs),
+  _ESTloaded(0L),
+  _ESTsequence(0L),
+  _ESTsequenceLength(0),
+  _genIdx(GENid),
+  _genLo(GENlo),
+  _genHi(GENhi),
+  _GENs(GENs),
+  _GENloaded(0L),
+  _GENsequence(0L),
+  _GENsequenceLength(0),
+  _doForward(doFor),
+  _doReverse(doRev),
+  _externalSeedsLen(0),
+  _externalSeedsMax(0),
+  _externalSeeds(0L) {
+}
+
+
+// Pre-loaded form: the caller supplies both sequences already in core, and
+// the indices are taken from their IIDs.  No cache is kept (_ESTs and _GENs
+// are null).  Members are initialized in declaration order.
+sim4command::sim4command(seqInCore *EST,
+                         seqInCore *GEN,
+                         uint32 GENlo,
+                         uint32 GENhi,
+                         bool doFor,
+                         bool doRev) :
+  _estIdx(EST->getIID()),
+  _ESTs(0L),
+  _ESTloaded(EST),
+  _ESTsequence(0L),
+  _ESTsequenceLength(0),
+  _genIdx(GEN->getIID()),
+  _genLo(GENlo),
+  _genHi(GENhi),
+  _GENs(0L),
+  _GENloaded(GEN),
+  _GENsequence(0L),
+  _GENsequenceLength(0),
+  _doForward(doFor),
+  _doReverse(doRev),
+  _externalSeedsLen(0),
+  _externalSeedsMax(0),
+  _externalSeeds(0L) {
+}
+
+
+// Use two char*'s for sequence sources
+//
+// Raw-string form: both indices are zero and neither a cache nor a loaded
+// sequence object is kept; only the given pointers and lengths are stored.
+// Members are initialized in declaration order.
+sim4command::sim4command(char *EST,
+                         uint32 ESTlen,
+                         char *GEN,
+                         uint32 GENlen,
+                         uint32 GENlo,
+                         uint32 GENhi,
+                         bool doFor,
+                         bool doRev) :
+  _estIdx(0),
+  _ESTs(0L),
+  _ESTloaded(0L),
+  _ESTsequence(EST),
+  _ESTsequenceLength(ESTlen),
+  _genIdx(0),
+  _genLo(GENlo),
+  _genHi(GENhi),
+  _GENs(0L),
+  _GENloaded(0L),
+  _GENsequence(GEN),
+  _GENsequenceLength(GENlen),
+  _doForward(doFor),
+  _doReverse(doRev),
+  _externalSeedsLen(0),
+  _externalSeedsMax(0),
+  _externalSeeds(0L) {
+}
+
+
+// A loaded sequence is deleted only when the matching cache pointer is set;
+// otherwise it is left alone.  The seed array is always released.
+sim4command::~sim4command() {
+  if (_ESTs != 0L) {
+    delete _ESTloaded;
+  }
+
+  if (_GENs != 0L) {
+    delete _GENloaded;
+  }
+
+  delete [] _externalSeeds;
+}
+
+
+// Make absolutely sure that the genomic sequence start and end
+// positions are within the actual sequence. Ideally, this should
+// be checked by whatever generates the input, but it probably
+// isn't.
+//
+// If the end position is too big, make it the same as the sequence
+// length.
+//
+// If the start position is bigger than the (corrected) end
+// position, make it 100K less than the end position.
+//
+// This has the side-effect of loading the genomic sequence.
+//
+void
+sim4command::finalize(void) {
+
+  // Asking for the length loads the genomic sequence when it is not in core.
+  uint32  genLen = getGENlength();
+
+  // Clamp the end of the range to the sequence.
+  if (_genHi > genLen) {
+    _genHi = genLen;
+  }
+
+  // A start beyond the end is pulled back to 100000 before the end,
+  // or to zero when the end is not that far into the sequence.
+  if (_genLo > _genHi) {
+    _genLo = (_genHi > 100000) ? (_genHi - 100000) : 0;
+  }
+}
+
+
+
+// get() routines have multiple cases
+//
+// if no fastaBase, they can quickly return
+// otherwise
+// if nothing loaded or the thing loaded isn't right:
+// delete the current
+// load the correct
+//
+
+// Make sure _ESTloaded holds the sequence numbered _estIdx, fetching it from
+// the _ESTs cache (and discarding whatever was loaded) when it does not.
+void
+sim4command::loadEST(void) {
+  bool  stale = ((_ESTloaded == 0L) ||
+                 (_ESTloaded->getIID() != _estIdx));
+
+  if (stale == false)
+    return;
+
+  delete _ESTloaded;
+  _ESTloaded = _ESTs->getSequenceInCore(_estIdx);
+}
+
+
+// The EST index; reported as zero when the EST was given as a raw string.
+uint32
+sim4command::getESTidx(void) {
+  return((_ESTsequence) ? 0 : _estIdx);
+}
+
+// Header of the EST; a fixed placeholder when the EST was given as a raw
+// string, otherwise the header of the (loaded on demand) sequence.
+char*
+sim4command::getESTheader(void) {
+  static char *anonymousName = "anonymous cDNA sequence";
+
+  if (_ESTsequence == 0L) {
+    loadEST();
+    return(_ESTloaded->header());
+  }
+
+  return(anonymousName);
+}
+
+// Bases of the EST: the raw string when one was supplied, otherwise the
+// (loaded on demand) sequence.
+char*
+sim4command::getESTsequence(void) {
+  if (_ESTsequence == 0L) {
+    loadEST();
+    return(_ESTloaded->sequence());
+  }
+
+  return(_ESTsequence);
+}
+
+// Length of the EST: the supplied length when a raw string was given,
+// otherwise the length of the (loaded on demand) sequence.
+uint32
+sim4command::getESTlength(void) {
+  if (_ESTsequence == 0L) {
+    loadEST();
+    return(_ESTloaded->sequenceLength());
+  }
+
+  return(_ESTsequenceLength);
+}
+
+
+
+
+
+// Make sure _GENloaded holds the sequence numbered _genIdx, fetching it from
+// the _GENs cache (and discarding whatever was loaded) when it does not.
+void
+sim4command::loadGEN(void) {
+  bool  stale = ((_GENloaded == 0L) ||
+                 (_GENloaded->getIID() != _genIdx));
+
+  if (stale == false)
+    return;
+
+  delete _GENloaded;
+  _GENloaded = _GENs->getSequenceInCore(_genIdx);
+}
+
+// Header of the genomic sequence; a fixed placeholder when it was given as a
+// raw string, otherwise the header of the (loaded on demand) sequence.
+char*
+sim4command::getGENheader(void) {
+  char *anonymousName = "anonymous genomic sequence";
+
+  if (_GENsequence == 0L) {
+    loadGEN();
+    return(_GENloaded->header());
+  }
+
+  return(anonymousName);
+}
+
+// Bases of the genomic sequence: the raw string when one was supplied,
+// otherwise the (loaded on demand) sequence.
+char*
+sim4command::getGENsequence(void) {
+  if (_GENsequence == 0L) {
+    loadGEN();
+    return(_GENloaded->sequence());
+  }
+
+  return(_GENsequence);
+}
+
+// Length of the genomic sequence: the supplied length when a raw string was
+// given, otherwise the length of the (loaded on demand) sequence.
+uint32
+sim4command::getGENlength(void) {
+  if (_GENsequence == 0L) {
+    loadGEN();
+    return(_GENloaded->sequenceLength());
+  }
+
+  return(_GENsequenceLength);
+}
+
+
+
+////////////////////////////////////////
+//
+// This expects base-based seeds.
+// This expects that the position of the seed is the base in the seed.
+// This expects that GENpos is relative to the genomic subsequence.
+//
+// If reverse-complement match, the EST is reversed, the GEN is forward.
+//
+// Append one externally computed seed, growing the seed array as needed.
+//
+//   GENpos  position of the seed in the genomic sequence
+//   ESTpos  position of the seed in the EST
+//   length  length of the seed
+//
+// The array doubles when full; the first allocation holds 512 entries, as
+// before.  Two defects of the earlier version are fixed:
+//   - memcpy() was called with a null source pointer on the first growth,
+//     which is undefined even for a zero byte count;
+//   - the capacity was raised before the allocation, so an allocation that
+//     threw left _externalSeedsMax larger than the real array.
+//
+void
+sim4command::addSeed(uint32 GENpos, uint32 ESTpos, uint32 length) {
+
+  if (_externalSeedsLen >= _externalSeedsMax) {
+    uint32        newMax   = 2 * ((_externalSeedsMax == 0) ? 256 : _externalSeedsMax);
+    externalSeed *newSeeds = new externalSeed [newMax];
+
+    if (_externalSeedsLen > 0)
+      memcpy(newSeeds, _externalSeeds, sizeof(externalSeed) * _externalSeedsLen);
+
+    delete [] _externalSeeds;
+
+    // Commit only once the new array exists and holds the old contents.
+    _externalSeeds    = newSeeds;
+    _externalSeedsMax = newMax;
+  }
+
+  _externalSeeds[_externalSeedsLen]._GENposition = GENpos;
+  _externalSeeds[_externalSeedsLen]._ESTposition = ESTpos;
+  _externalSeeds[_externalSeedsLen]._length      = length;
+
+  // fprintf(stderr, "sim4command::addSeed()-- GEN="uint32FMT" EST="uint32FMT" of length "uint32FMT"\n", GENpos, ESTpos, length);
+
+  _externalSeedsLen++;
+}
+
+
+
+// Order the stored seeds with externalSeed::operator<.
+void
+sim4command::sortExternalSeeds(void) {
+  externalSeed *first = _externalSeeds;
+  externalSeed *last  = _externalSeeds + _externalSeedsLen;
+
+  sort(first, last);
+}
diff --git a/libsim4/sim4core/sim4command.H b/libsim4/sim4core/sim4command.H
new file mode 100644
index 0000000..acac305
--- /dev/null
+++ b/libsim4/sim4core/sim4command.H
@@ -0,0 +1,148 @@
+#ifndef SIM4COMMAND_H
+#define SIM4COMMAND_H
+
+#include "bio++.H"
+
+#include "seqCache.H"
+
+//
+// Contains the variable stuff for an execution of sim4
+// access to sequences (via seqCache)
+// genomic iid
+// genomic range
+// est iid (maybe more than one)
+// forward only
+// reverse only
+//
+/*
+ * One sim4 job: which EST to align against which range of which genomic
+ * sequence, in which orientations, plus optional externally supplied seeds.
+ *
+ * Sequences come from one of three sources, selected by the constructor:
+ * a seqCache (loaded on first use), caller-supplied seqInCore objects, or
+ * caller-supplied raw character strings.
+ *
+ * NOTE(review): the destructor frees _externalSeeds, but no copy constructor
+ * or assignment operator is declared -- copying an instance presumably leads
+ * to a double delete; confirm that instances are never copied.
+ */
+class sim4command {
+public:
+
+ // Run a single EST against a genomic range
+ //
+ sim4command(uint32 ESTid,
+ seqCache *ESTs,
+ uint32 GENid,
+ uint32 GENlo,
+ uint32 GENhi,
+ seqCache *GENs,
+ bool doForward,
+ bool doReverse);
+
+ // Single EST against a genomic range, using alternative
+ // interface.
+ //
+ sim4command(seqInCore *EST,
+ seqInCore *GEN,
+ uint32 GENlo,
+ uint32 GENhi,
+ bool doForward,
+ bool doReverse);
+
+ // Use two char*'s for sequence sources -- both sequence deflines
+ // and iid's are undefined!
+ //
+ sim4command(char *EST,
+ uint32 ESTlen,
+ char *GEN,
+ uint32 GENlen,
+ uint32 GENlo,
+ uint32 GENhi,
+ bool doForward,
+ bool doReverse);
+
+ ~sim4command();
+
+ // These methods allow the initial seed detection to be
+ // done outside Sim4::run(). If used:
+ // each seed is extended as before.
+ // for iterative alignments, seeds are masked out
+ //
+
+ // addSeed() takes coordinates relative to the start of the GEN
+ // sequence supplied to the constructor.
+ //
+ void addSeed(uint32 GENpos, uint32 ESTpos, uint32 length);
+
+ void sortExternalSeeds(void); // sorts by genomic position (see externalSeed::operator<)
+ bool externalSeedsExist(void) { return(_externalSeedsLen > 0); };
+ uint32 numberOfExternalSeeds(void) { return(_externalSeedsLen); };
+ uint32 externalSeedESTPosition(uint32 i) { return(_externalSeeds[i]._ESTposition); }; // no bounds check on i
+ uint32 externalSeedGENPosition(uint32 i) { return(_externalSeeds[i]._GENposition); }; // no bounds check on i
+ uint32 externalSeedLength(uint32 i) { return(_externalSeeds[i]._length); }; // no bounds check on i
+ void maskExternalSeed(uint32 i) { _externalSeeds[i]._length = 0; }; // masking sets the seed's length to zero
+
+
+ // Load the sequences, make some checks. This isn't done in the
+ // constructor so that it is possible to make a big list of
+ // commands, then give them to a processor. If we loaded all the
+ // genomics at creation....
+ //
+ void finalize(void);
+
+ void setForward(bool x) { _doForward = x; };
+ void setReverse(bool x) { _doReverse = x; };
+
+ bool doForward(void) { return(_doForward); };
+ bool doReverse(void) { return(_doReverse); };
+
+ void setGenomic(uint32 idx, uint32 lo, uint32 hi) { // changes the target only; nothing is loaded here
+ _genIdx = idx;
+ _genLo = lo;
+ _genHi = hi;
+ };
+
+ uint32 getESTidx();
+ char *getESTheader();
+ char *getESTsequence();
+ uint32 getESTlength();
+
+ uint32 getGENidx(void) { return(_genIdx); };
+ uint32 getGENlo(void) { return(_genLo); };
+ uint32 getGENhi(void) { return(_genHi); };
+ char *getGENheader(void);
+ char *getGENsequence(void);
+ uint32 getGENlength(void);
+private:
+ void loadEST(void); // ensure _ESTloaded matches _estIdx, fetching from _ESTs if not
+ void loadGEN(void); // ensure _GENloaded matches _genIdx, fetching from _GENs if not
+
+ uint32 _estIdx;
+
+ seqCache *_ESTs; // cache to load from; when set, the destructor deletes _ESTloaded
+ seqInCore *_ESTloaded; // EST currently in core, or 0L
+ char *_ESTsequence; // raw EST string; when set it takes precedence in the getEST*() methods
+ uint32 _ESTsequenceLength; // valid only for _ESTsequence
+
+ uint32 _genIdx;
+ uint32 _genLo;
+ uint32 _genHi;
+
+ seqCache *_GENs; // cache to load from; when set, the destructor deletes _GENloaded
+ seqInCore *_GENloaded; // genomic sequence currently in core, or 0L
+ char *_GENsequence; // raw genomic string; when set it takes precedence in the getGEN*() methods
+ uint32 _GENsequenceLength; // valid only for _GENsequence
+
+ bool _doForward;
+ bool _doReverse;
+
+ // For external seeding
+ //
+ class externalSeed {
+ public:
+ uint32 _GENposition;
+ uint32 _ESTposition;
+ uint32 _length;
+
+ bool operator<(const externalSeed &that) const { // orders seeds by genomic position only
+ return(_GENposition < that._GENposition);
+ };
+ };
+
+ uint32 _externalSeedsLen; // seeds stored
+ uint32 _externalSeedsMax; // seeds allocated
+ externalSeed *_externalSeeds; // owned array, released in the destructor
+};
+
+
+#endif // SIM4COMMAND_H
diff --git a/libsim4/sim4core/sim4defines.H b/libsim4/sim4core/sim4defines.H
new file mode 100644
index 0000000..eef3b32
--- /dev/null
+++ b/libsim4/sim4core/sim4defines.H
@@ -0,0 +1,50 @@
+// Don't define this (unless your name starts with L or B).
+//
+// Changes to here, sim4db.H and exon_cores.H
+//
+//#define INTERSPECIES
+
+#define DIST_CUTOFF 3
+
+#define MIN_INTRON 30
+#define SHORT_INTRON 50
+#define LONG_INTRON 20000
+#define SHORT_EXON 40
+#define MAX_GRINIT 500
+#define MAX_SLIDE 15
+#define MAX_INTERNAL_GAP 50
+
+#define DEFAULT_DRANGE 10
+#define DEFAULT_WEIGHT 100
+#define DEFAULT_RELINK_WEIGHT 500
+
+#define DEFAULT_K 16
+#define DEFAULT_C 12
+
+//  Smaller / larger of two values.  Arguments and the whole expansion are
+//  parenthesised so that operands containing low-precedence operators
+//  (e.g. MIN(a & b, c)) are compared as written.  Each argument is still
+//  evaluated twice, so do not pass expressions with side effects.
+//
+#ifndef MIN
+#define MIN(x,y)        (((x) > (y)) ? (y) : (x))
+#endif
+
+#ifndef MAX
+#define MAX(x,y)        (((x) < (y)) ? (y) : (x))
+#endif
+
+#define START_SIG ((G_score >= abs(C_score)) ? "GT" : "CT")
+#define END_SIG ((G_score >= abs(C_score)) ? "AG" : "AC")
+
+#define DELETE 1
+#define INSERT 2
+#define SUBSTITUTE 3
+#define INTRON 4
+#define O_INTRON 5
+
+#define SPLICE_ORIGINAL 0
+#define SPLICE_GENESPLICER 1
+#define SPLICE_GLIMMER 2
+#define DEFAULT_SPLICE_MODEL SPLICE_ORIGINAL
+
+#define DEFAULT_SPACED_SEED "1xx1011011011xx11"
+#define DEFAULT_SPACED_SEED_INT "10011010100011"
+#define DEFAULT_SPACED_SEED_EXT "1101100011010111"
+#define SEED_SPAN 40 /* 22 */
+
diff --git a/libsim4/sim4core/sim4parameters.C b/libsim4/sim4core/sim4parameters.C
new file mode 100644
index 0000000..ad2cebc
--- /dev/null
+++ b/libsim4/sim4core/sim4parameters.C
@@ -0,0 +1,56 @@
+#include <pthread.h>
+#include "sim4parameters.H"
+#include "sim4defines.H"
+#include "../sim4polish/sim4polish.H"
+
+//  Build a parameter set holding the same-species defaults.
+//
+//  Every scalar member is given a value here, including _imatchdiff and
+//  _vmatchdiff, so no member is ever read while indeterminate.
+//  _splice_mutex is NOT initialised here; setSpliceMutex() does that.
+//
+sim4parameters::sim4parameters() {
+  _findAllExons               = false;
+  _minCoverage                = 0.0;
+  _minCoverageLength          = 0;
+  _minPercentExonIdentity     = 0;
+
+  _includeDefLine             = true;
+  _printAlignments            = false;
+
+  _alwaysReport               = 0;
+
+  _ignorePolyTails            = true;
+  _polyTailPercent            = 0.60;
+
+  _mspThresh1                 = 0;
+  _mspThresh2                 = 0;
+
+  _mspLimitAbsolute           = 0;
+  _mspLimitPercent            = 0.0;
+
+  _relinkWeight               = DEFAULT_RELINK_WEIGHT;
+
+  _wordSize                   = 12;
+  _wordSizeInt                = 8;
+  _wordSizeExt                = 10;
+
+  _dontForceCanonicalSplicing = false;
+  _forceStrandPrediction      = false;
+
+  _slideIntrons               = true;
+
+  //  Default seeds are contiguous: 12, 8 and 10 ones, the same numbers as
+  //  the three word sizes set above.
+  strcpy(_spacedSeed,    "111111111111");
+  strcpy(_spacedSeedInt, "11111111");
+  strcpy(_spacedSeedExt, "1111111111");
+
+  _isSetSpacedSeed            = false;
+
+  _spliceModel                = DEFAULT_SPLICE_MODEL;
+  _isSetSpliceModel           = false;
+
+  _interspecies               = false;
+  _style                      = sim4polishStyleDefault;
+  _percentError               = 0.20;
+  _match                      = 1;
+  _imismatch                  = -5;
+  _vmismatch                  = -5;
+
+  //  No setter in the class assigns these two.  Zero them so they start
+  //  from a defined value.
+  //  NOTE(review): presumably class Sim4 (a friend) computes the real
+  //  values before use -- confirm.
+  _imatchdiff                 = 0;
+  _vmatchdiff                 = 0;
+}
+
+//  Release the splice mutex.
+//
+//  NOTE(review): the constructor in this file does not initialise
+//  _splice_mutex; only setSpliceMutex() calls pthread_mutex_init().  If an
+//  owner never calls setSpliceMutex(), this destroys a mutex that was
+//  never initialised -- confirm that every owner calls it.
+//
+sim4parameters::~sim4parameters() {
+ pthread_mutex_destroy(&_splice_mutex);
+}
diff --git a/libsim4/sim4core/sim4parameters.H b/libsim4/sim4core/sim4parameters.H
new file mode 100644
index 0000000..be1166b
--- /dev/null
+++ b/libsim4/sim4core/sim4parameters.H
@@ -0,0 +1,208 @@
+#ifndef SIM4_PARAMETERS_H
+#define SIM4_PARAMETERS_H
+
+#include "mspManager.H"
+#include "sim4defines.H"
+#include "../sim4polish/sim4polish.H"
+
+#define SPACED_SEED_MAX_LEN 64
+
+//  Tunable options for a sim4 run.
+//
+//  The object is a plain bag of settings: each setter stores its argument
+//  in the matching private member, and class Sim4 (a friend) reads the
+//  members directly.  Defaults are established by the constructor.
+//
+class sim4parameters {
+public:
+  sim4parameters();
+  ~sim4parameters();
+
+  void  setFindAllExons(bool x=true) {
+    _findAllExons = x;
+  };
+  void  setMinCoverage(double x) {
+    _minCoverage = x;
+  };
+  void  setMinCoverageLength(int l) {
+    _minCoverageLength = l;
+  };
+  void  setMinPercentExonIdentity(int l) {
+    _minPercentExonIdentity = l;
+  };
+  void  setIncludeDefLine(bool x=true) {
+    _includeDefLine = x;
+  };
+  void  setPrintAlignments(bool x=true) {
+    _printAlignments = x;
+  };
+
+  void  setAlwaysReport(int n) {
+    _alwaysReport = n;
+  };
+
+  void  setIgnorePolyTails(bool x) {
+    _ignorePolyTails = x;
+  };
+  void  setPolyTailPercent(double x) {
+    _polyTailPercent = x;
+  };
+
+  void  setMSPThreshold1(int t) {
+    _mspThresh1 = t;
+  };
+  void  setMSPThreshold2(int t) {
+    _mspThresh2 = t;
+  };
+
+  void  setMSPLimitAbsolute(uint32 t) {
+    _mspLimitAbsolute = t;
+  };
+  void  setMSPLimitPercent(double p) {
+    _mspLimitPercent = p;
+  };
+
+  void  setRelinkWeight(int x) {
+    _relinkWeight = x;
+  };
+
+  void  setWordSize(int w) {
+    _wordSize = w;
+  };
+  void  setWordSizeInt(int w) {
+    _wordSizeInt = w;
+  };
+  void  setWordSizeExt(int w) {
+    _wordSizeExt = w;
+  };
+
+  //  Install a user-chosen primary spaced seed.  The seed, including its
+  //  terminating NUL, must fit in SPACED_SEED_MAX_LEN bytes; the check is
+  //  an assert(), so it is not performed when asserts are compiled out.
+  //
+  void  setSpacedSeed(char *z) {
+    assert(strlen(z) < SPACED_SEED_MAX_LEN);
+    strcpy(_spacedSeed, z);
+    _isSetSpacedSeed = true;
+  };
+
+  void  setSpliceModel(int j) {
+    _spliceModel = j;
+    _isSetSpliceModel = true;
+  };
+
+  void  setDontForceCanonicalSplicing(bool x=true) {
+    _dontForceCanonicalSplicing = x;
+  };
+
+  void  setForceStrandPrediction(bool x=true) {
+    _forceStrandPrediction = x;
+  };
+
+  void  setSlideIntrons(bool x=true) {
+    _slideIntrons = x;
+  };
+
+  //  Switch between the interspecies and the same-species scoring sets.
+  //
+  //  Turning interspecies ON also picks the GeneSplicer splice model and
+  //  the default spaced seeds, except that a splice model or primary seed
+  //  chosen explicitly beforehand is kept.  Turning it OFF restores only
+  //  the four scoring values; seeds and splice model are left as they are.
+  //
+  void  setInterspecies(bool x=true) {
+    _interspecies = x;
+
+    if (_interspecies) {
+      _percentError = 0.45;
+      _match        = 1;
+      _imismatch    = -1;
+      _vmismatch    = -3;
+
+      if (_isSetSpliceModel == false) {
+        _spliceModel = SPLICE_GENESPLICER;
+      }
+
+      //  Only the primary seed is guarded by _isSetSpacedSeed; the Int
+      //  and Ext seeds are always replaced.
+      if (_isSetSpacedSeed == false) {
+        strcpy(_spacedSeed, DEFAULT_SPACED_SEED);
+      }
+      strcpy(_spacedSeedInt, DEFAULT_SPACED_SEED_INT);
+      strcpy(_spacedSeedExt, DEFAULT_SPACED_SEED_EXT);
+
+    } else {
+      _percentError = 0.20;
+      _match        = 1;
+      _imismatch    = -5;
+      _vmismatch    = -5;
+    }
+  }
+
+  //  Select the output style from one of the S4P_POLISH_* codes.  An
+  //  unrecognized code is reported on stderr and the style falls back to
+  //  sim4polishStyleDefault, as the message promises.
+  //
+  void  setOutputFormat(int styleCode) {
+    switch (styleCode) {
+      case S4P_POLISH_S4DB:  _style = sim4polishS4DB;  break;
+      case S4P_POLISH_GFF3:  _style = sim4polishGFF3;  break;
+      case S4P_POLISH_ATAC:  _style = sim4polishS4DB;  break;  // Not yet implemented
+      default:
+        fprintf(stderr, "sim4parameters::setOutputFormat() error: unrecognized output format; re-setting to default.\n");
+        _style = sim4polishStyleDefault;
+        break;
+    }
+  }
+
+  //  Initialise _splice_mutex with default attributes; returns the
+  //  pthread_mutex_init() result.
+  //
+  int   setSpliceMutex(void) {
+    return pthread_mutex_init(&_splice_mutex,NULL);
+  }
+
+  sim4polishStyle  getOutputFormat(void) {
+    return _style;
+  }
+
+private:
+  double           _minCoverage;
+  int              _minCoverageLength;
+  int              _minPercentExonIdentity;
+
+  int              _alwaysReport;
+
+  bool             _findAllExons;
+  bool             _includeDefLine;
+  bool             _printAlignments;
+
+  bool             _dontForceCanonicalSplicing;
+  bool             _forceStrandPrediction;
+
+  bool             _ignorePolyTails;
+  double           _polyTailPercent;
+
+  int              _mspThresh1;
+  int              _mspThresh2;
+
+  //  For aborting expensive polishes
+  //
+  double           _mspLimitPercent;   // Same as below, as percentage of length of cDNA
+  uint32           _mspLimitAbsolute;  // Number of MSPs allowed per hit
+
+  int              _relinkWeight;
+
+  int              _wordSize;
+  int              _wordSizeInt;
+  int              _wordSizeExt;
+
+  char             _spacedSeed[SPACED_SEED_MAX_LEN];
+  char             _spacedSeedInt[SPACED_SEED_MAX_LEN];
+  char             _spacedSeedExt[SPACED_SEED_MAX_LEN];
+  bool             _isSetSpacedSeed;    // true once setSpacedSeed() was called
+
+  int              _spliceModel;
+  bool             _isSetSpliceModel;   // true once setSpliceModel() was called
+
+  pthread_mutex_t  _splice_mutex;       // initialised only by setSpliceMutex()
+
+  bool             _slideIntrons;
+
+  //  Interspecies comparison options.
+  //    _percentError is the former #defined P
+  //    _match        is the former MATCH
+  //    _imismatch and _vmismatch replace the former MISMATCH
+  //
+  bool             _interspecies;
+
+  sim4polishStyle  _style;
+
+  double           _percentError;
+  int              _match;
+  int              _imismatch;
+  int              _vmismatch;
+  int              _imatchdiff;
+  int              _vmatchdiff;
+
+  friend class Sim4;
+};
+
+
+#endif // SIM4_PARAMETERS_H
diff --git a/libsim4/sim4core/sim4string.C b/libsim4/sim4core/sim4string.C
new file mode 100644
index 0000000..cfee04b
--- /dev/null
+++ b/libsim4/sim4core/sim4string.C
@@ -0,0 +1,887 @@
+#include "sim4.H"
+#include "sim4polishBuilder.H"
+
+//#define SHOW_OVERLAPPING_EXONS
+
+
+//  Shift the EST coordinates (frEST, toEST) of every exon in the list by
+//  'offset'.  List nodes whose toGEN is zero are left untouched, and a
+//  zero offset or an empty list is a no-op.
+//
+static void
+add_offset_exons(Exon *exons, int offset) {
+  if (offset == 0)
+    return;
+
+  Exon *cur = exons;
+
+  while (cur) {
+    if (cur->toGEN != 0) {
+      cur->frEST += offset;
+      cur->toEST += offset;
+    }
+    cur = cur->next_exon;
+  }
+}
+
+
+//  Disabled helper, kept for reference: it would add 'offset' to offset2
+//  of every edit script in the list.  Sim4::run() performs the same
+//  update with an inline loop instead of calling this.
+//
+#if 0
+static void
+add_offset_aligns(edit_script_list *aligns, int offset) {
+ if (!offset || !aligns)
+ return;
+
+ for (; aligns; aligns = aligns->next_script)
+ aligns->offset2 += offset;
+}
+#endif
+
+
+//  Mask (via cmd->maskExternalSeed()) every external seed whose genomic
+//  position p satisfies, for some exon with a non-zero toGEN,
+//
+//      frGEN <= p + 1   and   p <= toGEN + seedLength
+//
+//  All comparisons are done in uint32.
+//
+void
+Sim4::maskExonsFromSeeds(sim4command *cmd,
+                         Exon        *theExon) {
+
+  for (Exon *exon = theExon; exon; exon = exon->next_exon) {
+    if (!exon->toGEN)
+      continue;
+
+    for (uint32 s=0; s<cmd->numberOfExternalSeeds(); s++) {
+      uint32 seedPos = cmd->externalSeedGENPosition(s);
+
+      //  Seed lies before the exon begins.
+      if ((uint32)exon->frGEN > seedPos + 1)
+        continue;
+
+      //  Seed lies beyond the exon end plus one seed length.
+      if (seedPos > (uint32)exon->toGEN + cmd->externalSeedLength(s))
+        continue;
+
+      cmd->maskExternalSeed(s);
+    }
+  }
+}
+
+//  Overwrite with 'N' the genomic bases covered by each exon that has a
+//  non-zero toGEN, in both the forward sequence 'f' and the
+//  reverse-complement sequence 'r', each of length 'l'.
+//
+//  Exon coordinates are 1-based and inclusive: forward indices
+//  frGEN-1 .. toGEN-1 are masked, and the mirrored indices
+//  l-toGEN .. l-frGEN in the reverse sequence.
+//
+void
+Sim4::maskExonsFromGenomic(Exon         *theExon,
+                           char         *f,
+                           char         *r,
+                           int           l) {
+
+  for (Exon *exon = theExon; exon; exon = exon->next_exon) {
+    if (!exon->toGEN)
+      continue;
+
+    const int fwdBgn = exon->frGEN - 1;
+    const int fwdEnd = exon->toGEN;
+
+    for (int p=fwdBgn; p<fwdEnd; p++)
+      f[p] = 'N';
+
+    const int revBgn = l - exon->toGEN;
+    const int revEnd = l - exon->frGEN;
+
+    for (int p=revBgn; p<=revEnd; p++)
+      r[p] = 'N';
+  }
+}
+
+
+
+
+//  Align the EST named by 'cmd' against the genomic range
+//  [getGENlo(), getGENhi()) and return the resulting polishes.
+//
+//  Steps, as performed below:
+//    - cmd->finalize() is called, then upper-cased copies of the genomic
+//      range, its reverse complement and the EST are built in one buffer;
+//    - if _ignorePolyTails is set, get_polyAT() measures the poly-A/T tails,
+//      which are then excluded from the aligned EST range;
+//    - each pass of the main loop runs SIM4() on the forward and/or the
+//      reverse-complemented genomic (per cmd->doForward()/doReverse()),
+//      keeps the orientation with more matches, and pushes one polish
+//      onto the list;
+//    - with _findAllExons set, the exons just reported are masked out and
+//      the loop repeats while the last match passed the thresholds.
+//
+//  The list is allocated with new and returned to the caller.  It is empty
+//  when the EST consists only of poly-A/T tails.
+//
+//  Characters are cast through unsigned char before indexing the lookup
+//  tables, so bytes above 0x7F cannot produce a negative index where plain
+//  char is signed.
+//
+sim4polishList*
+Sim4::run(sim4command *cmd) {
+  sim4polishBuilder   B;
+  sim4polishList     *L = new sim4polishList;
+
+
+  int    dist, match_ori;
+  int    g_pA=0, f_pA=0, r_pA=0;
+  int    g_pT=0, f_pT=0, r_pT=0;
+
+  Exon   *fExons = NULL;
+  Exon   *rExons = NULL;
+
+  edit_script_list *fAligns = NULL;
+  edit_script_list *rAligns = NULL;
+
+  int     matchesPrinted = 0;
+
+  char    touppercache[256];
+
+  for (int i=0; i<256; i++)
+    touppercache[i] = (char)toupper(i);
+
+  cmd->finalize();
+
+  uint32  dblen = cmd->getGENhi() - cmd->getGENlo();
+  char   *dbseq = 0L;
+  char   *dbrev = 0L;
+  char   *dbseqorig = cmd->getGENsequence();
+
+  int     estlen = 0;
+  char   *estseq = 0L;
+  char   *estrev = 0L;
+  char   *estseqorig = 0L;
+
+//mss_t MSS; LLL DELETE
+
+  // Allocate space for temporary sequence storage. We need
+  // to allocate space for two copies of the database, and space
+  // for the longest EST (in case we need to print it out
+  // reverse complemented).
+  //
+  char   *seqStorage = 0L;
+  uint32  seqStorageSize = 0;
+
+  seqStorageSize = 2 * dblen + 2 * cmd->getESTlength() + 8;
+  seqStorage = new char [seqStorageSize];
+
+  // Original, forward, reverse, cdna
+  //
+  dbseq = seqStorage;
+  dbrev = seqStorage + dblen + 2;
+  estseq = seqStorage + dblen + 2 + dblen + 2;
+  estrev = seqStorage + dblen + 2 + dblen + 2 + cmd->getESTlength() + 2;
+
+
+  // Prepare the database sequence
+  //
+  // Trimming to the correct range
+  // Convert to uppercase
+  // Reverse complement
+  //
+  for (uint32 i=0, j=cmd->getGENlo(), k=dblen-1; j<cmd->getGENhi(); i++, j++, k--) {
+    dbseq[i] = touppercache[(unsigned char)dbseqorig[j]];
+    dbrev[k] = complementSymbol[(unsigned char)dbseq[i]];
+  }
+  dbseq[dblen] = 0;
+  dbrev[dblen] = 0;
+
+  sim4_stats_t st, rev_st;
+
+  estseqorig = cmd->getESTsequence();
+  estlen = cmd->getESTlength();
+
+  for (int i=0; i<estlen; i++)
+    estseq[i] = touppercache[(unsigned char)estseqorig[i]];
+  estseq[estlen] = 0;
+
+  g_pT = g_pA = 0;
+
+  if (globalParams->_ignorePolyTails) {
+    get_polyAT(estseq, estlen, &g_pT, &g_pA);
+  }
+
+
+  // GRRR! XXXXX This needs to be defined outside the loop, and before the goto's
+  bool pleaseContinueComputing = false;
+
+
+  //  Nothing left to align once the poly tails are removed.
+  if (estlen - g_pA - g_pT <= 0)
+    goto abort;
+
+
+  matchesPrinted = 0;
+
+  do {
+    //fprintf(stderr, "sim4string::main loop begins!\n");
+
+    int nmatches = 0;
+    double coverage = 0;
+    int percentid = 0;
+
+    pleaseContinueComputing = false;
+
+    B.create(cmd->getESTidx(), estlen,
+             cmd->getGENidx(), cmd->getGENlo(), cmd->getGENhi());
+
+    if (globalParams->_includeDefLine) {
+      B.setESTdefline(cmd->getESTheader());
+      B.setGENdefline(cmd->getGENheader());
+    }
+
+    memset(&st, 0, sizeof(sim4_stats_t));
+    memset(&rev_st, 0, sizeof(sim4_stats_t));
+
+    if (cmd->externalSeedsExist() == false) {
+// MSS = masks_shifts(globalParams->_spacedSeed); LLL DELETE
+      bld_table(estseq - 1 + g_pT, estlen - g_pA - g_pT, spacedSeedMSS, INIT);
+    }
+
+    if (cmd->doForward()) {
+
+      // Initialize the sequences and lengths
+      //
+      // genSeq was seq1
+      // estSeq was seq2
+      //
+      _genSeq = dbseq;
+      _estSeq = estseq + g_pT;
+      _genLen = dblen;
+      _estLen = estlen - g_pT - g_pA;
+
+      // This should be in a better spot.
+      _mspManager.setLength(_estLen);
+      _mspManager.clearDiagonal(_genLen, _estLen);
+      _mspManager.setScoreThreshold(mspThreshold1, globalParams->_interspecies);
+
+#ifdef SHOW_EXTERNAL_SEEDING
+      fprintf(stderr, "FWD: estLen = %d genLen = %d\n", _estLen, _genLen);
+#endif
+
+      // Find the seeds.
+      //
+      if (cmd->externalSeedsExist() == false) {
+        exon_cores(_genSeq-1, _estSeq-1, _genLen, _estLen, 1, 1, 0, spacedSeedMSS, mspThreshold1, PERM);
+      } else {
+#ifdef SHOW_EXTERNAL_SEEDING
+        fprintf(stderr, "FWD: Using external seeds -- adding " uint32FMT " seeds to sim4.\n", cmd->numberOfExternalSeeds());
+#endif
+
+        cmd->sortExternalSeeds();
+
+        for (uint32 x=0; x<cmd->numberOfExternalSeeds(); x++)
+          if (cmd->externalSeedLength(x) > 0)
+            _mspManager.addHit(_genSeq-1, _estSeq-1,
+                               _genLen, _estLen,
+                               cmd->externalSeedGENPosition(x),
+                               cmd->externalSeedESTPosition(x),
+                               spacedSeedMSS); // LLL 6-17/10 This doesn't make sense here (seed is probably 20mer, but not used anyway
+// cmd->externalSeedLength(x)); LLL: MUST CHANGE, using spaced seeds
+
+        exon_list = _mspManager.doLinking(DEFAULT_WEIGHT, DEFAULT_DRANGE,
+                                          1, 1,
+                                          0,
+                                          false,
+                                          _genSeq, _estSeq);
+
+#ifdef SHOW_EXTERNAL_SEEDING
+        fprintf(stderr, "FWD: Added and chained, starting SIM4() run.\n");
+#endif
+      }
+
+      fAligns = SIM4(&dist,
+                     &fExons,
+                     &f_pA,
+                     &f_pT,
+                     &st);
+
+      // Continued from util.C :: slide_intron()
+      //
+      // If we are forcing the strand prediction, and we are still unknown,
+      // set the strand prediction to the match orientation. Since this
+      // will be reversed later on, set it to FWD here.
+      //
+      if ((globalParams->_forceStrandPrediction) && (st.orientation == BOTH))
+        st.orientation = FWD;
+
+      // If the match was deemed expensive, report
+      //
+      if (st.tooManyMSPs) {
+        B.setNumberOfMatches(0, 0);
+        B.setPercentIdentity(0);
+        B.setMatchOrientation(SIM4_MATCH_FORWARD);
+        B.setStrandOrientation(SIM4_STRAND_INTRACTABLE);
+        B.addExon(1, estlen,
+                  1, cmd->getGENhi() - cmd->getGENlo(),
+                  st.numberOfMatches, 0, 0,
+                  SIM4_INTRON_NONE);
+        goto fail;
+      }
+    }
+
+    if (cmd->doReverse()) {
+      // Initialize the sequences and lengths
+      //
+      // genSeq was seq1
+      // estSeq was seq2
+      //
+      _genSeq = dbrev;
+      _estSeq = estseq + g_pT;
+      _genLen = dblen;
+      _estLen = estlen - g_pT - g_pA;
+
+      // This should be in a better spot.
+      _mspManager.setLength(_estLen);
+      _mspManager.clearDiagonal(_genLen, _estLen);
+      _mspManager.setScoreThreshold(mspThreshold1, globalParams->_interspecies);
+
+#ifdef SHOW_EXTERNAL_SEEDING
+      fprintf(stderr, "BWD: estLen = %d genLen = %d g_pT=%d g_pA=%d\n", _estLen, _genLen, g_pT, g_pA);
+#endif
+
+      // Find the seeds.
+      //
+      if (cmd->externalSeedsExist() == false) {
+        exon_cores(_genSeq-1, _estSeq-1, _genLen, _estLen, 1, 1, 0, spacedSeedMSS, mspThreshold1, PERM);
+      } else {
+#ifdef SHOW_EXTERNAL_SEEDING
+        fprintf(stderr, "BWD: Using external seeds -- adding " uint32FMT " seeds to sim4.\n", cmd->numberOfExternalSeeds());
+#endif
+
+        cmd->sortExternalSeeds();
+
+        // We have sorted the seeds in increasing genomic position,
+        // but we need to reverse everything. We can do this by just
+        // adding the seeds backwards!
+        //
+        // for (uint32 x=cmd->numberOfExternalSeeds(); x--; )
+        //
+        // Not sure _why_ we wanted to add them backwards, but it
+        // screws up the addHit logic of skipping seeds we have
+        // extended through. I vaguely remember some piece of sim4
+        // external seeding needing to be done backwards.
+        // Apparently, this isn't it.
+        //
+        for (uint32 x=0; x<cmd->numberOfExternalSeeds(); x++)
+          if (cmd->externalSeedLength(x) > 0)
+            _mspManager.addHit(_genSeq-1, _estSeq-1,
+                               _genLen, _estLen,
+                               cmd->externalSeedGENPosition(x),
+                               cmd->externalSeedESTPosition(x),
+                               spacedSeedMSS); // 6-17-10 LLL This doesn't make sense here; seed must probably be a 20-mer, but the code is unused anyway
+// cmd->externalSeedLength(x)); LLL: MUST CHANGE, using spaced seeds
+
+        exon_list = _mspManager.doLinking(DEFAULT_WEIGHT, DEFAULT_DRANGE,
+                                          1, 1,
+                                          0,
+                                          false,
+                                          _genSeq, _estSeq);
+#ifdef SHOW_EXTERNAL_SEEDING
+        fprintf(stderr, "BWD: Added and chained, starting SIM4() run.\n");
+#endif
+      }
+
+      rAligns = SIM4(&dist,
+                     &rExons,
+                     &r_pA,
+                     &r_pT,
+                     &rev_st);
+
+      // Continued from util.C :: slide_intron()
+      //
+      // If we are forcing the strand prediction, and we are still unknown,
+      // set the strand prediction to the match orientation.
+      //
+      if ((globalParams->_forceStrandPrediction) && (rev_st.orientation == BOTH))
+        rev_st.orientation = FWD;
+
+      // If the match was deemed expensive, report
+      if (rev_st.tooManyMSPs) {
+        B.setNumberOfMatches(0, 0);
+        B.setPercentIdentity(0);
+        B.setMatchOrientation(SIM4_MATCH_COMPLEMENT);
+        B.setStrandOrientation(SIM4_STRAND_INTRACTABLE);
+        B.addExon(1, estlen,
+                  1, cmd->getGENhi() - cmd->getGENlo(),
+                  rev_st.numberOfMatches, 0, 0,
+                  SIM4_INTRON_NONE);
+        goto fail;
+      }
+    }
+
+
+    //  Keep whichever orientation produced more matches; ties go forward.
+    if (st.numberOfMatches >= rev_st.numberOfMatches) {
+      match_ori = FWD;
+
+      if (globalParams->_ignorePolyTails) {
+        add_offset_exons(fExons, g_pT);
+
+        //add_offset_aligns(fAligns, g_pT);
+        for (edit_script_list *aligns = fAligns; aligns; aligns = aligns->next_script)
+          aligns->offset2 += g_pT;
+      }
+
+      B.setPolyTails(g_pA + f_pA, g_pT + f_pT);
+
+      if (fExons) {
+        // We used to mask the seeds down with the masking of the
+        // genomic, but reverse exons are flipped here, and we need
+        // unflipped exons to mask.
+        //
+        if (cmd->externalSeedsExist() && globalParams->_findAllExons)
+          maskExonsFromSeeds(cmd, fExons);
+
+        if (checkExonsForOverlaps(fExons)) {
+#ifdef SHOW_OVERLAPPING_EXONS
+          B.setNumberOfMatches(0, 0);
+          B.setPercentIdentity(0);
+          B.setMatchOrientation(SIM4_MATCH_FORWARD);
+          B.setStrandOrientation(SIM4_STRAND_FAILED);
+
+          // XXX: result contains the exons and alignments
+          //B.addExon(1, estlen, 1, cmd->getGENhi() - cmd->getGENlo(), rev_st.numberOfMatches, 0, SIM4_INTRON_NONE);
+#endif
+          goto fail;
+        }
+      }
+    } else {
+      match_ori = BWD;
+
+      if (globalParams->_ignorePolyTails) {
+        add_offset_exons(rExons, g_pT);
+
+        //add_offset_aligns(rAligns, g_pT);
+        for (edit_script_list *aligns = rAligns; aligns; aligns = aligns->next_script)
+          aligns->offset2 += g_pT;
+      }
+
+      B.setPolyTails(g_pA + r_pA, g_pT + r_pT);
+
+      if (rAligns && rAligns->next_script)
+        script_flip_list(&rAligns);
+
+      if (rExons) {
+        if (cmd->externalSeedsExist() && globalParams->_findAllExons)
+          maskExonsFromSeeds(cmd, rExons);
+
+        // This used to be right before appendExons() in
+        // the reverse match section, but we need it
+        // before we test for overlapping exons
+        //
+        complement_exons(&rExons, dblen, estlen);
+
+        if (checkExonsForOverlaps(rExons)) {
+#ifdef SHOW_OVERLAPPING_EXONS
+          B.setNumberOfMatches(0, 0);
+          B.setPercentIdentity(0);
+          B.setMatchOrientation(SIM4_MATCH_COMPLEMENT);
+          B.setStrandOrientation(SIM4_STRAND_FAILED);
+
+          // XXX: result contains the exons and alignments
+          //B.addExon(1, estlen, 1, cmd->getGENhi() - cmd->getGENlo(), rev_st.numberOfMatches, 0, SIM4_INTRON_NONE);
+#endif
+          goto fail;
+        }
+      }
+    }
+
+    if (match_ori == FWD) {
+      nmatches = st.numberOfMatches;
+      percentid = st.percentID;
+    } else {
+      nmatches = rev_st.numberOfMatches;
+      percentid = rev_st.percentID;
+    }
+
+    coverage = (double)nmatches / (double)estlen;
+
+
+    // Is this match decent?
+    //
+    pleaseContinueComputing = ((coverage >= globalParams->_minCoverage) &&
+                               (percentid >= globalParams->_minPercentExonIdentity) &&
+                               (nmatches >= globalParams->_minCoverageLength) &&
+                               (nmatches > 0));
+
+    // If we're supposed to print at least _alwaysReport things,
+    // and we found a match, keep going.
+    //
+    if ((matchesPrinted < globalParams->_alwaysReport) && (nmatches > 0))
+      pleaseContinueComputing = true;
+
+    // However, if we have printed enough stuff, and the last one is
+    // below the thresholds, stop.
+    //
+    if ((matchesPrinted >= globalParams->_alwaysReport) &&
+        ((coverage < globalParams->_minCoverage) ||
+         (percentid < globalParams->_minPercentExonIdentity)))
+      pleaseContinueComputing = false;
+
+
+    if (pleaseContinueComputing) {
+      matchesPrinted++;
+
+      if (match_ori == FWD) {
+        B.setNumberOfMatches(st.numberOfMatches, st.numberOfNs);
+        B.setPercentIdentity(st.percentID);
+        B.setMatchOrientation(SIM4_MATCH_FORWARD);
+
+        switch (st.orientation) {
+          case FWD:
+            B.setStrandOrientation(SIM4_STRAND_POSITIVE);
+            break;
+          case BWD:
+            B.setStrandOrientation(SIM4_STRAND_NEGATIVE);
+            break;
+          default:
+            B.setStrandOrientation(SIM4_STRAND_UNKNOWN);
+            break;
+        }
+      } else {
+        B.setNumberOfMatches(rev_st.numberOfMatches, rev_st.numberOfNs);
+        B.setPercentIdentity(rev_st.percentID);
+        B.setMatchOrientation(SIM4_MATCH_COMPLEMENT);
+        B.setStrandOrientation(SIM4_STRAND_FAILED);
+
+        //  The strand is reported inverted for a complement match.
+        switch (rev_st.orientation) {
+          case FWD:
+            B.setStrandOrientation(SIM4_STRAND_NEGATIVE);
+            break;
+          case BWD:
+            B.setStrandOrientation(SIM4_STRAND_POSITIVE);
+            break;
+          default:
+            B.setStrandOrientation(SIM4_STRAND_UNKNOWN);
+            break;
+        }
+      }
+
+
+      // If we have external seeds, we need to mask out seeds that we
+      // used BEFORE we print alignments -- printing reverse
+      // alignments also switches from reverse-complemented genomic
+      // to reverse-complemented EST, and then we can't (easily) mask
+      // seeds!
+      //
+      // Likewise, we can't do the normal masking before we print the
+      // alignments, else we'd just print out N's for the genome.
+      //
+      if (match_ori == FWD) {
+        appendExons(B, fExons);
+
+        if (globalParams->_printAlignments) {
+          appendAlignments(B,
+                           estseq, dbseq, estlen, dblen,
+                           fAligns, fExons,
+                           FWD);
+        }
+
+        if (globalParams->_findAllExons)
+          maskExonsFromGenomic(fExons, dbseq, dbrev, dblen);
+      } else {
+        appendExons(B, rExons);
+
+        if (globalParams->_printAlignments) {
+          for (int i=0, k=estlen-1; i<estlen; i++, k--)
+            estrev[k] = complementSymbol[(unsigned char)estseq[i]];
+          estrev[estlen] = 0;
+
+          appendAlignments(B,
+                           estrev, dbseq, estlen, dblen,
+                           rAligns, rExons,
+                           BWD);
+        }
+
+        if (globalParams->_findAllExons)
+          maskExonsFromGenomic(rExons, dbseq, dbrev, dblen);
+      }
+    }
+
+  fail:
+
+    // These are NOT garbage collected!
+    if (fAligns) free_align(fAligns);
+    if (rAligns) free_align(rAligns);
+
+    // These ARE garbage collected
+    //freeExonList(fExons);
+    //freeExonList(rExons);
+
+    fAligns = rAligns = 0L;
+    fExons = rExons = 0L;
+
+    L->push(B.release());
+  } while (globalParams->_findAllExons && pleaseContinueComputing);
+
+ abort:
+
+  delete [] seqStorage;
+
+  return(L);
+}
+
+
+
+
+
+
+////////////////////////////////////////////////////////////
+//
+// Exons
+//
+////////////////////////////////////////////////////////////
+
+
+//  Return true if some exon starts at or before the end of the exon
+//  preceding it in the list, in either genomic or EST coordinates.  The
+//  walk stops at the end of the list or at the first follower whose toGEN
+//  is zero.  'theExons' must not be NULL.
+//
+bool
+Sim4::checkExonsForOverlaps(Exon *theExons) {
+  Exon *prev = theExons;
+
+  for (Exon *cur = theExons->next_exon; cur && cur->toGEN; prev = cur, cur = cur->next_exon) {
+    bool genOverlaps = (cur->frGEN <= prev->toGEN);
+    bool estOverlaps = (cur->frEST <= prev->toEST);
+
+    if (genOverlaps || estOverlaps)
+      return(true);
+  }
+
+  return(false);
+}
+
+
+
+//  Add every exon with a non-zero toGEN to the polish builder.
+//
+//  The intron annotation passed with an exon is SIM4_INTRON_NONE unless
+//  the exon is followed by another node with a non-zero toGEN; in that
+//  case it is derived from the exon's 'ori' character.
+//
+void
+Sim4::appendExons(sim4polishBuilder &B, Exon *theExons) {
+
+  for (Exon *exon = theExons; exon; exon = exon->next_exon) {
+    if (!exon->toGEN)
+      continue;
+
+#ifdef SPLSCORE
+    // Save the splice score (theExon->splScore);
+    // "%d-%d (%d-%d) <%d-%d-%d> %1.2f %s"
+#error I do not know how to save the splice score!
+#endif
+
+    char intronOri = SIM4_INTRON_NONE;
+
+    if ((exon->next_exon) && (exon->next_exon->toGEN)) {
+      if (exon->ori == 'C')          // <-
+        intronOri = SIM4_INTRON_NEGATIVE;
+      else if (exon->ori == 'E')     // ==
+        intronOri = SIM4_INTRON_GAP;
+      else if (exon->ori == 'G')     // ->
+        intronOri = SIM4_INTRON_POSITIVE;
+      else if (exon->ori == 'N')     // --
+        intronOri = SIM4_INTRON_AMBIGUOUS;
+      else
+        intronOri = SIM4_INTRON_ERROR;
+    }
+
+    B.addExon(exon->frEST, exon->toEST,
+              exon->frGEN, exon->toGEN,
+              exon->numMatches,
+              exon->numNs,
+              exon->percentID,
+              intronOri);
+  }
+}
+
+
+////////////////////////////////////////////////////////////
+//
+// Alignments
+//
+////////////////////////////////////////////////////////////
+
+
+
+
+//  Turn one edit script into per-exon alignment strings and hand each
+//  pair to builder.addExonAlignment().
+//
+//    aString, bString  scratch buffers the two alignment rows are written to
+//    A, B              the two sequences; both are read with a pre-incremented
+//                      index (A[++i], B[++j]), i.e. position 1 is A[1]
+//    M, N              number of positions to consume from A and from B
+//    S                 script: 0 consumes one position of each sequence,
+//                      a positive value consumes that many positions of B,
+//                      a negative value that many positions of A
+//    AP, BP            start positions used to locate the first exon and to
+//                      detect exon boundaries
+//    est_strand        1 or 2; selects which coordinate pairs with AP/BP and
+//                      in which sequence a long gap may be an intron
+//    exons             exon list the alignment belongs to
+//
+//  If no usable exon is found, a single pair of explanatory message
+//  strings is added instead of an alignment.
+//
+void
+Sim4::IDISPLAY(sim4polishBuilder &builder,
+ char *aString,
+ char *bString,
+ char *A,
+ char *B,
+ int M,
+ int N,
+ int *S,
+ int AP,
+ int BP,
+ int est_strand,
+ Exon *exons) {
+ Exon *t0;
+ register int i, j, op;
+ int starti, is_intron=0;
+
+ if ((exons==NULL) || (!exons->toGEN && (exons->next_exon==NULL))) {
+ builder.addExonAlignment("Empty exon list; no alignment possible!",
+ "Empty exon list; no alignment possible!");
+ return;
+ }
+
+ /* find the starting exon for this alignment */
+ t0 = exons;
+ while (t0 && (((est_strand==2) && ((t0->frGEN!=AP) || (t0->frEST!=BP))) ||
+ ((est_strand==1) && ((t0->frGEN!=BP) || (t0->frEST!=AP))))) {
+ t0 = t0->next_exon;
+ }
+
+ if (!t0) {
+ builder.addExonAlignment("Alignment fragment not found; no alignment possible!",
+ "Alignment fragment not found; no alignment possible!");
+ return;
+ }
+
+ i = j = op = 0;
+
+ // Genomic position just past the current exon, or -1 when no further
+ // exon with a non-zero toGEN follows (so no intron can be detected).
+ starti = (t0->next_exon && t0->next_exon->toGEN) ? (t0->toGEN+1):-1;
+
+ char *a = aString;
+ char *b = bString;
+
+#if 0
+ fprintf(stderr, "M=%d N=%d\n", M, N);
+ fprintf(stderr, "aString=0x%p\nbString=0x%p\n", aString, bString);
+#endif
+
+ while (i < M || j < N) {
+ *a = *b = 0;
+#if 0
+ fprintf(stderr, "i=%d < M=%d and j=%d < N=%d\n", i, M, j, N);
+ fprintf(stderr, "a=%s\n", aString);
+ fprintf(stderr, "b=%s\n", bString);
+#endif
+
+ if (op == 0 && *S == 0) {
+ // Aligned column.  Equal characters are shifted by 'a'-'A'
+ // (lower case, assuming upper-case input); differing characters
+ // are copied unchanged.
+ op = *S++;
+ i++;
+ j++;
+ if (A[i] == B[j]) {
+ *a++ = (char)(A[i] + 'a' - 'A');
+ *b++ = (char)(B[j] + 'a' - 'A');
+ } else {
+ *a++ = A[i];
+ *b++ = B[j];
+ }
+ } else {
+ // Fetch the next gap run unless one is still being emitted.
+ if (op == 0)
+ op = *S++;
+
+ if (op > 0) {
+ // Positive run: B advances, row a gets '-'.
+ if (est_strand==2) {
+ *a++ = '-';
+ *b++ = B[++j];
+ op--;
+ } else {
+ if (j+BP==starti) {
+ /* detected intron */
+ t0 = t0->next_exon;
+ starti=(t0->next_exon && t0->next_exon->toGEN)?(t0->toGEN+1):-1;
+ /* print entire exon */
+ is_intron = 1;
+ // Skip the whole run in B without writing it.
+ j += op;
+ op = 0;
+ } else {
+ *a++ = '-';
+ *b++ = B[++j];
+ op--;
+ }
+ }
+ } else {
+ // Negative run: A advances, row b gets '-'.
+ if (est_strand==1) {
+ *a++ = A[++i];
+ *b++ = '-';
+ op++;
+ } else {
+ if (i+AP==starti) {
+ /* detected intron */
+ t0 = t0->next_exon;
+ starti=(t0->next_exon && t0->next_exon->toGEN)?(t0->toGEN+1):-1;
+ is_intron = 1;
+ // Skip the whole run in A without writing it.
+ i += -op;
+ op = 0;
+ } else {
+ *a++ = A[++i];
+ *b++ = '-';
+ op++;
+ }
+ }
+ }
+ }
+
+ // Flush the rows at each intron and once both sequences are used up,
+ // then start over at the beginning of the buffers.
+ if (is_intron || ((i >= M) && (j >= N))) {
+ *a = 0;
+ *b = 0;
+
+ builder.addExonAlignment(aString, bString);
+
+ a = aString;
+ b = bString;
+
+ is_intron = 0;
+ }
+ }
+}
+
+
+
+
+//  Flatten a linked edit script into the integer array S:
+//
+//    SUBSTITUTE of length n  ->  n zeros
+//    INSERT     of length n  ->  one entry, -n
+//    anything else (DELETE)  ->  one entry, +n
+//
+//  The number of entries written is stored in S[-1], so the caller must
+//  pass a pointer that has one writable int in front of it.
+//
+void
+Sim4::S2A(edit_script *head, int *S) {
+  int *out = S;
+
+  for (edit_script *step = head; step != NULL; step = step->next) {
+    switch (step->op_type) {
+      case SUBSTITUTE:
+        for (int k=0; k<step->num; ++k)
+          *out++ = 0;
+        break;
+      case INSERT:
+        *out++ = -step->num;
+        break;
+      default:  /* DELETE */
+        *out++ = step->num;
+        break;
+    }
+  }
+
+  S[-1] = (int)(out - S);
+}
+
+
+
+
+//  Render every edit script in 'Aligns' through IDISPLAY(), which adds
+//  the alignment strings to 'builder'.
+//
+//    s1, l1     first sequence and its length (passed as A to IDISPLAY)
+//    s2, l2     second sequence and its length (passed as B to IDISPLAY)
+//    Aligns     list of edit scripts; nothing is done when it is NULL
+//    Exons      exon list, used to size the buffers and passed through
+//    match_ori  FWD uses the script offsets as they are; any other value
+//               reverses the script and mirrors the offsets within l1/l2
+//
+void
+Sim4::appendAlignments(sim4polishBuilder &builder,
+                       char *s1,
+                       char *s2,
+                       int l1,
+                       int l2,
+                       edit_script_list *Aligns,
+                       Exon *Exons,
+                       int match_ori) {
+
+  if (Aligns==NULL)
+    return;
+
+  // Determine the maximum length of an alignment by finding the
+  // longest exon, measured as genomic span plus EST span.
+  //
+  int maxAlignmentLength = 0;
+
+  for (Exon *exon = Exons; exon; exon = exon->next_exon) {
+    if (!exon->toGEN)
+      continue;
+
+    int span = exon->toGEN - exon->frGEN + exon->toEST - exon->frEST;
+
+    if (maxAlignmentLength < span)
+      maxAlignmentLength = span;
+  }
+
+  char *aString = new char [maxAlignmentLength + 4];
+  char *bString = new char [maxAlignmentLength + 4];
+
+  for (edit_script_list *aligns = Aligns; aligns; aligns = aligns->next_script) {
+
+    // One extra leading slot: S2A() stores the entry count in S[-1].
+    int *S = (int *)ckalloc((2 * aligns->len2 + 1 + 1) * sizeof(int)) + 1;
+
+    S2A(aligns->script, S);
+
+    // 1-based start of this script in each sequence.
+    int start1 = aligns->offset2;
+    int start2 = aligns->offset1;
+
+    if (match_ori != FWD) {
+      align_reverse(S);
+
+      start1 = l1 + 1 - (aligns->offset2 + aligns->len2 - 1);
+      start2 = l2 + 1 - (aligns->offset1 + aligns->len1 - 1);
+    }
+
+    // IDISPLAY pre-increments its indices, hence the extra -1 on top of
+    // the 1-based to 0-based conversion.
+    IDISPLAY(builder,
+             aString,
+             bString,
+             s1 + start1 - 1 - 1,
+             s2 + start2 - 1 - 1,
+             aligns->len2,
+             aligns->len1,
+             S,
+             start1,
+             start2,
+             1,
+             Exons);
+
+    ckfree(S-1);
+  }
+
+  delete [] aString;
+  delete [] bString;
+}
+
diff --git a/libsim4/sim4core/sites.C b/libsim4/sim4core/sites.C
new file mode 100644
index 0000000..225785c
--- /dev/null
+++ b/libsim4/sim4core/sites.C
@@ -0,0 +1,820 @@
+//Copyright (c) 2003 by Mihaela Pertea
+
+
+#include "sim4.H"
+#include "sites_score.H"
+#include "sites_donor.H"
+#include "sites_acceptor.H"
+
+// Decision trees used to pick which donor / acceptor model scores a site.
+// They are stored as text and parsed by readtree().  Each node is written
+// "( val consens poz no l<left subtree> r<right subtree> )"; findfile()
+// stops descending at a node whose consens is -1.
+char DONOR_TREE[] = "( 0 2 4 10000 l( 1 2 9 7841 l( 3 0 8 5666 l( 5 0 3 3977 l( 7 0 7 2186 l( 9 -1 -1 995 l r ) r( 10 -1 -1 1191 l r ) ) r( 8 3 10 1791 l( 15 -1 -1 931 l r ) r( 16 -1 -1 860 l r ) ) ) r( 6 -1 -1 1689 l r ) ) r( 4 -1 -1 2175 l r ) ) r( 2 -1 -1 2159 l r ) )"; // \n5 20\n";
+
+char ACCEPTOR_TREE[] = "( 0 1 23 10000 l( 1 3 21 6544 l( 3 3 20 3573 l( 5 3 16 2146 l( 7 -1 -1 1295 l r ) r( 8 -1 -1 851 l r ) ) r( 6 -1 -1 1427 l r ) ) r( 4 1 21 2971 l( 15 1 20 1914 l( 17 -1 -1 1009 l r ) r( 18 -1 -1 905 l r ) ) r( 16 -1 -1 1057 l r ) ) ) r( 2 -1 -1 3456 l r ) )"; // \n44 72\n";
+
+#define TRUE 1
+#define FALSE 0
+// Number of positions scored by Acc(), and the second dimension of the
+// acceptor tables.
+#define ACCEPTOR_LEN 29 /* Positions +44,72 in a80 */
+// Only referenced from commented-out code in this file.
+#define ACCEPTOR_SIGNAL_OFFSET 24 /* Start of AG */
+
+// Number of positions scored by Don(), and the second dimension of the
+// donor tables.
+#define DONOR_LEN 16 /* Positions +5,20 in d80 */
+// Only referenced from commented-out code in this file.
+#define DONOR_SIGNAL_OFFSET 5 /* Start of GT */
+
+// MARKOV_LEN is the last dimension of the static Cod_* tables.  The order
+// and context count used when scoring are the markov_degree / markov_len
+// variables, not these macros.
+#define MARKOV_DEGREE 3
+#define MARKOV_LEN 64 /* ALPHABET_SIZE ^ MARKOV_DEGREE */
+#define LOW_SCORE -99.0 /* Score if pattern does not have GT or AG signal */
+
+
+// Only referenced from code disabled with #if 0 in this file.
+#define SITE_LEN 162
+
+// Number of positions scored by Is_Cod_NonCod().
+#define CODING_LEN 80
+
+// Fallbacks for builds where the standard exit codes are not defined.
+#ifndef EXIT_FAILURE
+ #define EXIT_FAILURE -1
+#endif
+#ifndef EXIT_SUCCESS
+ #define EXIT_SUCCESS 0
+#endif
+
+// One node of a decision tree built by readtree() from DONOR_TREE or
+// ACCEPTOR_TREE.
+typedef struct tree {
+ // First number of the node; findfile() returns it, and Acc() / Don() use
+ // that value as a model index.
+ int val;
+ // Value compared against S[poz] in findfile(); -1 stops the descent.
+ int consens;
+ // Index into the window S that findfile() tests at this node.
+ int poz;
+ // Fourth number of the node; only printed by postorder() in this file
+ // (presumably a sample count -- TODO confirm).
+ int no;
+ // Child followed when S[poz] == consens.
+ struct tree *left;
+ // Child followed otherwise.
+ struct tree *right;
+ } tree;
+
+//  Debugging aid: print "[val consens poz no] " for every node of the tree,
+//  children first (left, then right), then the node itself.  A NULL root
+//  prints nothing.
+void postorder(tree *root)
+{
+  if (root == NULL)
+    return;
+
+  postorder(root->left);
+  postorder(root->right);
+
+  printf("[%d %d %d %d] ", root->val, root->consens, root->poz, root->no);
+}
+
+typedef unsigned int word;
+
+// Forward declarations of the file-level helpers defined further down.
+int Acc (const int *, double *,tree *t,int ind);
+int Don (const int *, double *, tree *t,int ind);
+int comp(const void *a, const void *b);
+int findfile(const int * S, tree *t);
+int readtree(Sim4 *S4, char *line, tree *t, int start);
+int find(char *line, int start);
+int Is_Cod_NonCod (const int * , double *, int ind);
+float ****Load4dim(Sim4 *S4, int d1, int d2, int d3, int d4);
+// NOTE(review): the free4dim() defined later in this file takes no Sim4 *
+// argument, so this prototype does not describe it -- confirm which
+// signature is intended.
+void free4dim(Sim4 *S4, float ****ptr,int d1, int d2, int d3);
+
+// Inclusive index ranges, within the buffer passed to the
+// Score*_GeneSplicer() functions, of the windows copied out for scoring.
+
+// Window handed to Acc() by ScoreAcceptor_GeneSplicer().
+#define Start_PosEx 56
+#define Stop_PosEx 84
+
+// Window handed to Don() by ScoreDonor_GeneSplicer().
+#define Start_PosIn 75
+#define Stop_PosIn 90
+
+// The two windows handed to Is_Cod_NonCod() by both scoring functions.
+#define Start_Cod 0
+#define Stop_Cod 79
+
+#define Start_NoCod 82
+#define Stop_NoCod 161
+
+
+// Markov order and number of contexts used to index the score tables;
+// assigned in loadGeneSplicerModel().
+int markov_degree;
+int markov_len;
+// Roots of the parsed acceptor and donor decision trees.
+tree *tacc = NULL;
+tree *tdon = NULL;
+// TRUE once the corresponding tree and tables have been built; reset by
+// UnLoadSites_GeneSplicer().
+int readtacc=FALSE;
+int readtdon=FALSE;
+// Values returned by readtree(); used as the number of model slots in the
+// tables below.
+int accmax = 0;
+int donmax = 0;
+// Acceptor score tables, indexed [model][position][base][context] and
+// allocated by Load4dim().  Acc_Tables_Loaded[model] records whether that
+// model's values have been copied in yet.
+float ****Acc_Positive_Table = NULL;
+float ****Acc_Negative_Table = NULL;
+int *Acc_Tables_Loaded = NULL;
+// Donor score tables, same layout as the acceptor tables.
+float ****Don_Positive_Table = NULL;
+float ****Don_Negative_Table = NULL;
+int *Don_Tables_Loaded = NULL;
+// Tables used by Is_Cod_NonCod(), indexed [model][position][base][context],
+// with one loaded flag per model.
+float Cod_Positive_Table [4][CODING_LEN] [ALPHABET_SIZE] [MARKOV_LEN];
+float Cod_Negative_Table [4][CODING_LEN] [ALPHABET_SIZE] [MARKOV_LEN];
+int Cod_Tables_Loaded[4] = {FALSE,FALSE,FALSE,FALSE};
+
+//  Build the GeneSplicer state kept in this file's globals: parse the donor
+//  and acceptor decision trees and allocate their score tables.  Each half
+//  is skipped when its read flag is already TRUE, so calling this again is
+//  harmless.  Any allocation failure aborts the process.
+void
+Sim4::loadGeneSplicerModel()
+{
+  markov_degree = 1;
+  markov_len    = (int)pow(ALPHABET_SIZE,1);
+
+  //  Donor side.
+  if (!readtdon) {
+    tdon = (tree *) malloc(sizeof(tree));
+    if (tdon == NULL) {
+      fprintf(stderr,"Memory allocation for tree failure.\n");
+      abort();
+    }
+
+    donmax   = readtree(this, DONOR_TREE, tdon, 0);
+    readtdon = TRUE;
+
+    //  One table slot per possible model; values are filled in by Don()
+    //  the first time a model is used.
+    Don_Positive_Table = Load4dim(this,donmax,DONOR_LEN,ALPHABET_SIZE,markov_len);
+    Don_Negative_Table = Load4dim(this,donmax,DONOR_LEN,ALPHABET_SIZE,markov_len);
+    Don_Tables_Loaded  = (int *) malloc(donmax*sizeof(int));
+
+    if (Don_Tables_Loaded == NULL) {
+      fprintf(stderr,"Memory allocation for donor site tables failed.\n");
+      abort();
+    }
+
+    for (int m=0; m<donmax; m++)
+      Don_Tables_Loaded[m] = FALSE;
+  }
+
+  //  Acceptor side.
+  if (!readtacc) {
+    tacc = (tree *) malloc(sizeof(tree));
+    if (tacc == NULL) {
+      fprintf(stderr," Memory allocation for tree failure.\n");
+      abort();
+    }
+
+    accmax = readtree(this, ACCEPTOR_TREE, tacc, 0);
+
+#ifdef DEBUG
+    printf("readtacc = %d when readtacc should be 0\n", readtacc);
+    printf("accmax = %d\n", accmax);
+    postorder(tacc);
+    printf("\n");
+#endif
+
+    readtacc = TRUE;
+
+    //  One table slot per possible model; values are filled in by Acc()
+    //  the first time a model is used.
+    Acc_Positive_Table = Load4dim(this,accmax,ACCEPTOR_LEN,ALPHABET_SIZE,markov_len);
+    Acc_Negative_Table = Load4dim(this,accmax,ACCEPTOR_LEN,ALPHABET_SIZE,markov_len);
+    Acc_Tables_Loaded  = (int *) malloc(accmax*sizeof(int));
+
+    if (Acc_Tables_Loaded == NULL) {
+      fprintf(stderr,"Memory allocation for acceptor site tables failed.\n");
+      abort();
+    }
+
+    for (int m=0; m<accmax; m++)
+      Acc_Tables_Loaded[m] = FALSE;
+  }
+}
+
+#if 1
+// This stuff is now garbage collected.
+//  Release a four-level table such as the ones built by Load4dim().
+//
+//    ptr        - top-level array; NULL is accepted and ignored
+//    d1, d2, d3 - extents of the first three levels (the innermost rows
+//                 are freed whole, so their length is not needed)
+//
+//  Each level is checked for NULL before it is indexed, so a partially
+//  built table can be released too.  The caller's copy of ptr is not
+//  modified and must not be used afterwards.
+void free4dim(float ****ptr,int d1, int d2, int d3)
+{
+  if (ptr == NULL)
+    return;
+
+  for (int i=0; i<d1; i++) {
+    if (ptr[i] == NULL)
+      continue;
+
+    for (int j=0; j<d2; j++) {
+      if (ptr[i][j] == NULL)
+        continue;
+
+      //  free(NULL) is a no-op, so the innermost rows need no test.
+      for (int k=0; k<d3; k++)
+        free(ptr[i][j][k]);
+
+      free(ptr[i][j]);
+    }
+
+    free(ptr[i]);
+  }
+
+  free(ptr);
+}
+
+
+//  Free every node of the tree rooted at t, children before parent.
+//  A NULL tree is accepted.
+//
+//  t is passed by value, so the caller's pointer still holds the old
+//  address after this returns; the caller must reset it if it is kept.
+void freetree(tree *t)
+{
+  if (t == NULL)
+    return;
+
+  freetree(t->left);
+  freetree(t->right);
+  free(t);
+}
+#endif
+
+//  Release everything loadGeneSplicerModel() built and mark the model as
+//  not loaded, so that a later load starts from scratch.
+//
+//  Every released global is reset to NULL so that no stale pointer to
+//  freed memory is left behind.  The static Cod_* tables are not freed;
+//  only their loaded flags are cleared.
+void
+Sim4::UnLoadSites_GeneSplicer()
+{
+  //  Acceptor tables.
+  if (readtacc) {
+    free4dim(Acc_Positive_Table,accmax,ACCEPTOR_LEN,ALPHABET_SIZE);
+    free4dim(Acc_Negative_Table,accmax,ACCEPTOR_LEN,ALPHABET_SIZE);
+    free(Acc_Tables_Loaded);
+
+    Acc_Positive_Table = NULL;
+    Acc_Negative_Table = NULL;
+    Acc_Tables_Loaded  = NULL;
+  }
+
+  //  Donor tables.
+  if (readtdon) {
+    free4dim(Don_Positive_Table,donmax,DONOR_LEN,ALPHABET_SIZE);
+    free4dim(Don_Negative_Table,donmax,DONOR_LEN,ALPHABET_SIZE);
+    free(Don_Tables_Loaded);
+
+    Don_Positive_Table = NULL;
+    Don_Negative_Table = NULL;
+    Don_Tables_Loaded  = NULL;
+  }
+
+#ifdef DEBUG
+  printf("tacc:\n");
+  postorder(tacc);
+  printf("\n");
+#endif
+
+  //  Acceptor tree.
+  if (readtacc) {
+    freetree(tacc);
+    tacc = NULL;
+  }
+
+#ifdef DEBUG
+  printf("tdon:\n");
+  postorder(tdon);
+  printf("\n");
+#endif
+
+  //  Donor tree.
+  if (readtdon) {
+    freetree(tdon);
+    tdon = NULL;
+  }
+
+  readtacc = FALSE;
+  readtdon = FALSE;
+
+  for (int i=0; i<4; i++)
+    Cod_Tables_Loaded[i] = FALSE;
+}
+
+//  malloc() for Load4dim(): report and abort when the request fails.
+static void *
+Load4dimAlloc(size_t bytes)
+{
+  void *mem = malloc(bytes);
+
+  if (mem == NULL) {
+    fprintf(stderr,"Memory allocation for splice site tables failed.\n");
+    abort();
+  }
+
+  return(mem);
+}
+
+//  Allocate a d1 x d2 x d3 x d4 table of floats as nested pointer arrays
+//  (indexed table[a][b][c][d]).  The floats themselves are left
+//  uninitialized.  S4 is not used.  Aborts if any allocation fails.
+float ****Load4dim(Sim4 *S4, int d1, int d2, int d3, int d4)
+{
+  float ****table = (float ****) Load4dimAlloc(d1 * sizeof(float ***));
+
+  for (int a=0; a<d1; a++) {
+    table[a] = (float ***) Load4dimAlloc(d2 * sizeof(float **));
+
+    for (int b=0; b<d2; b++) {
+      table[a][b] = (float **) Load4dimAlloc(d3*sizeof(float *));
+
+      for (int c=0; c<d3; c++)
+        table[a][b][c] = (float *) Load4dimAlloc(d4*sizeof(float));
+    }
+  }
+
+  return(table);
+}
+
+
+//  Score a candidate acceptor site.
+//
+//  Data is read at indices 0 through Stop_NoCod (161), and its elements are
+//  used directly as table indices, so it is expected to hold numeric base
+//  codes rather than letters (the letter-to-code conversion that used to
+//  live here is no longer compiled -- confirm against the callers).
+//
+//  The result is the sum of three parts:
+//    - the mean of the Acc() score from model 0 and from the model chosen
+//      by the acceptor tree, over the window Start_PosEx..Stop_PosEx;
+//    - Is_Cod_NonCod() model 0 over the window Start_NoCod..Stop_NoCod;
+//    - Is_Cod_NonCod() model 1 over the window Start_Cod..Stop_Cod.
+//
+//  Returns 0 if Acc() reports failure.
+//
+//  The acceptor tree and tables are built on first use if
+//  loadGeneSplicerModel() has not been called yet; without this the
+//  function would read through NULL table pointers.
+double
+Sim4::ScoreAcceptor_GeneSplicer(char *Data)
+{
+  double  S1, S2;
+  double  score1, score2, score3;
+  int     T[100];
+  int     i;
+
+  if (!readtacc)
+    loadGeneSplicerModel();
+
+  //  Splice signal window.
+  for (i=0; i<=Stop_PosEx-Start_PosEx; i++)
+    T[i] = Data[i+Start_PosEx];
+
+  if (Acc(T, &S1, tacc, 0) == 0)
+    return(0);
+
+  if (accmax > 1)
+    Acc(T, &S2, tacc, 1);
+  else
+    S2 = S1;
+
+  score1 = (S1+S2)/2;
+  score2 = 0;
+  score3 = 0;
+
+  //  Window Start_NoCod..Stop_NoCod, model 0.
+  for (i=0; i<=Stop_NoCod-Start_NoCod; i++)
+    T[i] = Data[i+Start_NoCod];
+
+  Is_Cod_NonCod(T, &score2, 0);
+
+  //  Window Start_Cod..Stop_Cod, model 1.
+  for (i=0; i<=Stop_Cod-Start_Cod; i++)
+    T[i] = Data[i+Start_Cod];
+
+  Is_Cod_NonCod(T, &score3, 1);
+
+  return(score1+score2+score3);
+}
+
+//  Score a candidate donor site.
+//
+//  Data is read at indices 0 through Stop_NoCod (161), and its elements are
+//  used directly as table indices, so it is expected to hold numeric base
+//  codes rather than letters (the letter-to-code conversion that used to
+//  live here is no longer compiled -- confirm against the callers).
+//
+//  The result is the sum of three parts:
+//    - the mean of the Don() score from model 0 and from the model chosen
+//      by the donor tree, over the window Start_PosIn..Stop_PosIn;
+//    - Is_Cod_NonCod() model 2 over the window Start_Cod..Stop_Cod;
+//    - Is_Cod_NonCod() model 3 over the window Start_NoCod..Stop_NoCod.
+//
+//  Returns 0 if Don() reports failure.
+//
+//  If the donor model has not been built yet it is built here by calling
+//  loadGeneSplicerModel(), rather than by repeating that function's donor
+//  setup inline.  That call also builds the acceptor model if it is
+//  missing.
+double
+Sim4::ScoreDonor_GeneSplicer(char *Data)
+{
+  double  S1, S2;
+  double  score1, score2, score3;
+  int     T[100];
+  int     i;
+
+  if (!readtdon)
+    loadGeneSplicerModel();
+
+  //  Splice signal window.
+  for (i=0; i<=Stop_PosIn-Start_PosIn; i++)
+    T[i] = Data[i+Start_PosIn];
+
+  if (Don(T, &S1, tdon, 0) == 0)
+    return(0);
+
+  if (donmax > 1)
+    Don(T, &S2, tdon, 1);
+  else
+    S2 = S1;
+
+  score1 = (S1+S2)/2;
+  score2 = 0;
+  score3 = 0;
+
+  //  Window Start_Cod..Stop_Cod, model 2.
+  for (i=0; i<=Stop_Cod-Start_Cod; i++)
+    T[i] = Data[i+Start_Cod];
+
+  Is_Cod_NonCod(T, &score2, 2);
+
+  //  Window Start_NoCod..Stop_NoCod, model 3.
+  for (i=0; i<=Stop_NoCod-Start_NoCod; i++)
+    T[i] = Data[i+Start_NoCod];
+
+  Is_Cod_NonCod(T, &score3, 3);
+
+  return(score1+score2+score3);
+}
+
+
+
+//  Copy the blank-terminated field starting at line[*pos] and return its
+//  value as converted by atoi().  On return *pos indexes the blank (or the
+//  string terminator) that ended the field.  Characters that do not fit in
+//  the local buffer are skipped instead of being written past its end.
+static int
+readtreeField(const char *line, int *pos)
+{
+  char  part[10];
+  int   used = 0;
+  int   i    = *pos;
+
+  while ((line[i] != ' ') && (line[i] != '\0')) {
+    if (used < (int)sizeof(part) - 1)
+      part[used++] = line[i];
+    i++;
+  }
+  part[used] = '\0';
+
+  *pos = i;
+  return(atoi(part));
+}
+
+//  Move count characters forward from line[i], stopping early at the
+//  string terminator so the index never leaves the string.
+static int
+readtreeAdvance(const char *line, int i, int count)
+{
+  while ((count-- > 0) && (line[i] != '\0'))
+    i++;
+
+  return(i);
+}
+
+//  Parse one node of a textual decision tree into t, recursively
+//  allocating and parsing its children.
+//
+//    S4    - only passed on to the recursive calls
+//    line  - text of the whole tree; a node is written
+//            "( val consens poz no l<left subtree> r<right subtree> )",
+//            with a missing child written as the bare letter
+//    t     - node to fill in; allocated by the caller
+//    start - index in line at or before the node's opening '('
+//
+//  The return value is incremented once at every level on the way back up,
+//  so the caller receives a number greater than every val in the subtree.
+//  It is used as an upper bound on the number of models.
+//
+//  Aborts if a child node cannot be allocated.
+int readtree(Sim4 *S4, char *line, tree *t, int start)
+{
+  int i = start;
+  int n;
+  int val, valmax;
+
+  while ((line[i]=='(') || (line[i]==' '))
+    i++;
+
+  //  The four numeric fields, each followed by a single blank.
+  t->val = readtreeField(line, &i);
+  valmax = t->val;
+
+  i = readtreeAdvance(line, i, 1);
+  t->consens = readtreeField(line, &i);
+
+  i = readtreeAdvance(line, i, 1);
+  t->poz = readtreeField(line, &i);
+
+  i = readtreeAdvance(line, i, 1);
+  t->no = readtreeField(line, &i);
+
+  t->left  = NULL;
+  t->right = NULL;
+
+  //  Step over " l"; a '(' here starts the left subtree.
+  i = readtreeAdvance(line, i, 2);
+  n = i;
+  if (line[i] == '(') {
+    i = find(line, i+1);
+
+    t->left = (tree *) malloc(sizeof(tree));
+    if (t->left == NULL) {
+      fprintf(stderr,"Memory allocation for tree failure.\n");
+      abort();
+    }
+
+    val = readtree(S4, line, t->left, n);
+    if (val > valmax)
+      valmax = val;
+  }
+
+  //  Step over " r"; a '(' here starts the right subtree.
+  i = readtreeAdvance(line, i, 2);
+  n = i;
+  if (line[i] == '(') {
+    i = find(line, i+1);
+
+    t->right = (tree *) malloc(sizeof(tree));
+    if (t->right == NULL) {
+      fprintf(stderr,"Memory allocation for tree failure.\n");
+      abort();
+    }
+
+    val = readtree(S4, line, t->right, n);
+    if (val > valmax)
+      valmax = val;
+  }
+
+  valmax++;
+  return(valmax);
+}
+
+//  Given start, an index just inside an opening '(', return the index just
+//  past the ')' that closes it.  Nested parentheses are skipped by
+//  recursion.
+//
+//  If the string ends before the closing ')' is found, the index of the
+//  string terminator is returned instead of scanning past it.
+int find(char *line, int start)
+{
+  int i = start;
+
+  while (line[i] != ')') {
+    if (line[i] == '\0')
+      return(i);
+
+    if (line[i] == '(')
+      i = find(line, i+1);
+    else
+      i++;
+  }
+
+  return(i+1);
+}
+
+
+//  Three-way comparison of two doubles, with the signature qsort()
+//  expects: 1 if *a is greater, 0 if the two are equal, -1 otherwise.
+int comp(const void *a, const void *b)
+{
+  const double lhs = *(const double *)a;
+  const double rhs = *(const double *)b;
+
+  if (lhs > rhs)
+    return(1);
+
+  if (lhs == rhs)
+    return(0);
+
+  return(-1);
+}
+
+
+//  Walk the decision tree for the window S and return the val of the node
+//  where the walk ends.  At each node whose consens is not -1, the left
+//  child is taken when S[poz] equals consens and the right child
+//  otherwise; a node with consens -1 ends the walk.
+int findfile(const int * S, tree *t)
+{
+  while (t->consens != -1) {
+    if (S[t->poz] == t->consens)
+      t = t->left;
+    else
+      t = t->right;
+  }
+
+  return(t->val);
+}
+
+//  Count nodes left to right until the node whose val equals leaf is
+//  reached.
+//
+//    t     - subtree to search; a NULL subtree is a fatal error
+//    n     - count accumulated before this subtree
+//    leaf  - val being searched for
+//    found - set to 1 when the node is reached; the caller must clear it
+//            before the first call
+//
+//  A node with no children, or the node matching leaf, adds one to the
+//  count.  Other nodes pass the count through their left and then right
+//  subtree, and the right subtree is skipped once the target is found.
+//
+//  The NULL case terminates the process with a failure status, so the
+//  error is visible to whatever started the program.
+int findleaf(tree *t, int n, int leaf, int *found)
+{
+  int ret=n;
+
+  if (t == NULL) {
+    fprintf(stderr,"tree NULL\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (t->val == leaf) {
+    *found = 1;
+    return(n+1);
+  }
+
+  if ((t->left == NULL) && (t->right == NULL))
+    return(n+1);
+
+  if (t->left != NULL)
+    ret = findleaf(t->left, n, leaf, found);
+
+  if (!(*found) && (t->right != NULL))
+    ret = findleaf(t->right, ret, leaf, found);
+
+  return(ret);
+}
+
+
+
+
+
+
+//  Score the window S[0 .. ACCEPTOR_LEN-1] with one acceptor model.
+//
+//    S            - window of base codes, used directly as table indices
+//    Return_Score - receives the positive-table sum minus the
+//                   negative-table sum
+//    t            - acceptor decision tree, consulted only when ind is
+//                   non-zero
+//    ind          - 0 scores with model 0; anything else scores with the
+//                   model that findfile() selects for S
+//
+//  The first use of a model copies its values from acc[model] into the
+//  tables: all positive values first, then all negative values.
+//
+//  Always returns 1.
+int Acc (const int * S, double * Return_Score, tree *t,int ind)
+{
+  const int model = (ind) ? findfile(S, t) : 0;
+
+  if (Acc_Tables_Loaded[model] == FALSE) {
+    int next = 0;
+
+    for (int pos = markov_degree - 1; pos < ACCEPTOR_LEN; pos++)
+      for (int ctx = 0; ctx < markov_len; ctx++)
+        for (int base = 0; base < ALPHABET_SIZE; base++)
+          Acc_Positive_Table[model][pos][base][ctx] = acc[model][next++];
+
+    for (int pos = markov_degree - 1; pos < ACCEPTOR_LEN; pos++)
+      for (int ctx = 0; ctx < markov_len; ctx++)
+        for (int base = 0; base < ALPHABET_SIZE; base++)
+          Acc_Negative_Table[model][pos][base][ctx] = acc[model][next++];
+
+    Acc_Tables_Loaded[model] = TRUE;
+  }
+
+  //  The first markov_degree bases form the initial context.
+  int context = 0;
+
+  for (int pos = 0; pos < markov_degree; pos++)
+    context = ALPHABET_SIZE * context + S[pos];
+
+  double positive = Acc_Positive_Table[model][markov_degree - 1][0][context];
+  double negative = Acc_Negative_Table[model][markov_degree - 1][0][context];
+
+  //  Each later base is scored given the current context, then shifted
+  //  into it.
+  for (int pos = markov_degree; pos < ACCEPTOR_LEN; pos++) {
+    const int base = S[pos];
+
+    positive += Acc_Positive_Table[model][pos][base][context];
+    negative += Acc_Negative_Table[model][pos][base][context];
+
+    context = ALPHABET_SIZE * (context % (markov_len / ALPHABET_SIZE)) + base;
+  }
+
+  *Return_Score = positive - negative;
+
+  return(1);
+}
+
+
+
+//  Score the window S[0 .. DONOR_LEN-1] with one donor model.
+//
+//    S            - window of base codes, used directly as table indices
+//    Return_Score - receives the positive-table sum minus the
+//                   negative-table sum
+//    t            - donor decision tree, consulted only when ind is
+//                   non-zero
+//    ind          - 0 scores with model 0; anything else scores with the
+//                   model that findfile() selects for S
+//
+//  The first use of a model copies its values from don[model] into the
+//  tables: all positive values first, then all negative values.
+//
+//  Always returns 1.
+int Don (const int * S, double * Return_Score, tree *t,int ind)
+{
+  const int model = (ind) ? findfile(S, t) : 0;
+
+  if (Don_Tables_Loaded[model] == FALSE) {
+    int next = 0;
+
+    for (int pos = markov_degree - 1; pos < DONOR_LEN; pos++)
+      for (int ctx = 0; ctx < markov_len; ctx++)
+        for (int base = 0; base < ALPHABET_SIZE; base++)
+          Don_Positive_Table[model][pos][base][ctx] = don[model][next++];
+
+    for (int pos = markov_degree - 1; pos < DONOR_LEN; pos++)
+      for (int ctx = 0; ctx < markov_len; ctx++)
+        for (int base = 0; base < ALPHABET_SIZE; base++)
+          Don_Negative_Table[model][pos][base][ctx] = don[model][next++];
+
+    Don_Tables_Loaded[model] = TRUE;
+  }
+
+  //  The first markov_degree bases form the initial context.
+  int context = 0;
+
+  for (int pos = 0; pos < markov_degree; pos++)
+    context = ALPHABET_SIZE * context + S[pos];
+
+  double positive = Don_Positive_Table[model][markov_degree - 1][0][context];
+  double negative = Don_Negative_Table[model][markov_degree - 1][0][context];
+
+  //  Each later base is scored given the current context, then shifted
+  //  into it.
+  for (int pos = markov_degree; pos < DONOR_LEN; pos++) {
+    const int base = S[pos];
+
+    positive += Don_Positive_Table[model][pos][base][context];
+    negative += Don_Negative_Table[model][pos][base][context];
+
+    context = ALPHABET_SIZE * (context % (markov_len / ALPHABET_SIZE)) + base;
+  }
+
+  *Return_Score = positive - negative;
+
+  return(1);
+}
+
+
+//  Score the window S[0 .. CODING_LEN-1] with one of the four models
+//  selected by ind.
+//
+//    S            - window of base codes, used directly as table indices
+//    Return_Score - receives the positive-table sum minus the
+//                   negative-table sum
+//    ind          - model selector:
+//                     0  exon in acceptor    (score_ex_acc)
+//                     1  intron in acceptor  (score_in_acc)
+//                     2  exon in donor       (score_ex_don)
+//                     3  intron in donor     (score_in_don)
+//
+//  The first use of a model copies its values into the Cod_* tables: all
+//  positive values first, then all negative values.
+//
+//  Returns 1 on success.  For any other ind, *Return_Score is set to 0
+//  and 0 is returned; no table is read or written.
+int Is_Cod_NonCod (const int * S, double * Return_Score, int ind)
+{
+  double *scores = NULL;
+
+  switch (ind) {
+    case 0:
+      scores = score_ex_acc;
+      break;
+    case 1:
+      scores = score_in_acc;
+      break;
+    case 2:
+      scores = score_ex_don;
+      break;
+    case 3:
+      scores = score_in_don;
+      break;
+    default:
+      //  No such model: without this, scores would be used uninitialized
+      //  and the Cod_* arrays indexed out of range.
+      *Return_Score = 0;
+      return(0);
+  }
+
+  const int no = ind;
+
+  if (!Cod_Tables_Loaded[no]) {
+    int idx = 0;
+
+    for (int i = markov_degree - 1; i < CODING_LEN; i++)
+      for (int k = 0; k < markov_len; k++)
+        for (int j = 0; j < ALPHABET_SIZE; j++)
+          Cod_Positive_Table[no][i][j][k] = scores[idx++];
+
+    for (int i = markov_degree - 1; i < CODING_LEN; i++)
+      for (int k = 0; k < markov_len; k++)
+        for (int j = 0; j < ALPHABET_SIZE; j++)
+          Cod_Negative_Table[no][i][j][k] = scores[idx++];
+
+    Cod_Tables_Loaded[no] = TRUE;
+  }
+
+  //  The first markov_degree bases form the initial context.
+  int Sub = 0;
+
+  for (int i = 0; i < markov_degree; i++)
+    Sub = ALPHABET_SIZE * Sub + S[i];
+
+  double Positive_Sum = Cod_Positive_Table[no][markov_degree - 1][0][Sub];
+  double Negative_Sum = Cod_Negative_Table[no][markov_degree - 1][0][Sub];
+
+  //  Each later base is scored given the current context, then shifted
+  //  into it.
+  for (int i = markov_degree; i < CODING_LEN; i++) {
+    const int j = S[i];
+
+    Positive_Sum += Cod_Positive_Table[no][i][j][Sub];
+    Negative_Sum += Cod_Negative_Table[no][i][j][Sub];
+
+    Sub = ALPHABET_SIZE * (Sub % (markov_len / ALPHABET_SIZE)) + j;
+  }
+
+  *Return_Score = Positive_Sum - Negative_Sum;
+
+  return(1);
+}
+
diff --git a/libsim4/sim4core/sites_acceptor.C b/libsim4/sim4core/sites_acceptor.C
new file mode 100644
index 0000000..f1afc8a
--- /dev/null
+++ b/libsim4/sim4core/sites_acceptor.C
@@ -0,0 +1,2402 @@
+#include "sim4.H"
+
+/* DO NOT REMOVE or MODIFY !!!! */
+
+double acc[NUM_MODELS_ACC][NUM_VALUES_ACC] =
+{/*, acc[0]=..., */
+{-1.345152, 0.000100, 0.000100, 0.000100,
+-1.516403, 0.000100, 0.000100, 0.000100,
+-1.848330, 0.000100, 0.000100, 0.000100,
+-1.014731, 0.000100, 0.000100, 0.000100,
+-1.089822, -1.526594, -1.951685, -1.189365,
+-1.292020, -1.351816, -3.183077, -0.855532,
+-1.224284, -1.536010, -1.873073, -1.087249,
+-1.572873, -1.590312, -1.469376, -1.025490,
+-1.140688, -1.477160, -1.993420, -1.152370,
+-1.368373, -1.352513, -3.019521, -0.825388,
+-1.306903, -1.476802, -1.836329, -1.074190,
+-1.608595, -1.565977, -1.488318, -1.007323,
+-1.171335, -1.414913, -2.051619, -1.143902,
+-1.291423, -1.328795, -3.117248, -0.876961,
+-1.262329, -1.538477, -1.925760, -1.031328,
+-1.744068, -1.562269, -1.549265, -0.908500,
+-1.189067, -1.438308, -2.101403, -1.090999,
+-1.403285, -1.363658, -3.037634, -0.797309,
+-1.280513, -1.493606, -2.068970, -0.990939,
+-1.740625, -1.503680, -1.477796, -0.983128,
+-1.209863, -1.505547, -2.069425, -1.039568,
+-1.474199, -1.230376, -2.996159, -0.846586,
+-1.419817, -1.521397, -1.871802, -0.951946,
+-1.802024, -1.527524, -1.514342, -0.921278,
+-1.227731, -1.440212, -2.175919, -1.030966,
+-1.528092, -1.297167, -2.938102, -0.783554,
+-1.463417, -1.532827, -1.880613, -0.915953,
+-2.005152, -1.487168, -1.552308, -0.849591,
+-1.275353, -1.398643, -2.207587, -1.011233,
+-1.666028, -1.256129, -3.130884, -0.728629,
+-1.596293, -1.485792, -1.913030, -0.859442,
+-2.064598, -1.512069, -1.531959, -0.828817,
+-1.424035, -1.384118, -2.168237, -0.930553,
+-1.730066, -1.186225, -2.971241, -0.763321,
+-1.693668, -1.501296, -1.913030, -0.808149,
+-2.219761, -1.492997, -1.580384, -0.774855,
+-1.501896, -1.410925, -2.214846, -0.857539,
+-1.711810, -1.214889, -3.039708, -0.744716,
+-1.892247, -1.423981, -1.964220, -0.758756,
+-2.206199, -1.488013, -1.595464, -0.773823,
+-1.417809, -1.367333, -2.659731, -0.837010,
+-1.915395, -1.184710, -3.073095, -0.691941,
+-2.070218, -1.426285, -1.978510, -0.702456,
+-2.328366, -1.427370, -1.648702, -0.754370,
+-1.414676, -1.470765, -2.909702, -0.749180,
+-2.055749, -1.215381, -3.166519, -0.628754,
+-2.138912, -1.429557, -2.075599, -0.659092,
+-2.351480, -1.377296, -1.696583, -0.756719,
+-1.545985, -1.414307, -3.174114, -0.689209,
+-2.158969, -1.218231, -3.244292, -0.598188,
+-2.360033, -1.374866, -2.107753, -0.632619,
+-2.498192, -1.422895, -1.640368, -0.728065,
+-1.572480, -1.467562, -3.496505, -0.631713,
+-2.098699, -1.160430, -3.206880, -0.647123,
+-2.223828, -1.477001, -2.209229, -0.591134,
+-2.623254, -1.423354, -1.709677, -0.682000,
+-1.654411, -1.480458, -3.908202, -0.577704,
+-2.366086, -1.163774, -3.307587, -0.584754,
+-2.532024, -1.372169, -2.332311, -0.562339,
+-2.568229, -1.418198, -1.795426, -0.663351,
+-1.770572, -1.487618, -4.441573, -0.524119,
+-2.399435, -1.207826, -3.559201, -0.541417,
+-2.614654, -1.537096, -2.121484, -0.524335,
+-2.722673, -1.399220, -1.826412, -0.641463,
+-1.749200, -1.357721, -4.321109, -0.587787,
+-2.401038, -1.181602, -3.566789, -0.554529,
+-2.232051, -1.496704, -2.198150, -0.583725,
+-2.594245, -1.367348, -1.836785, -0.671019,
+-1.672136, -1.365517, -5.421616, -0.593327,
+-2.215511, -1.177523, -3.662429, -0.584847,
+-2.479396, -1.445079, -2.178072, -0.567010,
+-2.504500, -1.284897, -1.664075, -0.793545,
+-1.789737, -1.323529, -4.816229, -0.582135,
+-2.029729, -1.142265, -3.478014, -0.656498,
+-2.493205, -1.514879, -2.286191, -0.517737,
+-2.438169, -1.180850, -1.805005, -0.818284,
+-1.571802, -1.290899, -5.028555, -0.671860,
+-2.062680, -1.030025, -3.701808, -0.710883,
+-2.230398, -1.303450, -2.186723, -0.676020,
+-2.509336, -1.168315, -1.880561, -0.786836,
+-1.803530, -1.374661, -4.448059, -0.560967,
+-2.353114, -0.874837, -3.901340, -0.759751,
+-2.401061, -1.480130, -2.301531, -0.541861,
+-2.902595, -1.173849, -2.203739, -0.643300,
+-1.440629, -1.268511, -4.158876, -0.762827,
+-2.306540, -0.910387, -3.742844, -0.745832,
+-1.924677, -1.367867, -2.315543, -0.691718,
+-2.796046, -1.628159, -2.518702, -0.412347,
+-0.709182, -1.565863, -2.505260, -1.526056,
+-1.065038, -1.104807, -2.495496, -1.420659,
+-1.176059, -1.723113, -1.148356, -1.630454,
+-1.817762, -1.496771, -1.271445, -1.098612,
+-2.519205, -0.344118, -5.635175, -1.574743,
+-2.977455, -0.464725, -6.435324, -1.142044,
+-3.092476, -0.304870, -5.161439, -1.552784,
+-3.035229, -0.562993, -5.197109, -0.975771,
+-0.000001, -15.545396, -15.545396, -15.545396,
+0.000000, -17.996645, -17.996645, -17.996645,
+-0.000007, -12.923923, -12.923923, -12.923923,
+0.000000, -17.165766, -17.165766, -17.165766,
+-18.420681, -18.420681, 0.000000, -18.420681,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.335601, -1.971125, -0.730849, -2.152442,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.384016, -1.765491, -1.670181, -0.941316,
+-1.355893, -1.392776, -2.624094, -0.864190,
+-1.345398, -1.730732, -1.729560, -0.954391,
+-2.123424, -1.682619, -0.996441, -1.123004,
+-1.065167, -1.683004, -1.656570, -1.277539,
+-1.150894, -1.393289, -2.468421, -1.047909,
+-1.235876, -1.477038, -1.500624, -1.354346,
+-1.750241, -1.494114, -1.103043, -1.309478,
+-1.182994, 0.000100, 0.000100, 0.000100,
+-1.609738, 0.000100, 0.000100, 0.000100,
+-1.565230, 0.000100, 0.000100, 0.000100,
+-1.256460, 0.000100, 0.000100, 0.000100,
+-1.125408, -1.759008, -1.359626, -1.400363,
+-0.989906, -1.446169, -2.882103, -1.087966,
+-1.195905, -1.576685, -1.415023, -1.394364,
+-1.479064, -1.637438, -1.351836, -1.142856,
+-1.101998, -1.756600, -1.417116, -1.375354,
+-1.013341, -1.409959, -2.810391, -1.100630,
+-1.191760, -1.602000, -1.376543, -1.417288,
+-1.488197, -1.623192, -1.353697, -1.143602,
+-1.107866, -1.754748, -1.411458, -1.374372,
+-1.009154, -1.441162, -2.820299, -1.081071,
+-1.186623, -1.611036, -1.379088, -1.413620,
+-1.470225, -1.619304, -1.388541, -1.131185,
+-1.099338, -1.791891, -1.394642, -1.376970,
+-0.992346, -1.443399, -2.900394, -1.084211,
+-1.176422, -1.587610, -1.414169, -1.410264,
+-1.467101, -1.614450, -1.374834, -1.147198,
+-1.127134, -1.762249, -1.353614, -1.402113,
+-0.993282, -1.411392, -2.980173, -1.093466,
+-1.191545, -1.588426, -1.380787, -1.424533,
+-1.487080, -1.626410, -1.341038, -1.152773,
+-1.082979, -1.785398, -1.407119, -1.390959,
+-0.975375, -1.470581, -2.791575, -1.103124,
+-1.156639, -1.669129, -1.382059, -1.402479,
+-1.472611, -1.612913, -1.404009, -1.121602,
+-1.108330, -1.739751, -1.381791, -1.413880,
+-0.999430, -1.447772, -2.865772, -1.079168,
+-1.196625, -1.583179, -1.386867, -1.416303,
+-1.479805, -1.620810, -1.377331, -1.132184,
+-1.093596, -1.760044, -1.405332, -1.395668,
+-1.016169, -1.421357, -2.823545, -1.086962,
+-1.170519, -1.563237, -1.402911, -1.450483,
+-1.476513, -1.626313, -1.360955, -1.144138,
+-1.115420, -1.786197, -1.388857, -1.365449,
+-1.003248, -1.435264, -2.831224, -1.089692,
+-1.245460, -1.567123, -1.366905, -1.391700,
+-1.509856, -1.617887, -1.350735, -1.134163,
+-1.094713, -1.788263, -1.394818, -1.385352,
+-0.978021, -1.424792, -2.860294, -1.120444,
+-1.157419, -1.625471, -1.416799, -1.400817,
+-1.486921, -1.590718, -1.382908, -1.141608,
+-1.119936, -1.746875, -1.372134, -1.403171,
+-0.989115, -1.409298, -2.807191, -1.128850,
+-1.192997, -1.592504, -1.378028, -1.422136,
+-1.513578, -1.623527, -1.331310, -1.143988,
+-1.079942, -1.778989, -1.407630, -1.398960,
+-0.979436, -1.446844, -2.832715, -1.107715,
+-1.157895, -1.650956, -1.369995, -1.427621,
+-1.455552, -1.632335, -1.374406, -1.144868,
+-1.133511, -1.771919, -1.336068, -1.405619,
+-1.010087, -1.407684, -2.901113, -1.090224,
+-1.196883, -1.583153, -1.356886, -1.447840,
+-1.490490, -1.628256, -1.352837, -1.139558,
+-1.093003, -1.771083, -1.409187, -1.385090,
+-0.976427, -1.416014, -2.877815, -1.125752,
+-1.148944, -1.581737, -1.421165, -1.444014,
+-1.496056, -1.624307, -1.361140, -1.131431,
+-1.112600, -1.790462, -1.360535, -1.394768,
+-1.015007, -1.423373, -2.828524, -1.085894,
+-1.182168, -1.578802, -1.408195, -1.416422,
+-1.481590, -1.641107, -1.348956, -1.141163,
+-1.098083, -1.767501, -1.372948, -1.417281,
+-0.971022, -1.461993, -2.833112, -1.106542,
+-1.140910, -1.607953, -1.400981, -1.453207,
+-1.467110, -1.634745, -1.334187, -1.167851,
+-1.135532, -1.768113, -1.320708, -1.422344,
+-0.994936, -1.451187, -2.793616, -1.094337,
+-1.178470, -1.664466, -1.323399, -1.440834,
+-1.461162, -1.677032, -1.315166, -1.162589,
+-1.079590, -1.764612, -1.391389, -1.425940,
+-0.957182, -1.451390, -2.859752, -1.125472,
+-1.110050, -1.684761, -1.382335, -1.452533,
+-1.458989, -1.660278, -1.336684, -1.156050,
+-1.113595, -1.796199, -1.317462, -1.435984,
+-0.982847, -1.447128, -2.786660, -1.112049,
+-1.155749, -1.650657, -1.341304, -1.462095,
+-1.480705, -1.615331, -1.352770, -1.154628,
+-1.072712, -1.752780, -1.401619, -1.433676,
+-0.982481, -1.385882, -2.812998, -1.153625,
+-1.126271, -1.646859, -1.382823, -1.459998,
+-1.464692, -1.599751, -1.375744, -1.157558,
+-1.163584, -1.710982, -1.311768, -1.437152,
+-1.042536, -1.434227, -2.937676, -1.032397,
+-1.204850, -1.548493, -1.379367, -1.444105,
+-1.479569, -1.601626, -1.391807, -1.132987,
+-1.049473, -1.928282, -1.260908, -1.509215,
+-0.905215, -1.496922, -2.818569, -1.164637,
+-1.076601, -1.832951, -1.279956, -1.508427,
+-1.381168, -1.719323, -1.269913, -1.242466,
+-1.104792, -1.399977, -1.367211, -1.787962,
+-1.091861, -0.939227, -2.731868, -1.568386,
+-1.217584, -1.262440, -1.355713, -1.811986,
+-1.512844, -1.240443, -1.308673, -1.512844,
+0.000000, -18.809204, -18.809204, -18.809204,
+0.000000, -18.796923, -18.796923, -18.796923,
+0.000000, -18.544844, -18.544844, -18.544844,
+0.000000, -18.354755, -18.354755, -18.354755,
+-20.030119, -20.030119, 0.000000, -20.030119,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.126271, -1.600280, -1.440370, -1.439358,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.037596, -1.784752, -1.368686, -1.498725,
+-0.969139, -1.430044, -2.783348, -1.141120,
+-1.132712, -1.548982, -1.416992, -1.500874,
+-1.447700, -1.632824, -1.309515, -1.205436,
+-1.040898, -1.799518, -1.386480, -1.463045,
+-0.963590, -1.488005, -2.767785, -1.109100,
+-1.055648, -1.635182, -1.463108, -1.488989,
+-1.426120, -1.636021, -1.327101, -1.204766},
+/*, acc[1][]=NULL */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, acc[2][]=..., */
+{-1.291405, 0.000100, 0.000100, 0.000100,
+-1.700561, 0.000100, 0.000100, 0.000100,
+-1.959603, 0.000100, 0.000100, 0.000100,
+-0.912248, 0.000100, 0.000100, 0.000100,
+-0.967584, -1.680312, -2.212070, -1.126362,
+-1.169191, -1.470572, -3.674711, -0.834178,
+-1.177629, -1.613553, -2.199279, -0.962518,
+-1.518591, -1.754980, -1.505519, -0.951485,
+-1.086813, -1.645696, -2.148010, -1.040960,
+-1.308940, -1.447298, -3.546280, -0.763876,
+-1.265289, -1.678134, -2.003556, -0.925671,
+-1.571073, -1.688395, -1.571073, -0.917442,
+-1.062553, -1.610518, -2.060189, -1.117141,
+-1.200552, -1.519616, -3.388064, -0.806535,
+-1.246441, -1.752376, -2.031960, -0.896406,
+-1.729768, -1.709483, -1.677860, -0.787586,
+-1.081087, -1.609438, -2.275335, -1.027192,
+-1.393842, -1.412191, -3.132488, -0.766446,
+-1.218572, -1.546259, -2.442345, -0.905480,
+-1.741168, -1.656826, -1.585875, -0.845896,
+-1.171108, -1.561647, -2.335763, -0.958547,
+-1.469615, -1.416190, -3.273641, -0.714410,
+-1.338974, -1.687280, -2.075045, -0.850239,
+-1.736700, -1.656657, -1.613640, -0.834798,
+-1.065460, -1.616957, -2.208524, -1.058240,
+-1.521840, -1.435366, -3.131275, -0.693147,
+-1.377484, -1.719233, -2.019337, -0.830479,
+-1.951956, -1.624963, -1.695415, -0.739061,
+-1.213263, -1.499465, -2.439708, -0.935631,
+-1.836858, -1.396302, -3.223149, -0.591778,
+-1.697051, -1.530925, -2.164647, -0.722265,
+-1.980191, -1.595281, -1.668161, -0.753934,
+-1.344525, -1.560121, -2.407418, -0.822850,
+-1.690918, -1.352393, -3.258094, -0.656685,
+-1.775868, -1.569074, -2.090948, -0.695439,
+-2.166453, -1.687134, -1.664220, -0.671328,
+-1.353219, -1.583235, -2.623252, -0.768482,
+-1.597032, -1.387682, -3.284799, -0.672556,
+-1.932635, -1.563728, -1.963887, -0.682073,
+-2.108326, -1.633550, -1.694919, -0.693729,
+-1.354988, -1.494407, -2.872856, -0.774074,
+-1.982379, -1.451752, -3.032200, -0.544917,
+-1.912903, -1.659907, -2.113574, -0.613621,
+-2.206587, -1.559456, -1.820713, -0.658237,
+-1.311331, -1.581877, -3.220870, -0.723545,
+-1.985723, -1.342838, -3.614959, -0.553912,
+-2.117759, -1.489152, -2.182298, -0.613684,
+-2.384744, -1.450649, -1.857291, -0.658988,
+-1.431551, -1.647774, -3.169239, -0.641412,
+-2.124789, -1.416297, -3.521443, -0.496965,
+-2.297163, -1.551374, -2.297163, -0.532805,
+-2.338303, -1.537525, -1.808344, -0.644984,
+-1.592830, -1.546845, -3.956028, -0.571650,
+-2.020222, -1.264100, -3.489894, -0.589911,
+-2.273973, -1.633471, -2.354016, -0.499458,
+-2.534957, -1.550822, -1.906790, -0.579631,
+-1.596192, -1.557478, -4.317469, -0.556289,
+-2.137505, -1.376090, -3.440415, -0.515109,
+-2.525726, -1.581267, -2.766887, -0.428588,
+-2.488501, -1.516974, -1.992601, -0.577588,
+-1.837481, -1.753398, -4.088758, -0.429495,
+-2.259703, -1.326276, -3.820934, -0.497174,
+-2.328062, -1.719000, -2.178531, -0.494194,
+-2.605688, -1.516126, -1.960993, -0.569394,
+-1.751568, -1.454837, -3.812979, -0.560405,
+-2.209058, -1.336571, -4.154961, -0.491408,
+-2.220963, -1.488077, -2.887439, -0.494345,
+-2.606632, -1.494775, -1.950841, -0.580240,
+-1.510231, -1.643762, -5.093701, -0.545151,
+-2.161962, -1.342704, -4.055498, -0.500157,
+-2.101079, -1.838716, -2.126397, -0.511846,
+-2.416615, -1.354232, -1.868299, -0.696662,
+-1.887591, -1.303644, -4.156260, -0.577398,
+-1.930570, -1.270760, -3.961997, -0.588263,
+-2.256815, -1.660296, -2.598563, -0.460682,
+-2.374103, -1.365642, -1.836065, -0.708785,
+-1.449473, -1.516914, -5.278066, -0.614676,
+-2.016032, -1.166316, -3.896340, -0.625510,
+-2.088124, -1.558865, -2.185762, -0.591830,
+-2.314120, -1.300012, -1.932888, -0.725901,
+-1.642227, -1.467874, -4.686726, -0.567714,
+-2.094544, -1.132366, -4.066882, -0.620877,
+-2.315006, -1.424035, -2.379544, -0.565808,
+-2.850366, -1.312024, -2.177022, -0.580669,
+-1.559647, -1.559647, -4.016367, -0.577035,
+-2.503954, -1.104922, -3.725166, -0.574666,
+-2.154163, -1.514127, -2.688243, -0.517516,
+-2.739277, -1.573107, -2.805969, -0.404163,
+-0.794456, -1.596236, -3.309207, -1.174511,
+-1.366235, -1.002111, -2.800384, -1.148715,
+-1.326396, -1.653608, -1.591088, -1.080264,
+-1.843199, -1.407881, -1.554784, -0.952452,
+-1.285303, -15.808850, -4.401274, -0.340842,
+-1.987766, -16.042294, -5.445635, -0.152355,
+-1.756041, -15.520259, -3.825004, -0.216349,
+-2.192417, -16.337231, -4.354296, -0.132958,
+-0.000001, -15.545396, -15.545396, -15.545396,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000007, -12.923923, -12.923923, -12.923923,
+0.000000, -17.165766, -17.165766, -17.165766,
+-17.358208, -17.358208, -0.000000, -17.358208,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.369082, -2.109996, -0.673095, -2.168981,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.298146, -1.822958, -1.974764, -0.851859,
+-1.422750, -1.310483, -2.670573, -0.867387,
+-1.219022, -1.825798, -1.941383, -0.916574,
+-2.265312, -1.674820, -1.142604, -0.941934,
+-1.012709, -1.623939, -1.822116, -1.280270,
+-1.110097, -1.455520, -2.486538, -1.038468,
+-1.173011, -1.572396, -1.536678, -1.317050,
+-1.656633, -1.537612, -1.155520, -1.274994,
+-1.168267, 0.000100, 0.000100, 0.000100,
+-1.631756, 0.000100, 0.000100, 0.000100,
+-1.573868, 0.000100, 0.000100, 0.000100,
+-1.250832, 0.000100, 0.000100, 0.000100,
+-1.111770, -1.773410, -1.365808, -1.402031,
+-0.973449, -1.482240, -2.922036, -1.075082,
+-1.161273, -1.627716, -1.417134, -1.393813,
+-1.445909, -1.669680, -1.354373, -1.145541,
+-1.075371, -1.771473, -1.449553, -1.370140,
+-0.999244, -1.427754, -2.860510, -1.094354,
+-1.185072, -1.635087, -1.382345, -1.392911,
+-1.482870, -1.641964, -1.374121, -1.119775,
+-1.079347, -1.812344, -1.424155, -1.361797,
+-1.017537, -1.450534, -2.866831, -1.057877,
+-1.168096, -1.626979, -1.394279, -1.408510,
+-1.444919, -1.660942, -1.419156, -1.101547,
+-1.087674, -1.815971, -1.415212, -1.356943,
+-0.970003, -1.459837, -2.943851, -1.090700,
+-1.154502, -1.616565, -1.421600, -1.406993,
+-1.446631, -1.631406, -1.389858, -1.139831,
+-1.113159, -1.795581, -1.369542, -1.381402,
+-0.990275, -1.436488, -2.993251, -1.076929,
+-1.183282, -1.610932, -1.389830, -1.406868,
+-1.452787, -1.683915, -1.354399, -1.132166,
+-1.044126, -1.802180, -1.432127, -1.409516,
+-0.970040, -1.499470, -2.851557, -1.079017,
+-1.137859, -1.667996, -1.409281, -1.400182,
+-1.456283, -1.627055, -1.435045, -1.101757,
+-1.073756, -1.765077, -1.402217, -1.422738,
+-0.985188, -1.478821, -2.870988, -1.072705,
+-1.172032, -1.608887, -1.414934, -1.397512,
+-1.443257, -1.630396, -1.377141, -1.152989,
+-1.078649, -1.802718, -1.432200, -1.361358,
+-1.010697, -1.421944, -2.911135, -1.077704,
+-1.147924, -1.603707, -1.401648, -1.446618,
+-1.451591, -1.633146, -1.398936, -1.128157,
+-1.102901, -1.824149, -1.402265, -1.344388,
+-0.994649, -1.447135, -2.884916, -1.081625,
+-1.208947, -1.618078, -1.375294, -1.384521,
+-1.471655, -1.654233, -1.370798, -1.122971,
+-1.074595, -1.824419, -1.411667, -1.372329,
+-0.967159, -1.435231, -2.854151, -1.126477,
+-1.115361, -1.678326, -1.427803, -1.403719,
+-1.469558, -1.636727, -1.378605, -1.128781,
+-1.091165, -1.796996, -1.391712, -1.387376,
+-0.969186, -1.438723, -2.862422, -1.120099,
+-1.181555, -1.652788, -1.343196, -1.424314,
+-1.488599, -1.680414, -1.352228, -1.110630,
+-1.056168, -1.798131, -1.428698, -1.398427,
+-0.987511, -1.448104, -2.857329, -1.093446,
+-1.121318, -1.706241, -1.363458, -1.440419,
+-1.443511, -1.654574, -1.390857, -1.127422,
+-1.100401, -1.813071, -1.355328, -1.401419,
+-0.986103, -1.439171, -2.909912, -1.092528,
+-1.172422, -1.609037, -1.365252, -1.448224,
+-1.449685, -1.688651, -1.353488, -1.132428,
+-1.074954, -1.801715, -1.418167, -1.380259,
+-0.962150, -1.469246, -2.912321, -1.098169,
+-1.111198, -1.657777, -1.414539, -1.438701,
+-1.477395, -1.662611, -1.376644, -1.109569,
+-1.094225, -1.846724, -1.378073, -1.364987,
+-0.978258, -1.476389, -2.828175, -1.089298,
+-1.153759, -1.627530, -1.406723, -1.413903,
+-1.448350, -1.696252, -1.352353, -1.129973,
+-1.073884, -1.808097, -1.371115, -1.424904,
+-0.950051, -1.464272, -2.848606, -1.126679,
+-1.100838, -1.642816, -1.418457, -1.461510,
+-1.443026, -1.673257, -1.327045, -1.168306,
+-1.110764, -1.808777, -1.322644, -1.425416,
+-0.962499, -1.501774, -2.804179, -1.094307,
+-1.158218, -1.711567, -1.318364, -1.436328,
+-1.432805, -1.716330, -1.335186, -1.144046,
+-1.065682, -1.805462, -1.395958, -1.412585,
+-0.951203, -1.458707, -2.975128, -1.108219,
+-1.079623, -1.707178, -1.404003, -1.455483,
+-1.433065, -1.684137, -1.377037, -1.128576,
+-1.110801, -1.830148, -1.312776, -1.421950,
+-0.975952, -1.456606, -2.835787, -1.104223,
+-1.152923, -1.698855, -1.316655, -1.455154,
+-1.475301, -1.645854, -1.360270, -1.133668,
+-1.034556, -1.830539, -1.399241, -1.437568,
+-0.991000, -1.386595, -2.834266, -1.139096,
+-1.118023, -1.705123, -1.348929, -1.461853,
+-1.435975, -1.648962, -1.390248, -1.136769,
+-1.121783, -1.757168, -1.324235, -1.444931,
+-1.010234, -1.441611, -2.916736, -1.063603,
+-1.158253, -1.598600, -1.373899, -1.466775,
+-1.430681, -1.670595, -1.377608, -1.137755,
+-1.002533, -2.027449, -1.274953, -1.505366,
+-0.829737, -1.672629, -2.807267, -1.152920,
+-0.998731, -2.009633, -1.294611, -1.497846,
+-1.332030, -1.862735, -1.236302, -1.236650,
+-0.821629, -18.641743, -1.084048, -1.504799,
+-0.596039, -17.786745, -2.236046, -1.072564,
+-0.884956, -18.252380, -1.023085, -1.479359,
+-1.171400, -18.337517, -0.967229, -1.171400,
+0.000000, -18.809204, -18.809204, -18.809204,
+-1.386294, -1.386294, -1.386294, -1.386294,
+0.000000, -18.544844, -18.544844, -18.544844,
+0.000000, -18.354755, -18.354755, -18.354755,
+-19.685711, -19.685711, 0.000000, -19.685711,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.096132, -1.643951, -1.448112, -1.437124,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.015896, -1.790409, -1.394095, -1.500705,
+-0.954781, -1.454286, -2.837458, -1.130220,
+-1.087743, -1.586525, -1.420871, -1.528389,
+-1.428880, -1.643753, -1.341125, -1.185418,
+-0.999478, -1.818946, -1.409813, -1.489245,
+-0.948067, -1.487705, -2.823988, -1.117026,
+-1.013357, -1.654263, -1.478194, -1.524548,
+-1.410640, -1.659420, -1.332923, -1.197135},
+/*, acc[3][]=acc[4][]=acc[5][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, acc[6][]=..., */
+{-1.461211, 0.000100, 0.000100, 0.000100,
+-1.349827, 0.000100, 0.000100, 0.000100,
+-1.795269, 0.000100, 0.000100, 0.000100,
+-1.070967, 0.000100, 0.000100, 0.000100,
+-1.083620, -1.371302, -1.951970, -1.324782,
+-1.278774, -1.349155, -2.969060, -0.889623,
+-1.219565, -1.390523, -1.661397, -1.324926,
+-1.659763, -1.473864, -1.356081, -1.129768,
+-1.309922, -1.402295, -1.874451, -1.106581,
+-1.328187, -1.265666, -3.027568, -0.904960,
+-1.425009, -1.357186, -1.639418, -1.177601,
+-1.763950, -1.397200, -1.469959, -1.045802,
+-1.177146, -1.359467, -1.995455, -1.206999,
+-1.428025, -1.177974, -3.266299, -0.881482,
+-1.553749, -1.250563, -1.711377, -1.134491,
+-1.849152, -1.465010, -1.431108, -0.987492,
+-1.215607, -1.340061, -1.887700, -1.237113,
+-1.506719, -1.228006, -2.679437, -0.874942,
+-1.386294, -1.501807, -1.897119, -0.974788,
+-1.945910, -1.245937, -1.346020, -1.173775,
+-1.406097, -1.475090, -1.772341, -1.032238,
+-1.571697, -1.121960, -3.075770, -0.866281,
+-1.588818, -1.473306, -1.673976, -0.969780,
+-1.947948, -1.542484, -1.336632, -0.965336,
+-1.427116, -1.382665, -1.979184, -0.991799,
+-1.658228, -1.195605, -2.443747, -0.867101,
+-1.585145, -1.354034, -1.941819, -0.932821,
+-2.077627, -1.491453, -1.391754, -0.913572,
+-1.490338, -1.229325, -2.036880, -1.044754,
+-1.677646, -1.143564, -3.124560, -0.797288,
+-1.656155, -1.459445, -2.019060, -0.811969,
+-2.026392, -1.531696, -1.313443, -0.959386,
+-1.497998, -1.394458, -2.138034, -0.890212,
+-1.885444, -1.050985, -3.020420, -0.798809,
+-1.791759, -1.466337, -2.021333, -0.754842,
+-2.447550, -1.355628, -1.547389, -0.814397,
+-2.207272, -1.123931, -1.931020, -0.867501,
+-1.799422, -1.113997, -2.882764, -0.797660,
+-2.125249, -1.192431, -2.212260, -0.760011,
+-2.260815, -1.473737, -1.381566, -0.878317,
+-1.565231, -1.369487, -2.611195, -0.769432,
+-1.771956, -1.093625, -3.047021, -0.804079,
+-2.045538, -1.287854, -2.079440, -0.755390,
+-2.401136, -1.314501, -1.558954, -0.842992,
+-1.563394, -1.301030, -2.613211, -0.809623,
+-2.185072, -1.086462, -3.180497, -0.676178,
+-2.074218, -1.423632, -2.033396, -0.687926,
+-2.503953, -1.218757, -1.611011, -0.860487,
+-1.686398, -1.280934, -3.295823, -0.693148,
+-2.360852, -1.262242, -3.159357, -0.544402,
+-2.644531, -1.297463, -2.265044, -0.594367,
+-2.878285, -1.426610, -1.677020, -0.660260,
+-1.696448, -1.491655, -2.708041, -0.644359,
+-2.114915, -1.085297, -2.925842, -0.717572,
+-2.547033, -1.556639, -2.167546, -0.516869,
+-2.772587, -1.218840, -1.606837, -0.817790,
+-2.143976, -1.067842, -4.158836, -0.647339,
+-2.809400, -1.086636, -3.463322, -0.560219,
+-2.595250, -1.371479, -2.212260, -0.575918,
+-2.812921, -1.193776, -1.628655, -0.819385,
+-1.619908, -1.428854, -4.564252, -0.594059,
+-2.427747, -1.060872, -3.200933, -0.644573,
+-2.369071, -1.530745, -1.922786, -0.609065,
+-2.726320, -1.196572, -1.678630, -0.808149,
+-1.851350, -1.212272, -4.102597, -0.636909,
+-2.480264, -1.046506, -3.019259, -0.661108,
+-2.542721, -1.397594, -1.814487, -0.670925,
+-2.437344, -1.201112, -1.769095, -0.818120,
+-1.923093, -1.083346, -14.077878, -0.662843,
+-2.425482, -1.154571, -4.143121, -0.543868,
+-2.743762, -1.291516, -2.576709, -0.536495,
+-2.840538, -1.226114, -1.701105, -0.764228,
+-1.746907, -1.225613, -13.901692, -0.630907,
+-1.907224, -0.954121, -3.427044, -0.835000,
+-2.779501, -1.365816, -1.980999, -0.607287,
+-2.583996, -0.996221, -1.789067, -0.946389,
+-1.819157, -1.026921, -4.304018, -0.763107,
+-2.340626, -0.980175, -3.667492, -0.687271,
+-2.354541, -1.130770, -2.290003, -0.731863,
+-3.015532, -1.005086, -1.997891, -0.799961,
+-1.033016, -0.439953, -13.981029, -13.981029,
+-2.003408, -0.199815, -3.073847, -15.462245,
+-1.691675, -0.487705, -1.600703, -13.946543,
+-2.296685, -0.398096, -1.479240, -15.726534,
+-14.513647, -14.513647, -14.513647, -0.000002,
+-16.142788, -16.142788, -16.142788, -0.000000,
+-14.513647, -14.513647, -14.513647, -0.000002,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.816592, -1.562886, -0.878135, -1.549597,
+-14.657080, -0.000001, -14.657080, -14.657080,
+-14.910785, -0.000001, -14.910785, -14.910785,
+-15.595535, -0.000001, -15.595535, -15.595535,
+-14.924074, -0.000001, -14.924074, -14.924074,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.473671, -16.473671, -16.473671,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.473671, -16.473671, -0.000000, -16.473671,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.501278, -1.725995, -0.717980, -2.194425,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.527945, -1.736699, -1.273415, -1.117661,
+-1.442990, -1.347680, -2.446289, -0.873896,
+-1.470176, -1.774665, -1.610876, -0.914138,
+-2.583992, -1.810806, -0.849398, -1.098613,
+-1.034282, -1.775284, -1.376377, -1.502417,
+-1.226788, -1.320817, -2.538971, -1.019149,
+-1.273415, -1.343211, -1.513556, -1.431318,
+-1.951460, -1.541676, -0.986998, -1.304833,
+-1.262810, 0.000100, 0.000100, 0.000100,
+-1.579093, 0.000100, 0.000100, 0.000100,
+-1.512966, 0.000100, 0.000100, 0.000100,
+-1.235380, 0.000100, 0.000100, 0.000100,
+-1.136397, -1.811655, -1.413307, -1.300829,
+-0.980300, -1.453699, -2.860610, -1.097024,
+-1.386294, -1.416508, -1.422662, -1.322873,
+-1.525155, -1.540821, -1.389682, -1.142398,
+-1.212399, -1.736128, -1.343735, -1.326343,
+-1.140020, -1.315535, -2.907679, -1.029308,
+-1.290840, -1.460525, -1.467397, -1.338186,
+-1.487132, -1.552647, -1.346311, -1.197014,
+-1.294549, -1.618973, -1.389859, -1.277382,
+-1.009345, -1.423108, -2.606460, -1.136990,
+-1.184495, -1.562424, -1.450419, -1.386294,
+-1.489189, -1.544899, -1.399717, -1.156948,
+-1.081576, -1.859280, -1.401298, -1.351401,
+-0.998311, -1.446155, -2.986597, -1.062572,
+-1.228534, -1.573692, -1.349023, -1.425009,
+-1.432174, -1.606080, -1.377366, -1.176695,
+-1.220146, -1.728559, -1.242125, -1.433375,
+-0.954512, -1.399717, -2.995730, -1.144656,
+-1.338776, -1.551495, -1.269384, -1.406914,
+-1.654821, -1.422199, -1.332048, -1.191536,
+-1.148531, -1.815799, -1.478285, -1.230448,
+-0.976301, -1.483119, -2.665111, -1.118476,
+-1.186775, -1.714642, -1.454685, -1.268355,
+-1.520338, -1.570769, -1.411704, -1.109423,
+-1.232768, -1.684218, -1.444738, -1.247862,
+-1.055416, -1.458445, -2.794905, -1.026568,
+-1.189819, -1.542640, -1.373222, -1.475701,
+-1.525630, -1.581060, -1.473112, -1.056853,
+-1.112648, -1.689190, -1.346448, -1.485095,
+-1.065823, -1.408767, -2.708048, -1.065823,
+-1.337581, -1.394378, -1.414051, -1.400893,
+-1.541510, -1.680029, -1.261319, -1.151083,
+-1.152101, -1.637609, -1.400281, -1.414465,
+-0.947136, -1.597054, -2.751014, -1.061911,
+-1.284748, -1.515859, -1.483070, -1.284748,
+-1.728817, -1.600200, -1.257997, -1.089374,
+-1.091692, -1.777966, -1.419238, -1.372501,
+-1.090606, -1.314946, -2.936430, -1.071648,
+-1.312623, -1.479161, -1.426862, -1.335613,
+-1.453251, -1.425471, -1.511238, -1.186922,
+-1.112218, -1.590475, -1.465962, -1.441864,
+-1.063977, -1.336177, -2.446024, -1.186024,
+-1.144656, -1.469676, -1.537117, -1.441103,
+-1.545750, -1.524697, -1.309921, -1.206243,
+-1.118680, -1.805093, -1.419968, -1.320102,
+-0.983732, -1.430746, -2.925771, -1.098612,
+-1.243299, -1.416868, -1.403881, -1.498546,
+-1.446312, -1.608225, -1.461897, -1.101043,
+-1.255455, -1.664701, -1.346247, -1.325796,
+-1.106555, -1.223143, -3.081591, -1.111351,
+-1.300391, -1.473004, -1.396311, -1.382978,
+-1.583838, -1.567489, -1.380673, -1.094113,
+-1.133355, -1.692024, -1.376586, -1.421037,
+-0.980343, -1.266729, -2.791747, -1.266729,
+-1.307089, -1.336246, -1.442264, -1.468932,
+-1.512418, -1.590787, -1.260545, -1.229774,
+-1.146732, -1.720532, -1.397543, -1.361984,
+-1.074515, -1.321039, -2.857904, -1.095747,
+-1.256575, -1.357785, -1.450731, -1.497251,
+-1.606950, -1.512205, -1.405151, -1.097368,
+-1.211665, -1.650499, -1.402517, -1.330414,
+-0.963637, -1.549153, -2.826012, -1.058947,
+-1.354137, -1.419520, -1.379780, -1.392852,
+-1.498392, -1.567021, -1.342785, -1.181796,
+-1.200558, -1.670561, -1.338428, -1.392012,
+-1.170933, -1.394076, -2.755677, -0.972082,
+-1.211314, -1.567989, -1.414913, -1.383165,
+-1.550597, -1.729749, -1.119021, -1.258955,
+-1.043235, -1.706529, -1.394633, -1.522031,
+-0.878070, -1.525407, -2.812327, -1.181500,
+-1.213784, -1.506172, -1.338634, -1.518830,
+-1.436686, -1.632560, -1.289427, -1.232960,
+-1.040770, -1.705261, -1.543873, -1.380153,
+-0.999919, -1.512214, -2.533047, -1.101859,
+-1.121085, -1.639515, -1.373049, -1.484275,
+-1.500898, -1.518189, -1.478298, -1.108856,
+-1.044218, -1.003867, -1.267362, -16.083504,
+-0.784893, -0.743075, -2.685576, -15.633588,
+-1.156255, -0.863352, -1.333355, -15.618870,
+-1.237794, -0.957137, -1.120901, -15.969596,
+-16.178249, -16.178249, -16.178249, -0.000000,
+-16.328358, -16.328358, -16.328358, -0.000000,
+-15.837059, -15.837059, -15.837059, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.534020, -1.470717, -1.241033, -1.326190,
+-15.699546, -0.000001, -15.699546, -15.699546,
+-15.762849, -0.000001, -15.762849, -15.762849,
+-15.992533, -0.000000, -15.992533, -15.992533,
+-15.907375, -0.000000, -15.907375, -15.907375,
+-1.386294, -1.386294, -1.386294, -1.386294,
+0.000000, -17.233564, -17.233564, -17.233564,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-17.233564, -17.233564, -0.000000, -17.233564,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.206489, -1.518937, -1.495267, -1.356268,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.002605, -1.826303, -1.331939, -1.569712,
+-1.061871, -1.181278, -3.070298, -1.200983,
+-1.204948, -1.558145, -1.380463, -1.434208,
+-1.378690, -1.604362, -1.291678, -1.300981,
+-1.098612, -1.788668, -1.403994, -1.370933,
+-1.049386, -1.541465, -2.494122, -1.040690,
+-1.266493, -1.680055, -1.423336, -1.234745,
+-1.600364, -1.479004, -1.391516, -1.134402},
+/*, acc[7][]=..., */
+{-1.247372, 0.000100, 0.000100, 0.000100,
+-1.685627, 0.000100, 0.000100, 0.000100,
+-1.828728, 0.000100, 0.000100, 0.000100,
+-1.002951, 0.000100, 0.000100, 0.000100,
+-1.148209, -1.562185, -1.893541, -1.131402,
+-1.217959, -1.473306, -3.871183, -0.789292,
+-1.163151, -1.576338, -2.079440, -1.033474,
+-1.481184, -1.720663, -1.462834, -1.015821,
+-1.079460, -1.522196, -1.979620, -1.191164,
+-1.145680, -1.521469, -2.999565, -0.882390,
+-1.156070, -1.672285, -2.105148, -0.979139,
+-1.435927, -1.845826, -1.518924, -0.953829,
+-1.138779, -1.598994, -2.158608, -1.015546,
+-1.094589, -1.524444, -2.680211, -0.970134,
+-1.209223, -1.614687, -2.256538, -0.921541,
+-1.558144, -1.652455, -1.578347, -0.937568,
+-1.316933, -1.348349, -2.084981, -1.055363,
+-1.083345, -1.897119, -3.363447, -0.740401,
+-1.316185, -1.690877, -2.142861, -0.843581,
+-1.415044, -2.012880, -1.539096, -0.894268,
+-1.038155, -1.723333, -1.995266, -1.104294,
+-1.242045, -1.386294, -2.807675, -0.914138,
+-1.560910, -1.866291, -1.732760, -0.779657,
+-1.706640, -1.626597, -1.508814, -0.914402,
+-1.380178, -1.628014, -2.123334, -0.838138,
+-1.270710, -1.526056, -3.230796, -0.771216,
+-1.480657, -1.590658, -1.740939, -0.933018,
+-1.793741, -1.551729, -1.542427, -0.896683,
+-1.163853, -1.781288, -2.251290, -0.881805,
+-1.435828, -1.420324, -3.029756, -0.750525,
+-1.483668, -1.483668, -1.866660, -0.937126,
+-1.960643, -1.660058, -1.533764, -0.791020,
+-1.336462, -1.535590, -1.949565, -0.968738,
+-1.405637, -1.345013, -2.999565, -0.810931,
+-1.563394, -1.851075, -1.956435, -0.708980,
+-1.901229, -1.769169, -1.573196, -0.749169,
+-1.453646, -1.632694, -2.232314, -0.768731,
+-1.260668, -1.410200, -3.234741, -0.836855,
+-1.594324, -1.476541, -1.950998, -0.852388,
+-2.064121, -1.547905, -1.614597, -0.773476,
+-1.350505, -1.350505, -2.736795, -0.874660,
+-1.531033, -1.311174, -3.444673, -0.729253,
+-1.945909, -1.648658, -1.871801, -0.671408,
+-2.399608, -1.678553, -1.582409, -0.659474,
+-1.446919, -1.511457, -2.920218, -0.712951,
+-1.717148, -1.358515, -2.704531, -0.700215,
+-1.785995, -1.515705, -2.588337, -0.620693,
+-2.095340, -1.639864, -1.631767, -0.718708,
+-1.521213, -1.436056, -3.248424, -0.683486,
+-2.197223, -1.109424, -3.066256, -0.668368,
+-2.409939, -1.454432, -2.073469, -0.596207,
+-2.217649, -1.738757, -1.435707, -0.739297,
+-1.443818, -1.615668, -5.081307, -0.581596,
+-1.654820, -1.323464, -2.789797, -0.731413,
+-1.977161, -1.444358, -2.014901, -0.708652,
+-2.343664, -1.683308, -1.377049, -0.763690,
+-1.571900, -1.743749, -3.823169, -0.518140,
+-2.253103, -1.302913, -3.222498, -0.538997,
+-2.293013, -1.425515, -2.341803, -0.575365,
+-2.165873, -1.671855, -1.706341, -0.661797,
+-1.897118, -1.163151, -5.075076, -0.632524,
+-2.215572, -1.382665, -3.824996, -0.480974,
+-2.701354, -1.420427, -2.701354, -0.471348,
+-2.596379, -1.497768, -1.848173, -0.608248,
+-13.972518, -13.972518, -13.972518, -0.000003,
+-14.959735, -14.959735, -14.959735, -0.000001,
+-14.070156, -14.070156, -14.070156, -0.000002,
+-15.810211, -15.810211, -15.810211, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-2.370475, -1.358124, -1.420063, -0.897170,
+-1.617736, -1.751267, -14.006134, -0.465059,
+-2.812407, -1.490654, -4.421824, -0.352822,
+-2.801760, -1.651858, -2.801760, -0.375566,
+-2.191558, -1.334622, -1.850255, -0.759708,
+-1.436725, -1.712977, -14.014364, -0.541343,
+-1.512588, -1.304949, -3.895200, -0.717163,
+-2.024378, -1.485385, -2.265538, -0.620390,
+-2.538110, -1.143653, -1.618546, -0.905982,
+-14.340241, -14.340241, -14.340241, -0.000002,
+-15.129235, -15.129235, -15.129235, -0.000001,
+-14.346141, -14.346141, -14.346141, -0.000002,
+-15.580242, -15.580242, -15.580242, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.376606, -16.376606, -16.376606, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.759094, -1.571555, -1.452533, -0.951658,
+-14.617514, -0.000001, -14.617514, -14.617514,
+-14.805053, -0.000001, -14.805053, -14.805053,
+-14.924074, -0.000001, -14.924074, -14.924074,
+-15.424950, -0.000001, -15.424950, -15.424950,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.376606, -16.376606, -16.376606,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.376606, -16.376606, -0.000000, -16.376606,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.081767, -1.962259, -0.986249, -1.913992,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.280478, -1.850392, -1.702472, -0.960536,
+-1.332806, -1.540445, -2.495952, -0.821981,
+-1.274742, -1.714108, -1.636722, -1.062023,
+-1.786536, -1.491073, -1.157929, -1.226922,
+-1.040960, -1.759639, -1.759639, -1.195110,
+-0.972462, -1.622123, -2.699677, -1.029620,
+-1.063106, -1.667305, -1.566209, -1.358570,
+-1.850793, -1.427470, -1.157646, -1.242203,
+-1.340886, 0.000100, 0.000100, 0.000100,
+-1.594874, 0.000100, 0.000100, 0.000100,
+-1.854831, 0.000100, 0.000100, 0.000100,
+-0.970290, 0.000100, 0.000100, 0.000100,
+-1.207094, -1.782455, -1.728388, -1.035244,
+-1.051546, -1.585626, -2.809387, -0.953106,
+-1.067842, -1.760985, -1.386294, -1.450832,
+-1.379864, -1.459906, -1.609437, -1.151605,
+-1.370034, -1.759497, -1.585145, -0.997360,
+-1.074516, -1.698667, -3.020405, -0.851373,
+-1.004585, -2.316761, -1.266948, -1.372308,
+-1.463852, -2.189786, -1.342492, -0.927549,
+-1.064712, -1.809149, -1.352393, -1.457753,
+-1.015923, -1.575535, -2.450992, -1.064713,
+-1.339775, -1.945907, -1.134981, -1.295323,
+-1.721442, -1.616082, -1.551543, -0.890146,
+-1.116962, -1.756040, -1.564985, -1.234745,
+-1.154966, -1.582408, -3.191819, -0.824726,
+-0.987948, -1.770704, -1.598855, -1.365241,
+-1.448815, -1.791758, -1.481604, -0.990983,
+-0.999673, -1.609437, -1.737270, -1.362578,
+-1.018571, -1.443452, -3.583474, -0.980831,
+-0.985285, -1.609437, -1.609437, -1.484274,
+-1.518783, -1.975540, -1.336462, -0.968738,
+-1.083346, -1.732039, -1.823010, -1.129866,
+-1.286211, -1.286211, -2.721281, -0.963439,
+-1.181995, -1.427116, -1.832578, -1.226446,
+-1.418043, -1.856296, -1.296682, -1.114361,
+-1.141538, -1.483286, -1.601069, -1.377926,
+-1.074516, -1.698667, -2.797266, -0.880361,
+-1.400088, -1.517870, -1.400088, -1.245938,
+-1.471287, -2.072058, -1.439539, -0.897943,
+-1.218572, -1.486835, -1.412727, -1.449095,
+-0.985285, -1.373049, -3.624296, -1.059393,
+-1.037245, -1.373716, -1.661396, -1.596858,
+-1.683545, -1.386294, -1.278081, -1.252763,
+-1.227230, -1.920374, -1.534714, -1.064712,
+-0.941610, -1.832579, -2.813398, -0.941610,
+-1.386294, -1.791757, -1.386294, -1.098614,
+-1.225613, -1.918757, -1.555853, -1.053763,
+-1.147403, -1.791758, -1.402295, -1.309922,
+-1.052094, -1.578184, -2.756822, -0.965083,
+-1.272966, -1.919589, -1.484274, -1.059393,
+-1.510998, -1.510998, -1.609437, -1.025491,
+-1.007264, -2.105871, -1.343735, -1.377636,
+-1.510591, -1.011603, -2.734353, -1.047970,
+-1.303407, -1.686397, -1.450010, -1.175574,
+-1.580450, -1.511457, -1.357307, -1.151455,
+-1.107830, -2.052287, -1.472472, -1.164988,
+-0.910562, -1.575535, -2.856455, -1.098613,
+-1.087440, -1.716046, -1.655422, -1.230541,
+-1.293921, -1.562184, -1.562184, -1.182696,
+-1.167606, -1.909541, -1.378914, -1.241714,
+-1.413693, -1.008230, -2.917751, -1.085191,
+-1.517870, -1.517870, -1.892560, -0.889264,
+-1.899746, -1.666132, -1.548350, -0.801137,
+-1.088142, -1.988924, -1.988924, -0.942961,
+-1.207812, -1.632694, -2.674137, -0.828324,
+-1.111859, -1.497519, -1.622682, -1.386294,
+-1.721442, -1.685074, -1.461932, -0.906407,
+-1.404643, -1.481604, -1.442384, -1.234745,
+-1.544899, -0.820983, -4.317393, -1.098614,
+-1.526055, -1.189585, -1.931517, -1.098614,
+-2.098984, -1.379864, -1.459906, -0.932552,
+-13.554151, -13.554151, -13.554151, -0.000004,
+-13.981029, -13.981029, -13.981029, -0.000003,
+-13.500805, -13.500805, -13.500805, -0.000004,
+-14.159103, -14.159103, -14.159103, -0.000002,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.619266, -1.657006, -1.260125, -1.115876,
+-1.098614, -1.450010, -1.686397, -1.398717,
+-1.137834, -1.717649, -2.747256, -0.830350,
+-1.257083, -1.809149, -1.534714, -1.090029,
+-1.639742, -1.762344, -1.401332, -0.946597,
+-1.308333, -1.765090, -1.765090, -0.948331,
+-1.021653, -1.272966, -4.317393, -1.059393,
+-1.523495, -1.523495, -1.265667, -1.265667,
+-1.365816, -1.480226, -1.757857, -1.064711,
+-13.919874, -13.919874, -13.919874, -0.000003,
+-13.710155, -13.710155, -13.710155, -0.000003,
+-13.415039, -13.415039, -13.415039, -0.000004,
+-14.159103, -14.159103, -14.159103, -0.000002,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.224056, -15.224056, -15.224056, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.502856, -1.418595, -1.723255, -1.030109,
+-13.721204, -0.000003, -13.721204, -13.721204,
+-13.805464, -0.000003, -13.805464, -13.805464,
+-13.500805, -0.000004, -13.500805, -13.500805,
+-14.193950, -0.000002, -14.193950, -14.193950,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000001, -15.224056, -15.224056, -15.224056,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.224056, -15.224056, -0.000001, -15.224056,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.313235, -1.536378, -1.481116, -1.243031,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.871840, -2.061419, -1.481604, -1.481604,
+-0.813777, -1.386294, -4.477242, -1.219241,
+-1.200396, -1.699385, -1.760009, -1.066865,
+-1.726161, -1.880311, -1.159767, -1.033016,
+-1.136640, -2.007466, -1.496642, -1.136640,
+-1.127187, -1.318241, -4.262586, -0.930477,
+-1.233954, -1.884538, -1.661396, -1.002154,
+-1.301953, -1.650259, -1.570217, -1.114742},
+/*, acc[8][]=..., */
+{-1.308333, 0.000100, 0.000100, 0.000100,
+-1.559026, 0.000100, 0.000100, 0.000100,
+-1.783567, 0.000100, 0.000100, 0.000100,
+-1.045969, 0.000100, 0.000100, 0.000100,
+-1.031361, -1.546259, -2.070782, -1.189584,
+-1.295566, -1.380723, -2.789484, -0.883322,
+-1.112698, -1.630639, -1.667007, -1.249273,
+-1.396379, -1.693110, -1.557309, -1.027615,
+-1.029620, -1.402295, -1.974080, -1.355042,
+-1.347508, -1.649788, -2.468095, -0.769432,
+-1.535329, -1.284016, -2.034318, -0.975715,
+-1.487099, -1.332948, -1.457246, -1.282305,
+-1.147620, -1.263692, -2.142241, -1.263692,
+-1.137642, -1.731416, -2.451959, -0.876427,
+-1.121086, -1.686398, -1.814231, -1.121086,
+-1.546480, -1.616684, -1.280212, -1.169670,
+-1.230382, -1.349927, -2.091862, -1.123614,
+-1.233715, -1.419817, -2.639052, -0.927341,
+-1.208614, -1.711716, -1.974079, -0.962481,
+-1.634573, -1.880706, -1.285198, -0.978468,
+-1.098613, -1.574036, -2.106839, -1.085190,
+-1.432814, -1.278664, -2.462429, -0.921990,
+-1.187560, -1.514772, -1.904236, -1.120119,
+-1.735669, -1.615868, -1.438938, -0.947213,
+-1.434150, -1.191589, -2.107877, -1.089310,
+-1.547562, -1.365241, -2.933849, -0.736633,
+-1.393620, -1.552685, -1.741926, -1.007959,
+-1.958813, -1.553348, -1.568616, -0.823023,
+-1.198074, -1.368699, -2.038854, -1.159607,
+-1.576338, -1.405712, -2.939636, -0.702810,
+-1.461018, -1.301953, -1.737270, -1.139435,
+-1.923245, -1.620965, -1.448353, -0.864640,
+-1.330414, -1.445483, -2.325838, -0.910561,
+-1.786695, -1.222761, -2.644141, -0.761416,
+-1.364633, -1.552685, -2.086765, -0.894630,
+-2.200143, -1.506997, -1.443281, -0.840519,
+-1.132061, -1.440361, -2.190664, -1.111858,
+-1.734600, -1.084014, -2.679057, -0.875470,
+-1.768769, -1.425825, -2.026596, -0.782276,
+-2.164963, -1.508184, -1.654138, -0.749342,
+-1.507438, -1.242747, -2.519034, -0.893073,
+-2.121799, -1.269024, -2.981995, -0.600775,
+-2.372784, -1.403389, -1.774951, -0.710243,
+-2.144161, -1.451015, -1.548653, -0.830189,
+-1.230881, -1.591893, -3.117933, -0.776146,
+-2.093233, -1.363720, -3.086478, -0.552791,
+-2.849869, -1.394593, -1.851350, -0.621405,
+-2.035207, -1.361479, -1.604425, -0.886586,
+-1.643628, -1.411828, -2.833202, -0.684781,
+-1.950659, -1.192975, -3.049265, -0.679030,
+-2.433607, -1.368903, -1.740465, -0.728867,
+-2.607614, -1.487025, -1.497954, -0.740956,
+-1.488077, -1.824547, -3.839406, -0.525269,
+-2.147098, -1.305533, -3.574202, -0.537663,
+-1.856296, -1.484734, -2.287076, -0.662377,
+-2.698478, -1.445718, -1.825992, -0.623514,
+-1.498772, -1.598855, -3.850102, -0.592053,
+-2.253792, -1.309333, -3.101084, -0.544728,
+-2.174746, -1.219241, -2.531417, -0.670677,
+-2.595253, -1.535482, -1.565635, -0.691018,
+-1.575535, -1.900956, -13.676253, -0.440559,
+-2.659255, -1.832580, -3.506543, -0.301106,
+-3.144136, -1.920374, -1.920374, -0.409787,
+-2.846694, -1.930405, -1.748084, -0.473582,
+-0.709151, -0.741940, -3.449944, -13.353481,
+-1.180626, -0.462162, -2.764736, -14.054530,
+-1.213924, -0.808460, -1.357024, -13.825465,
+-1.806148, -0.774978, -0.980829, -15.538278,
+-1.673976, -1.386294, -5.257398, -0.584668,
+-1.896313, -1.408327, -3.190229, -0.572410,
+-2.509596, -1.378197, -2.286453, -0.570859,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.826244, -1.774951, -13.981029, -0.401239,
+-1.825497, -1.208724, -3.272406, -0.688420,
+-3.068012, -1.681755, -1.815285, -0.503109,
+-2.182715, -1.230058, -1.752860, -0.863433,
+-1.395511, -1.324052, -13.901692, -0.721058,
+-1.635755, -1.165752, -2.957505, -0.817446,
+-2.100057, -1.326871, -1.945907, -0.756328,
+-2.239257, -1.195134, -1.746781, -0.875954,
+-14.062374, -14.062374, -14.062374, -0.000002,
+-14.739771, -14.739771, -14.739771, -0.000001,
+-13.795312, -13.795312, -13.795312, -0.000003,
+-15.131920, -15.131920, -15.131920, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.956753, -15.956753, -15.956753, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.848572, -1.515303, -1.205149, -1.129641,
+-14.108183, -0.000002, -14.108183, -14.108183,
+-14.441451, -0.000002, -14.441451, -14.441451,
+-14.751606, -0.000001, -14.751606, -14.751606,
+-14.827113, -0.000001, -14.827113, -14.827113,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -15.956753, -15.956753, -15.956753,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.956753, -15.956753, -0.000000, -15.956753,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.181892, -2.111683, -0.790575, -2.131291,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.470176, -1.539168, -1.504077, -1.098613,
+-1.200742, -1.744356, -3.025275, -0.742911,
+-1.370870, -1.586389, -1.693157, -1.028584,
+-1.907068, -1.619387, -0.877452, -1.437066,
+-1.041454, -1.654558, -1.533930, -1.426300,
+-1.392092, -1.392092, -2.320075, -0.904797,
+-1.219241, -1.615135, -1.456912, -1.299283,
+-1.569959, -1.689760, -1.072121, -1.327646,
+-1.168268, 0.000100, 0.000100, 0.000100,
+-1.703009, 0.000100, 0.000100, 0.000100,
+-1.748620, 0.000100, 0.000100, 0.000100,
+-1.099773, 0.000100, 0.000100, 0.000100,
+-1.047693, -1.762345, -1.447852, -1.416600,
+-1.067262, -1.392684, -2.976794, -1.030895,
+-1.832580, -1.399717, -1.160488, -1.272966,
+-1.485095, -1.652149, -1.341994, -1.137694,
+-1.073611, -1.617912, -1.442708, -1.497768,
+-1.017267, -1.512588, -2.778248, -1.033016,
+-1.573810, -1.456028, -1.135556, -1.434049,
+-1.533930, -1.552279, -1.498212, -1.052628,
+-1.163151, -1.510347, -1.353505, -1.568616,
+-0.777706, -1.391714, -2.822453, -1.459156,
+-1.098613, -1.419084, -1.441557, -1.670398,
+-1.414579, -1.431108, -1.555161, -1.181648,
+-1.102152, -1.753626, -1.411340, -1.382767,
+-1.069625, -1.489478, -2.679057, -1.014055,
+-1.128466, -1.670062, -1.422226, -1.398129,
+-1.410987, -1.712091, -1.451809, -1.074515,
+-1.152060, -1.517174, -1.405057, -1.517174,
+-0.956733, -1.386294, -3.713549, -1.074515,
+-1.235472, -1.409825, -1.592146, -1.340832,
+-1.677875, -1.314970, -1.471539, -1.154627,
+-1.110883, -1.698668, -1.479980, -1.346449,
+-1.061399, -1.274321, -3.154624, -1.103363,
+-1.148623, -1.880989, -1.148623, -1.544518,
+-1.572774, -1.495813, -1.341662, -1.180732,
+-1.012578, -1.831604, -1.556193, -1.325670,
+-1.220502, -1.004794, -3.600030, -1.166435,
+-0.988265, -1.555370, -1.609437, -1.529395,
+-1.664420, -1.708871, -1.273554, -1.050411,
+-1.072046, -1.776491, -1.433547, -1.386294,
+-0.816208, -1.533452, -2.682069, -1.295781,
+-1.222550, -1.538402, -1.404871, -1.404871,
+-1.457010, -1.710790, -1.343066, -1.123004,
+-1.120353, -1.699386, -1.536867, -1.287406,
+-0.962812, -1.321756, -3.314172, -1.154702,
+-1.301137, -1.666596, -1.222665, -1.408767,
+-1.622123, -1.622123, -1.345136, -1.065552,
+-1.139435, -1.671313, -1.478410, -1.331806,
+-0.893819, -1.432814, -3.378710, -1.145133,
+-1.266672, -1.902659, -1.120960, -1.414307,
+-1.629240, -1.351609, -1.366876, -1.237199,
+-1.163942, -1.949869, -1.138941, -1.490338,
+-1.178655, -1.442806, -3.258084, -0.873274,
+-1.223776, -1.517536, -1.620190, -1.242125,
+-1.548813, -1.742969, -1.337504, -1.049823,
+-1.208131, -1.847210, -1.236302, -1.373923,
+-1.062245, -1.532248, -3.008145, -0.944462,
+-1.324419, -1.978344, -1.598855, -0.932378,
+-1.541676, -1.524581, -1.458984, -1.091260,
+-1.187166, -1.572011, -1.438480, -1.386294,
+-1.233954, -1.425009, -2.983144, -0.872941,
+-1.337238, -1.498506, -1.255560, -1.473814,
+-1.641075, -1.539292, -1.362362, -1.091716,
+-1.208508, -1.660493, -1.355111, -1.372811,
+-1.098613, -1.098613, -2.843845, -1.290504,
+-1.457557, -1.147403, -1.807759, -1.252763,
+-1.680333, -1.445020, -1.413271, -1.094818,
+-1.112218, -1.765090, -1.470852, -1.308333,
+-1.042924, -1.787363, -3.027047, -0.839983,
+-1.123459, -1.835652, -1.287088, -1.430188,
+-1.832581, -1.478410, -1.331806, -1.055553,
+-0.930621, -1.497998, -0.962037, -14.715673,
+-0.674456, -0.864213, -2.671002, -14.279247,
+-0.948600, -1.369102, -1.026158, -14.363634,
+-1.033854, -1.305169, -0.985536, -14.859316,
+-1.221466, -1.803387, -1.380531, -1.241269,
+-0.982202, -1.581038, -2.785006, -1.027154,
+-1.227445, -1.848271, -1.240024, -1.346792,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.419817, -1.697448, -1.346792, -1.155125,
+-0.985284, -1.609437, -2.525723, -1.059392,
+-1.484925, -1.414307, -1.306677, -1.348349,
+-1.475422, -1.601715, -1.492516, -1.064324,
+-1.168571, -2.035380, -1.451434, -1.126012,
+-1.011602, -1.278664, -2.605529, -1.258461,
+-1.152680, -1.663504, -1.663504, -1.186581,
+-1.336462, -1.681302, -1.518783, -1.102269,
+-14.790071, -14.790071, -14.790071, -0.000001,
+-14.316288, -14.316288, -14.316288, -0.000002,
+-14.291747, -14.291747, -14.291747, -0.000002,
+-14.812461, -14.812461, -14.812461, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.969596, -15.969596, -15.969596, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.402669, -1.407397, -1.659389, -1.142484,
+-14.566929, -0.000002, -14.566929, -14.566929,
+-14.562201, -0.000002, -14.562201, -14.562201,
+-14.310209, -0.000002, -14.310209, -14.310209,
+-14.827113, -0.000001, -14.827113, -14.827113,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -15.969596, -15.969596, -15.969596,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.969596, -15.969596, -0.000000, -15.969596,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.179525, -1.566298, -1.517508, -1.329910,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.262242, -1.773067, -1.248997, -1.345623,
+-1.149906, -1.479385, -2.553895, -0.973450,
+-1.413106, -1.067360, -1.776010, -1.413106,
+-1.440361, -1.873996, -1.209838, -1.166666,
+-1.102932, -1.662547, -1.421386, -1.439404,
+-0.843721, -1.419084, -2.517692, -1.397105,
+-0.942960, -1.396876, -1.879727, -1.558144,
+-1.625311, -1.625311, -1.332642, -1.071427},
+/*, acc[9][]=acc[10][]=acc[11][]=acc[12][]=acc[13][]=acc[14][]=acc[15][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, acc[16][]=..., */
+{-1.310701, 0.000100, 0.000100, 0.000100,
+-1.495130, 0.000100, 0.000100, 0.000100,
+-1.900595, 0.000100, 0.000100, 0.000100,
+-1.030945, 0.000100, 0.000100, 0.000100,
+-1.163853, -1.321756, -2.041570, -1.233649,
+-1.536234, -1.324926, -3.165467, -0.740673,
+-1.212448, -1.566087, -1.927099, -1.055263,
+-1.742590, -1.513404, -1.614757, -0.901808,
+-1.102573, -1.406255, -2.132190, -1.189584,
+-1.552279, -1.398129, -2.650888, -0.753773,
+-1.313172, -1.609437, -1.757857, -1.025491,
+-1.657609, -1.582574, -1.479626, -0.977535,
+-1.274655, -1.332642, -2.071597, -1.106518,
+-1.324926, -1.309177, -4.774865, -0.785930,
+-1.193923, -1.442384, -1.970449, -1.135654,
+-1.835078, -1.551310, -1.575121, -0.864063,
+-1.195941, -1.524444, -1.958079, -1.082612,
+-1.515912, -1.366876, -3.461811, -0.704982,
+-1.180443, -1.278081, -1.997201, -1.278081,
+-1.695132, -1.298478, -1.607025, -1.070039,
+-1.177656, -1.514127, -2.120262, -1.044125,
+-1.631827, -1.220092, -2.963056, -0.781838,
+-1.393215, -1.542746, -1.841238, -0.969401,
+-1.874362, -1.651219, -1.561988, -0.809653,
+-1.116470, -1.342998, -2.781472, -1.051088,
+-1.477464, -1.310410, -2.999884, -0.793450,
+-1.708108, -1.602748, -1.868450, -0.769841,
+-2.238896, -1.435084, -1.425606, -0.879559,
+-1.324222, -1.506543, -2.177709, -0.918758,
+-1.644805, -1.337320, -3.071915, -0.697016,
+-1.488077, -1.517064, -1.998901, -0.869039,
+-2.154948, -1.359387, -1.575915, -0.866419,
+-1.609437, -1.169487, -2.413807, -0.916292,
+-1.628856, -1.243194, -2.921619, -0.773191,
+-1.854937, -1.493925, -1.854937, -0.770926,
+-2.302584, -1.369053, -1.676879, -0.779369,
+-1.695298, -1.301395, -2.229378, -0.828490,
+-1.842311, -1.266948, -3.451740, -0.638340,
+-2.169050, -1.645805, -2.108426, -0.559617,
+-2.425644, -1.335001, -1.630715, -0.792677,
+-1.327454, -1.245216, -2.936881, -0.931559,
+-2.182782, -1.128135, -2.908716, -0.675129,
+-2.140063, -1.478667, -1.916921, -0.678550,
+-2.504552, -1.382411, -1.541743, -0.791185,
+-1.452986, -1.419084, -3.028509, -0.742746,
+-1.945909, -1.341994, -3.020419, -0.603237,
+-1.979356, -1.622682, -2.079439, -0.617163,
+-2.488962, -1.382254, -1.783695, -0.697204,
+-1.550597, -1.417066, -3.496485, -0.663296,
+-1.945909, -1.213023, -3.158925, -0.658989,
+-2.095968, -1.307514, -2.031429, -0.743580,
+-2.537747, -1.169051, -1.821612, -0.801781,
+-1.593308, -1.201267, -2.866263, -0.823202,
+-2.243284, -1.124673, -3.660340, -0.609895,
+-2.833202, -1.560247, -1.834683, -0.559618,
+-2.611656, -1.486378, -1.791759, -0.627865,
+-1.580450, -1.791758, -4.624876, -0.481840,
+-2.291811, -1.229569, -3.139104, -0.574162,
+-2.978912, -1.438480, -1.998093, -0.551179,
+-2.566735, -1.344330, -1.817077, -0.693147,
+-1.751752, -1.326871, -13.795312, -0.577637,
+-2.273596, -1.211353, -3.814027, -0.549557,
+-2.484900, -1.321756, -1.897118, -0.693149,
+-2.843849, -1.204561, -1.818570, -0.734558,
+-1.532897, -1.219241, -13.687682, -0.716139,
+-2.514463, -1.098613, -3.653888, -0.580050,
+-2.188637, -1.757856, -1.920374, -0.563937,
+-2.347704, -1.230045, -1.929501, -0.761615,
+-1.695614, -1.164988, -13.901692, -0.684017,
+-2.092512, -1.252763, -3.650647, -0.571045,
+-2.564941, -1.277095, -1.871800, -0.712567,
+-2.595253, -1.260254, -1.784324, -0.746800,
+-1.718999, -1.262242, -3.970246, -0.656108,
+-2.158383, -1.040904, -3.228819, -0.709787,
+-2.097137, -1.335001, -1.963607, -0.747216,
+-2.642808, -1.098612, -1.898369, -0.808125,
+-1.444563, -1.296144, -4.663343, -0.731615,
+-2.105874, -0.787299, -3.646310, -0.923564,
+-1.992427, -1.368276, -1.522426, -0.939282,
+-2.595656, -1.222969, -1.875842, -0.738516,
+-1.223776, -1.834683, -2.987351, -0.701588,
+-2.389595, -0.798508, -3.488201, -0.849152,
+-1.609437, -1.810107, -1.442384, -0.916292,
+-2.364277, -1.485029, -1.741749, -0.684637,
+-0.053655, -14.108183, -2.951918, -14.108183,
+-0.198852, -14.930654, -1.712978, -14.930654,
+-0.537856, -14.038657, -0.877071, -14.038657,
+-0.943014, -15.410851, -0.493399, -15.410851,
+-0.671575, -1.551933, -2.274933, -1.746089,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.123004, -1.751612, -1.017643, -1.967835,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.348068, -0.000001, -15.348068, -15.348068,
+-14.547880, -0.000002, -14.547880, -14.547880,
+-14.585621, -0.000001, -14.585621, -14.585621,
+-14.346141, -0.000002, -14.346141, -14.346141,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.173531, -16.173531, -16.173531,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.173531, -16.173531, -0.000000, -16.173531,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.353718, -1.932752, -0.722914, -2.192505,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.482337, -1.825281, -1.697448, -0.847298,
+-1.023106, -1.504077, -3.084516, -0.987387,
+-1.380463, -1.991780, -1.605547, -0.888418,
+-2.131624, -1.937469, -0.781702, -1.274177,
+-1.083345, -1.977161, -1.553348, -1.166233,
+-0.995072, -1.522426, -2.333353, -1.154702,
+-1.255102, -1.810627, -1.414732, -1.176322,
+-1.616641, -1.628762, -1.207168, -1.183451,
+-1.197361, 0.000100, 0.000100, 0.000100,
+-1.545071, 0.000100, 0.000100, 0.000100,
+-1.522342, 0.000100, 0.000100, 0.000100,
+-1.322333, 0.000100, 0.000100, 0.000100,
+-1.153078, -1.733328, -1.313552, -1.432223,
+-1.005702, -1.372195, -2.981632, -1.108806,
+-1.228280, -1.492904, -1.413767, -1.430205,
+-1.592270, -1.584323, -1.289411, -1.151958,
+-1.159299, -1.739230, -1.346047, -1.384811,
+-1.006661, -1.391774, -2.693807, -1.144742,
+-1.139022, -1.554850, -1.412374, -1.491288,
+-1.498920, -1.558532, -1.285079, -1.239409,
+-1.166031, -1.636650, -1.381512, -1.416467,
+-0.964822, -1.459445, -2.869836, -1.109040,
+-1.212072, -1.576627, -1.361107, -1.429736,
+-1.497426, -1.576872, -1.262420, -1.249299,
+-1.146993, -1.774085, -1.276414, -1.453568,
+-1.033641, -1.412233, -2.732569, -1.091716,
+-1.231396, -1.453979, -1.381072, -1.499833,
+-1.560799, -1.530946, -1.309765, -1.191328,
+-1.108588, -1.725052, -1.370050, -1.436674,
+-0.993388, -1.411804, -2.961603, -1.095893,
+-1.129743, -1.594228, -1.385018, -1.498347,
+-1.532339, -1.522559, -1.296986, -1.229418,
+-1.180070, -1.706871, -1.298812, -1.433578,
+-0.961199, -1.418623, -2.722679, -1.171763,
+-1.159279, -1.650399, -1.396379, -1.399085,
+-1.530341, -1.601983, -1.313638, -1.160909,
+-1.186017, -1.691607, -1.297521, -1.439133,
+-0.983932, -1.414567, -2.861485, -1.121020,
+-1.271028, -1.541954, -1.287289, -1.471955,
+-1.565373, -1.635441, -1.374318, -1.069700,
+-1.098110, -1.650972, -1.379786, -1.500568,
+-0.985985, -1.469615, -2.719333, -1.105650,
+-1.144582, -1.561198, -1.405738, -1.484690,
+-1.500256, -1.601304, -1.332147, -1.166730,
+-1.144260, -1.760774, -1.295220, -1.444808,
+-0.988306, -1.391897, -2.923021, -1.122642,
+-1.343066, -1.483212, -1.280934, -1.451279,
+-1.574916, -1.557081, -1.267506, -1.201669,
+-1.172643, -1.671634, -1.326794, -1.438911,
+-0.983349, -1.381502, -2.940328, -1.133532,
+-1.257763, -1.510959, -1.378841, -1.414109,
+-1.554170, -1.551660, -1.387889, -1.117375,
+-1.201495, -1.619413, -1.285809, -1.492482,
+-0.998625, -1.352705, -2.744194, -1.175145,
+-1.256489, -1.438335, -1.461391, -1.401968,
+-1.567692, -1.473726, -1.267770, -1.269596,
+-1.094084, -1.725898, -1.330874, -1.501056,
+-0.930295, -1.494327, -2.833845, -1.132035,
+-1.248562, -1.567580, -1.330865, -1.425930,
+-1.485888, -1.585079, -1.286759, -1.228939,
+-1.145399, -1.818226, -1.268511, -1.434059,
+-1.028481, -1.435940, -2.747270, -1.077470,
+-1.192485, -1.586166, -1.301684, -1.515258,
+-1.525303, -1.525303, -1.320142, -1.211360,
+-1.105949, -1.689034, -1.422012, -1.413964,
+-0.903154, -1.472072, -2.743703, -1.200880,
+-1.180370, -1.475517, -1.443343, -1.478246,
+-1.476506, -1.611037, -1.327363, -1.181855,
+-1.146319, -1.710299, -1.242210, -1.548162,
+-1.004690, -1.407318, -2.741186, -1.125318,
+-1.261695, -1.538168, -1.395916, -1.368667,
+-1.484401, -1.553056, -1.326871, -1.215858,
+-1.091816, -1.697468, -1.404481, -1.444973,
+-0.987966, -1.488372, -2.827038, -1.070786,
+-1.209278, -1.603109, -1.291273, -1.489893,
+-1.540625, -1.550752, -1.410879, -1.109464,
+-1.180564, -1.697621, -1.323904, -1.411893,
+-0.955229, -1.419367, -2.884729, -1.147155,
+-1.218771, -1.470413, -1.377519, -1.503400,
+-1.490059, -1.553535, -1.332335, -1.206371,
+-1.088005, -1.689117, -1.357154, -1.509424,
+-0.952132, -1.431300, -2.588238, -1.204199,
+-1.088162, -1.744941, -1.302572, -1.529345,
+-1.540267, -1.639825, -1.166041, -1.272737,
+-1.108760, -1.773118, -1.239316, -1.557571,
+-0.931445, -1.433618, -2.740106, -1.193996,
+-1.142260, -1.538210, -1.349753, -1.575842,
+-1.412300, -1.608298, -1.272640, -1.286999,
+-1.020594, -1.826818, -1.322209, -1.550468,
+-0.840881, -1.576371, -2.765460, -1.207317,
+-0.979403, -1.793596, -1.433251, -1.516076,
+-1.385705, -1.670247, -1.251289, -1.289109,
+-0.693147, -17.008453, -0.693147, -17.008453,
+-0.143413, -16.329166, -2.012881, -16.329166,
+-0.663294, -16.556995, -0.723919, -16.556995,
+-0.789977, -16.655758, -0.604871, -16.655758,
+-1.194699, -1.692786, -1.222431, -1.520122,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.305600, -1.489847, -1.243845, -1.535679,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.812243, -0.000000, -16.812243, -16.812243,
+-16.456709, -0.000000, -16.456709, -16.456709,
+-16.822678, -0.000000, -16.822678, -16.822678,
+-16.527554, -0.000000, -16.527554, -16.527554,
+-1.386294, -1.386294, -1.386294, -1.386294,
+0.000000, -18.054533, -18.054533, -18.054533,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-18.054533, -18.054533, 0.000000, -18.054533,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.168646, -1.507906, -1.428415, -1.477747,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.108874, -1.781143, -1.326407, -1.443098,
+-0.900135, -1.516203, -2.484255, -1.235967,
+-1.213241, -1.483532, -1.397184, -1.475606,
+-1.508512, -1.582620, -1.237395, -1.261652,
+-1.147035, -1.770419, -1.313302, -1.413744,
+-0.963388, -1.531922, -2.768317, -1.080237,
+-1.146225, -1.628731, -1.367366, -1.464428,
+-1.419945, -1.605520, -1.324845, -1.232224},
+/*, acc[17][]=..., */
+{-1.579177, 0.000100, 0.000100, 0.000100,
+-1.199687, 0.000100, 0.000100, 0.000100,
+-1.669691, 0.000100, 0.000100, 0.000100,
+-1.189867, 0.000100, 0.000100, 0.000100,
+-1.530875, -1.445718, -1.348554, -1.243194,
+-1.412963, -1.163151, -2.721292, -0.972096,
+-1.375823, -1.558144, -1.583462, -1.103890,
+-1.835027, -1.215988, -1.249511, -1.357400,
+-1.424035, -1.141172, -1.591088, -1.443453,
+-1.539233, -1.052423, -2.908716, -0.962811,
+-1.391282, -1.192431, -1.519115, -1.474663,
+-1.698458, -1.281565, -1.402193, -1.226303,
+-1.517870, -1.019624, -1.833723, -1.346021,
+-1.575005, -1.039863, -2.753657, -0.978709,
+-1.327296, -1.247254, -1.732760, -1.306677,
+-1.998095, -1.209639, -1.452869, -1.102008,
+-1.519115, -1.144422, -1.902106, -1.160171,
+-1.609438, -1.107745, -3.026499, -0.864619,
+-1.438938, -1.288656, -1.871801, -1.098613,
+-2.060978, -1.120592, -1.367832, -1.231257,
+-1.475242, -1.344622, -1.547562, -1.211091,
+-1.467376, -1.000757, -2.775705, -1.080094,
+-1.527373, -1.092056, -1.691675, -1.335001,
+-1.940179, -1.098613, -1.409551, -1.277492,
+-1.386294, -1.203973, -1.966111, -1.171183,
+-1.652258, -1.007120, -2.866698, -0.951329,
+-1.489730, -1.221466, -1.934414, -1.092849,
+-2.309228, -1.046988, -1.447747, -1.156550,
+-1.425296, -1.035100, -1.877280, -1.380845,
+-1.800648, -0.943199, -3.053406, -0.920726,
+-1.667007, -1.561647, -1.466337, -0.992554,
+-2.508435, -1.113254, -1.352005, -1.104444,
+-2.098984, -0.853771, -1.865370, -1.214784,
+-1.877701, -0.938597, -2.784420, -0.931106,
+-1.744691, -1.261840, -1.501070, -1.141696,
+-2.386465, -1.031921, -1.421386, -1.170071,
+-1.838278, -1.193923, -1.515506, -1.145133,
+-2.249940, -0.902868, -2.617663, -0.876551,
+-1.779782, -1.283347, -1.710789, -0.984854,
+-2.536043, -1.177922, -1.365974, -1.027862,
+-1.486835, -0.983733, -2.442340, -1.161414,
+-2.194295, -0.834671, -2.740837, -0.941534,
+-2.514229, -1.026158, -1.719303, -0.963638,
+-2.682072, -0.998529, -1.691676, -0.970358,
+-1.553348, -1.089044, -2.079438, -1.118031,
+-2.310246, -0.949929, -2.882764, -0.780853,
+-2.810898, -1.176778, -1.632252, -0.829907,
+-2.679437, -1.070002, -1.506719, -1.003063,
+-1.988924, -0.970359, -2.944423, -0.840307,
+-2.216970, -0.960552, -3.172477, -0.762540,
+-2.491822, -1.006443, -1.932209, -0.899197,
+-3.022857, -1.048780, -1.423473, -1.021381,
+-1.597602, -1.098614, -3.332176, -0.847300,
+-2.329982, -0.964743, -3.023127, -0.748717,
+-2.108426, -1.203973, -2.169050, -0.767256,
+-3.725684, -1.137930, -1.550942, -0.813343,
+-2.079437, -1.014732, -2.995712, -0.771111,
+-3.007424, -0.765468, -3.275686, -0.803690,
+-3.036541, -1.044125, -1.883873, -0.802963,
+-3.135490, -1.233387, -1.631417, -0.755948,
+-2.360842, -1.667704, -3.970199, -0.359379,
+-2.686483, -0.855506, -2.909626, -0.793374,
+-4.143088, -1.225364, -1.891841, -0.616776,
+-2.841887, -1.133781, -1.781018, -0.795393,
+-1.592629, -1.244325, -13.287885, -0.676344,
+-2.644753, -0.776035, -3.561037, -0.820207,
+-2.468093, -1.274177, -1.826244, -0.745335,
+-2.954907, -1.009000, -1.588818, -0.969780,
+-1.878767, -1.098614, -4.276572, -0.693150,
+-2.626218, -0.889521, -3.393469, -0.727317,
+-3.745544, -1.180626, -2.205126, -0.581509,
+-3.051636, -1.023492, -1.462405, -1.016935,
+-1.729236, -0.991642, -13.337481, -0.794933,
+-1.749868, -1.064444, -3.151662, -0.824390,
+-2.890361, -1.280934, -1.791758, -0.693149,
+-2.668569, -0.850286, -1.648211, -1.168085,
+-2.028145, -0.765908, -13.946543, -0.907558,
+-2.379544, -0.847298, -3.855441, -0.781603,
+-2.866263, -0.880360, -3.020411, -0.734648,
+-2.609053, -0.864363, -2.392830, -0.882601,
+-13.664692, -0.000003, -13.664692, -13.664692,
+-15.278767, -0.000001, -15.278767, -15.278767,
+-13.122372, -0.000006, -13.122372, -13.122372,
+-15.299386, -0.000001, -15.299386, -15.299386,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.127056, -0.000000, -16.127056, -16.127056,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.995137, -1.108573, -2.225367, -1.648857,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.131920, -0.000001, -15.131920, -15.131920,
+-15.018484, -0.000001, -15.018484, -15.018484,
+-13.901692, -0.000003, -13.901692, -13.901692,
+-14.478201, -0.000002, -14.478201, -14.478201,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.127056, -16.127056, -16.127056,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.127056, -16.127056, -0.000000, -16.127056,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.423654, -1.810769, -0.688204, -2.373420,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.504077, -1.642913, -1.561236, -0.982202,
+-1.579584, -1.154702, -2.466883, -0.931559,
+-1.603538, -1.392229, -1.623341, -1.041125,
+-2.058384, -1.710080, -0.879735, -1.285199,
+-1.300935, -1.619388, -1.570598, -1.133881,
+-1.284245, -1.158205, -2.444412, -1.132229,
+-1.446919, -1.190986, -1.386294, -1.556920,
+-2.251290, -1.244487, -0.933051, -1.545072,
+-1.322391, 0.000100, 0.000100, 0.000100,
+-1.401640, 0.000100, 0.000100, 0.000100,
+-1.493348, 0.000100, 0.000100, 0.000100,
+-1.336779, 0.000100, 0.000100, 0.000100,
+-1.084428, -1.913119, -1.127600, -1.658227,
+-1.108976, -1.278874, -2.272123, -1.242507,
+-1.326002, -1.512588, -1.304949, -1.414950,
+-1.806357, -1.526056, -1.255182, -1.098613,
+-1.314836, -1.591088, -1.185624, -1.504077,
+-1.217876, -1.178655, -2.644986, -1.122566,
+-1.381179, -1.302708, -1.265667, -1.635413,
+-1.623966, -1.648658, -1.294487, -1.089043,
+-1.054161, -1.761906, -1.527067, -1.337023,
+-1.130874, -1.271455, -2.063691, -1.309922,
+-1.375823, -1.509354, -1.509354, -1.186581,
+-1.638286, -1.439835, -1.177471, -1.344525,
+-1.497251, -1.400088, -1.199417, -1.477049,
+-1.358745, -1.198402, -2.789484, -0.967879,
+-1.170072, -1.632694, -1.397855, -1.397855,
+-1.614078, -1.591088, -1.232144, -1.185624,
+-1.197053, -1.760987, -1.130361, -1.593933,
+-1.274503, -1.098613, -2.660792, -1.148210,
+-1.338285, -1.359339, -1.131949, -1.842189,
+-1.841430, -1.347413, -1.266067, -1.205443,
+-1.224807, -1.900561, -1.278874, -1.278874,
+-0.999240, -1.474663, -3.000712, -1.040626,
+-1.438119, -1.574695, -1.193923, -1.376244,
+-1.488810, -1.360977, -1.247648, -1.466337,
+-1.228290, -1.739115, -1.243795, -1.413693,
+-1.068937, -1.398416, -2.339395, -1.160745,
+-1.241713, -1.727220, -1.167606, -1.504077,
+-1.655957, -1.145133, -1.404643, -1.404643,
+-1.314836, -1.737692, -1.126784, -1.463255,
+-1.233715, -1.045124, -3.006773, -1.178655,
+-1.558144, -1.103890, -1.396876, -1.558144,
+-1.469676, -1.742968, -1.203973, -1.220780,
+-1.172330, -1.371181, -1.499014, -1.545534,
+-1.220780, -1.123931, -2.407942, -1.237875,
+-1.464546, -1.255455, -1.464546, -1.375599,
+-1.739490, -1.516347, -1.315677, -1.088904,
+-1.088613, -1.604425, -1.342061, -1.604425,
+-0.979266, -1.469472, -2.528075, -1.156600,
+-1.290984, -1.313974, -1.437587, -1.519825,
+-1.637608, -1.349927, -1.248144, -1.349927,
+-1.146974, -1.813451, -1.425687, -1.276651,
+-1.077559, -1.175999, -2.933849, -1.211091,
+-1.122566, -1.695911, -1.279751, -1.546379,
+-1.599387, -1.481604, -1.193923, -1.317975,
+-1.202543, -1.600890, -1.559218, -1.246346,
+-1.036621, -1.235472, -2.374902, -1.340832,
+-1.464412, -1.392092, -1.369102, -1.324650,
+-1.820746, -1.321756, -1.157453, -1.358124,
+-1.410987, -1.634130, -1.046344, -1.561810,
+-1.183770, -1.235064, -2.713160, -1.088460,
+-1.336145, -1.336145, -1.403586, -1.475906,
+-1.890849, -1.313535, -1.405343, -1.093907,
+-1.365241, -1.770705, -1.193391, -1.304616,
+-1.255266, -1.093625, -2.659255, -1.171183,
+-1.364316, -1.342809, -1.504077, -1.342809,
+-1.810108, -1.266493, -1.461802, -1.130948,
+-1.136765, -1.446919, -1.517536, -1.493439,
+-1.405712, -0.968091, -2.629483, -1.194404,
+-1.195776, -1.686398, -1.175574, -1.591088,
+-1.704747, -1.315283, -1.348073, -1.237725,
+-1.247825, -1.501605, -1.437067, -1.376442,
+-1.310195, -1.180984, -2.326113, -1.122143,
+-1.261840, -1.240787, -1.710789, -1.398416,
+-1.712091, -1.133356, -1.410987, -1.371766,
+-1.108663, -1.537117, -1.448170, -1.514127,
+-1.408767, -1.085368, -2.708045, -1.046653,
+-1.477586, -1.315067, -1.091924, -1.785069,
+-1.931988, -1.414732, -1.288439, -1.089310,
+-1.274503, -1.614828, -1.488077, -1.218414,
+-1.224724, -1.208724, -2.712795, -1.075193,
+-1.249844, -1.809458, -1.270463, -1.313022,
+-1.897119, -1.501807, -1.316090, -1.024180,
+-1.235064, -1.694595, -1.289131, -1.386294,
+-1.224906, -1.391960, -2.778248, -0.927655,
+-1.243603, -1.404871, -1.404871, -1.510231,
+-1.722766, -1.435084, -1.252763, -1.211941,
+-14.513647, -0.000002, -14.513647, -14.513647,
+-14.403299, -0.000002, -14.403299, -14.403299,
+-14.386493, -0.000002, -14.386493, -14.386493,
+-14.648421, -0.000001, -14.648421, -14.648421,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.879839, -0.000000, -15.879839, -15.879839,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.067379, -1.271335, -2.737671, -1.168240,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.812461, -0.000001, -14.812461, -14.812461,
+-14.608505, -0.000001, -14.608505, -14.608505,
+-13.142174, -0.000006, -13.142174, -13.142174,
+-14.711600, -0.000001, -14.711600, -14.711600,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -15.879839, -15.879839, -15.879839,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.879839, -15.879839, -0.000000, -15.879839,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.327164, -1.401640, -1.312912, -1.516206,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.137642, -2.010128, -1.199200, -1.410509,
+-1.316615, -1.005179, -2.628796, -1.224807,
+-1.405343, -1.464766, -1.313535, -1.367602,
+-1.687555, -1.569772, -1.127940, -1.261472,
+-1.421386, -1.624326, -1.170072, -1.381380,
+-1.343735, -1.207603, -3.135484, -0.924477,
+-1.355204, -1.533452, -1.418383, -1.258040,
+-1.638286, -1.460038, -1.103363, -1.420032},
+/*, acc[18][]=..., */
+{-1.374213, 0.000100, 0.000100, 0.000100,
+-1.352614, 0.000100, 0.000100, 0.000100,
+-1.784054, 0.000100, 0.000100, 0.000100,
+-1.138054, 0.000100, 0.000100, 0.000100,
+-1.199616, -1.463430, -1.672521, -1.274839,
+-1.377784, -1.221215, -3.152728, -0.890974,
+-1.440361, -1.360319, -1.691675, -1.132061,
+-1.609438, -1.379421, -1.510998, -1.116004,
+-1.168571, -1.395344, -1.936940, -1.213023,
+-1.543298, -1.235814, -3.057418, -0.801362,
+-1.192545, -1.365816, -1.798679, -1.287855,
+-1.845826, -1.542640, -1.286211, -1.044199,
+-1.344525, -1.224724, -2.056020, -1.147166,
+-1.390749, -1.085368, -3.470178, -0.961754,
+-1.146079, -1.759182, -1.926236, -1.009947,
+-1.830225, -1.380025, -1.513556, -0.999878,
+-1.113001, -1.415282, -2.211611, -1.142415,
+-1.454117, -1.370034, -3.194575, -0.752237,
+-1.309922, -1.370546, -1.700786, -1.225364,
+-1.933677, -1.251226, -1.317917, -1.198858,
+-1.400893, -1.143064, -2.288193, -1.098613,
+-1.464937, -1.102823, -3.275037, -0.918394,
+-1.436484, -1.325259, -2.072470, -0.992554,
+-1.908753, -1.215607, -1.539394, -1.076771,
+-1.242125, -1.279865, -2.235373, -1.120235,
+-1.546480, -1.267309, -3.139104, -0.771988,
+-1.319603, -1.618095, -1.662547, -1.064712,
+-2.022624, -1.329478, -1.521145, -0.955512,
+-1.339288, -1.153572, -2.140064, -1.188058,
+-1.433412, -1.199798, -3.397011, -0.851491,
+-1.532476, -1.461018, -1.737270, -0.978167,
+-2.435308, -1.499849, -1.592631, -0.721803,
+-1.249719, -1.386294, -1.804028, -1.208047,
+-1.950102, -1.082015, -2.768408, -0.785116,
+-1.642227, -1.236763, -1.684786, -1.106710,
+-2.533695, -1.300165, -1.480547, -0.865990,
+-1.537978, -1.216396, -2.266213, -0.954032,
+-1.735878, -1.345013, -3.261927, -0.644540,
+-1.972755, -1.303708, -1.839224, -0.842894,
+-2.384023, -1.326656, -1.675840, -0.786783,
+-1.601715, -1.601715, -2.087221, -0.748940,
+-1.759498, -1.090449, -4.110850, -0.743579,
+-2.105871, -1.310945, -2.260020, -0.684491,
+-2.369523, -1.314587, -1.448119, -0.909123,
+-1.660295, -1.617736, -3.186336, -0.561686,
+-2.306575, -1.156005, -3.579530, -0.583811,
+-2.094943, -1.284016, -2.094943, -0.740402,
+-2.388017, -1.316805, -1.488077, -0.880943,
+-1.479626, -1.213924, -3.228805, -0.830933,
+-2.290510, -1.139941, -3.206795, -0.619040,
+-2.214170, -1.521026, -2.140062, -0.589471,
+-2.714921, -1.303095, -1.457246, -0.845777,
+-1.609437, -1.364316, -3.113495, -0.693149,
+-2.541599, -1.059998, -3.139431, -0.632060,
+-2.302580, -1.284016, -2.670302, -0.590870,
+-2.569598, -1.206296, -1.671659, -0.829666,
+-1.486377, -1.540444, -3.332176, -0.646630,
+-2.253103, -1.066524, -3.222498, -0.671642,
+-2.456728, -1.188225, -2.351369, -0.664978,
+-2.867896, -1.274590, -1.852668, -0.679603,
+-1.462280, -1.110884, -3.713527, -0.880361,
+-2.646544, -1.208961, -5.590889, -0.467024,
+-2.740828, -1.588160, -2.740828, -0.405468,
+-3.191122, -1.299283, -1.646928, -0.706220,
+-2.063687, -0.741940, -13.353481, -0.924261,
+-2.438384, -1.086778, -2.964473, -0.646628,
+-2.100057, -1.449473, -2.282377, -0.614678,
+-2.672116, -1.161526, -1.721142, -0.823201,
+-1.845824, -1.440361, -13.541079, -0.502095,
+-2.309456, -0.906070, -3.479519, -0.764100,
+-2.512298, -1.413693, -1.936939, -0.631995,
+-2.390876, -1.186905, -1.659990, -0.883956,
+-1.780584, -0.992130, -13.698981, -0.775066,
+-2.422074, -0.952785, -2.907579, -0.752920,
+-2.265538, -1.367602, -2.098486, -0.656108,
+-2.774987, -1.084327, -1.770406, -0.845701,
+-2.054119, -1.265667, -3.663517, -0.572522,
+-2.658347, -0.978709, -3.803470, -0.631400,
+-2.813398, -1.203973, -2.659250, -0.562121,
+-3.079004, -1.179261, -1.818754, -0.725131,
+-0.952011, -13.253399, -2.944412, -0.577319,
+-1.274892, -14.904074, -3.128776, -0.390428,
+-2.020941, -13.629186, -1.646250, -0.393492,
+-2.277266, -15.358809, -1.637609, -0.352411,
+-14.310209, -0.000002, -14.310209, -14.310209,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.022528, -0.000002, -14.022528, -14.022528,
+-15.636829, -0.000001, -15.636829, -15.636829,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.916291, -1.205816, -2.603241, -1.484925,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.101986, -0.000001, -15.101986, -15.101986,
+-14.812461, -0.000001, -14.812461, -14.812461,
+-13.415039, -0.000004, -13.415039, -13.415039,
+-14.533353, -0.000002, -14.533353, -14.533353,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.018276, -16.018276, -16.018276,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.018276, -16.018276, -0.000000, -16.018276,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.428038, -2.045761, -0.661606, -2.163543,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.428654, -1.796378, -1.573234, -0.949081,
+-1.360977, -1.584119, -2.564942, -0.773191,
+-1.454981, -1.657693, -1.603034, -0.981544,
+-2.079438, -1.699950, -0.955513, -1.178656,
+-1.190986, -1.489478, -1.654558, -1.275069,
+-1.232144, -1.273817, -2.351372, -1.098613,
+-1.407801, -1.193391, -1.572880, -1.407801,
+-1.669157, -1.343735, -1.116157, -1.499739,
+-1.200282, 0.000100, 0.000100, 0.000100,
+-1.570202, 0.000100, 0.000100, 0.000100,
+-1.555079, 0.000100, 0.000100, 0.000100,
+-1.273985, 0.000100, 0.000100, 0.000100,
+-1.276732, -1.545722, -1.340014, -1.402210,
+-1.110107, -1.266186, -2.502197, -1.181994,
+-1.215927, -1.426491, -1.507837, -1.418709,
+-1.569183, -1.529177, -1.423817, -1.095784,
+-1.143064, -1.708378, -1.279196, -1.506254,
+-1.001919, -1.371181, -2.531667, -1.205666,
+-1.179127, -1.572396, -1.255597, -1.609438,
+-1.488077, -1.572269, -1.246472, -1.276177,
+-1.183595, -1.559459, -1.341329, -1.504735,
+-1.074515, -1.365867, -2.844532, -1.063586,
+-1.319657, -1.617877, -1.168206, -1.498212,
+-1.726237, -1.378451, -1.271842, -1.238689,
+-1.060064, -1.583574, -1.482478, -1.508454,
+-1.010295, -1.382816, -2.856119, -1.116188,
+-1.316290, -1.556880, -1.457941, -1.243531,
+-1.565635, -1.558518, -1.336794, -1.145986,
+-1.282007, -1.571300, -1.225654, -1.508780,
+-0.998529, -1.255489, -2.785372, -1.255489,
+-1.315315, -1.458416, -1.392276, -1.384308,
+-1.515887, -1.574156, -1.266671, -1.232933,
+-1.315031, -1.752757, -1.253474, -1.299283,
+-1.000010, -1.364090, -2.372317, -1.261436,
+-1.285066, -1.693641, -1.063166, -1.639082,
+-1.480088, -1.570742, -1.292227, -1.238450,
+-1.306791, -1.592748, -1.212895, -1.475731,
+-1.066074, -1.314970, -2.946383, -1.094729,
+-1.356903, -1.426332, -1.350824, -1.413345,
+-1.655958, -1.586965, -1.282754, -1.116962,
+-1.239967, -1.579585, -1.229550, -1.550597,
+-1.127033, -1.427408, -2.304337, -1.089894,
+-1.275158, -1.484676, -1.371078, -1.426265,
+-1.676813, -1.529177, -1.147969, -1.276412,
+-1.148966, -1.642724, -1.356967, -1.460402,
+-1.184486, -1.342543, -2.300797, -1.100403,
+-1.328601, -1.334680, -1.378301, -1.514565,
+-1.632080, -1.314321, -1.401332, -1.239420,
+-1.179759, -1.709908, -1.296346, -1.434496,
+-0.995998, -1.502466, -2.716146, -1.073133,
+-1.211869, -1.467548, -1.467548, -1.421386,
+-1.521313, -1.468320, -1.285999, -1.291390,
+-1.376992, -1.494775, -1.300619, -1.382297,
+-0.978990, -1.452922, -2.713588, -1.126626,
+-1.235323, -1.393729, -1.431757, -1.504077,
+-1.656784, -1.445836, -1.276896, -1.221466,
+-1.310094, -1.619755, -1.304674, -1.343246,
+-0.989879, -1.359626, -2.599315, -1.213023,
+-1.219514, -1.456304, -1.426227, -1.463967,
+-1.471194, -1.635949, -1.292042, -1.201070,
+-1.414151, -1.353859, -1.291984, -1.496551,
+-0.995428, -1.323115, -3.094669, -1.143064,
+-1.324925, -1.487044, -1.324925, -1.417518,
+-1.686109, -1.339643, -1.345183, -1.229872,
+-1.208046, -1.657847, -1.293204, -1.442736,
+-1.169725, -1.159725, -3.061829, -1.111171,
+-1.289668, -1.493266, -1.276595, -1.509527,
+-1.616873, -1.384814, -1.299777, -1.278271,
+-1.225245, -1.486610, -1.341779, -1.519616,
+-1.235149, -1.224099, -2.931595, -1.016459,
+-1.262850, -1.336462, -1.477961, -1.485994,
+-1.560366, -1.525760, -1.279751, -1.222888,
+-1.308185, -1.605016, -1.324714, -1.335888,
+-0.964483, -1.417066, -2.777383, -1.157743,
+-1.211636, -1.465701, -1.449307, -1.441210,
+-1.554917, -1.554917, -1.188763, -1.298326,
+-1.302968, -1.562291, -1.206274, -1.517267,
+-1.196804, -1.173274, -2.559566, -1.167477,
+-1.229568, -1.742453, -1.267309, -1.382691,
+-1.472646, -1.551635, -1.309495, -1.241937,
+-1.163443, -1.566574, -1.381924, -1.479563,
+-1.012483, -1.377515, -2.676796, -1.152974,
+-1.325084, -1.525060, -1.389622, -1.318853,
+-1.503331, -1.483397, -1.276169, -1.303271,
+-1.202602, -1.491807, -1.278468, -1.628829,
+-1.096856, -1.402238, -2.630306, -1.055614,
+-1.212525, -1.485994, -1.535591, -1.343431,
+-1.765557, -1.413202, -1.235944, -1.220830,
+-1.061518, -15.744130, -1.113254, -1.122143,
+-0.764794, -15.595535, -2.274319, -0.840017,
+-0.964858, -15.536490, -1.105794, -1.244745,
+-1.471712, -15.738298, -1.002505, -0.907557,
+-15.994798, -0.000000, -15.994798, -15.994798,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.793751, -0.000001, -15.793751, -15.793751,
+-16.031448, -0.000000, -16.031448, -16.031448,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.089541, -1.187210, -2.877773, -1.196342,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.954400, -0.000000, -15.954400, -15.954400,
+-15.856731, -0.000000, -15.856731, -15.856731,
+-14.166170, -0.000002, -14.166170, -14.166170,
+-15.847599, -0.000000, -15.847599, -15.847599,
+-1.386294, -1.386294, -1.386294, -1.386294,
+0.000000, -17.043941, -17.043941, -17.043941,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-17.043941, -17.043941, -0.000000, -17.043941,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.254349, -1.468849, -1.318888, -1.527325,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.172080, -1.582039, -1.227393, -1.651997,
+-1.112477, -1.423108, -2.809401, -0.994113,
+-1.299777, -1.439539, -1.367218, -1.445808,
+-1.561343, -1.544101, -1.218679, -1.269323,
+-1.246472, -1.664207, -1.261214, -1.427818,
+-0.980240, -1.477013, -2.424801, -1.177770,
+-1.100338, -1.465188, -1.465188, -1.583904,
+-1.371391, -1.657102, -1.223175, -1.342234},
+/*, acc[19..24][]=...NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+};
+
diff --git a/libsim4/sim4core/sites_acceptor.H b/libsim4/sim4core/sites_acceptor.H
new file mode 100644
index 0000000..c28543b
--- /dev/null
+++ b/libsim4/sim4core/sites_acceptor.H
@@ -0,0 +1,11 @@
+#ifndef SITES_ACCEPTOR_H
+#define SITES_ACCEPTOR_H
+/* Declares the global table acc[][] and the two macros that size it. */
+/* DO NOT REMOVE or MODIFY !!!! NOTE(review): presumably the sizes must match the initializer where acc is defined -- confirm. */
+/* NUM_MODELS_ACC is the first dimension of acc (rows); NUM_VALUES_ACC is the second (doubles per row). */
+#define NUM_MODELS_ACC 25
+#define NUM_VALUES_ACC 928
+/* Declaration only (extern, no initializer); the definition is elsewhere, presumably sites_acceptor.C -- confirm. */
+extern double acc[NUM_MODELS_ACC][NUM_VALUES_ACC];
+
+#endif /* SITES_ACCEPTOR_H */
diff --git a/libsim4/sim4core/sites_donor.C b/libsim4/sim4core/sites_donor.C
new file mode 100644
index 0000000..ce7702a
--- /dev/null
+++ b/libsim4/sim4core/sites_donor.C
@@ -0,0 +1,1537 @@
+#include "sim4.H"
+
+/* DO NOT REMOVE or MODIFY !!!! */
+
+double don[NUM_MODELS_DON][NUM_VALUES_DON] =
+{/*, don[0]=..., */
+{-1.194022, 0.000100, 0.000100, 0.000100,
+-1.488549, 0.000100, 0.000100, 0.000100,
+-1.590126, 0.000100, 0.000100, 0.000100,
+-1.319010, 0.000100, 0.000100, 0.000100,
+-1.117603, -1.599586, -1.315587, -1.596323,
+-0.968354, -1.333230, -2.393915, -1.326530,
+-0.973824, -1.341693, -1.554107, -1.899903,
+-1.563394, -1.337397, -1.211732, -1.468084,
+-0.998394, -1.139395, -1.632676, -2.153211,
+-0.818638, -1.064793, -2.641517, -1.945501,
+-0.946117, -1.071815, -1.639264, -2.587007,
+-1.473391, -0.957669, -1.381619, -1.995872,
+-0.454442, -2.214835, -2.129729, -1.986628,
+-0.293988, -2.510470, -2.991967, -2.093170,
+-0.434752, -2.092543, -2.115746, -2.219543,
+-1.262582, -1.757518, -1.302403, -1.299283,
+-2.384259, -3.736880, -0.175496, -3.101671,
+-1.513713, -2.694443, -0.710651, -1.509572,
+-1.858238, -2.956849, -0.358565, -2.370948,
+-3.019029, -3.316280, -0.183299, -2.496840,
+-16.187622, -16.187622, -0.000000, -16.187622,
+-15.027452, -15.027452, -0.000001, -15.027452,
+-18.177462, -18.177462, 0.000000, -18.177462,
+-15.831747, -15.831747, -0.000000, -15.831747,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-18.420681, -18.420681, -18.420681, 0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.478520, -3.470547, -1.146963, -3.454598,
+-0.415031, -2.375713, -2.259474, -1.949628,
+-0.328148, -2.520914, -3.174837, -1.847972,
+-0.183708, -3.100533, -2.491223, -3.219190,
+-0.935461, -2.922524, -1.451677, -1.140622,
+-2.447088, -3.145141, -0.202256, -2.927916,
+-1.341329, -1.830612, -0.989187, -1.578387,
+-2.560945, -3.098087, -0.182121, -3.120559,
+-2.420583, -2.470345, -0.358279, -2.058100,
+-1.370294, -1.902511, -1.226400, -1.192613,
+-1.203973, -1.382792, -2.171248, -1.093363,
+-1.716096, -2.018802, -1.959097, -0.604343,
+-1.670320, -2.090491, -1.139263, -0.999314,
+-1.061786, -1.735801, -1.364941, -1.502794,
+-1.011423, -1.508000, -2.117894, -1.221927,
+-1.121546, -1.432701, -1.521532, -1.527011,
+-1.171592, -1.915998, -1.144701, -1.493377,
+-1.360640, -1.488907, -1.564893, -1.175145,
+-1.235830, -1.235830, -2.437877, -1.104227,
+-1.434473, -1.351348, -1.381335, -1.379815,
+-1.703031, -1.583154, -1.339306, -1.048351,
+-1.244275, -1.529555, -1.482686, -1.316055,
+-1.292946, -1.202040, -2.470091, -1.077688,
+-1.391529, -1.361451, -1.216870, -1.615149,
+-1.828850, -1.450746, -1.320862, -1.084409,
+-1.227998, -1.585702, -1.417540, -1.347041,
+-1.264398, -1.287256, -2.297413, -1.075741,
+-1.543335, -1.419175, -1.116076, -1.528520,
+-1.754513, -1.551330, -1.286488, -1.082335,
+-1.185089, -1.636288, -1.392199, -1.382098,
+-1.196416, -1.309063, -2.332659, -1.106804,
+-1.515828, -1.347139, -1.162641, -1.571509,
+-1.763248, -1.524356, -1.332962, -1.058267,
+-1.278840, 0.000100, 0.000100, 0.000100,
+-1.577067, 0.000100, 0.000100, 0.000100,
+-1.585331, 0.000100, 0.000100, 0.000100,
+-1.170603, 0.000100, 0.000100, 0.000100,
+-1.189564, -1.653730, -1.417380, -1.339552,
+-1.076494, -1.348020, -2.793821, -1.083908,
+-1.335665, -1.489445, -1.400451, -1.327900,
+-1.564922, -1.589900, -1.259134, -1.193858,
+-1.165974, -1.762895, -1.387537, -1.319968,
+-1.060081, -1.440125, -2.778800, -1.036874,
+-1.356823, -1.631915, -1.476295, -1.144179,
+-1.552356, -1.644394, -1.317984, -1.116427,
+-1.128674, -1.567696, -1.470539, -1.434596,
+-0.991671, -1.306884, -2.611551, -1.255403,
+-1.266786, -1.487478, -1.395204, -1.408377,
+-1.601340, -1.509013, -1.205371, -1.281340,
+-1.131920, -2.753591, -1.393460, -1.006017,
+-0.901514, -2.705863, -2.954560, -0.744164,
+-1.265302, -2.731009, -1.407433, -0.896680,
+-1.460255, -2.900763, -1.202876, -0.885489,
+-18.855316, -18.855316, 0.000000, -18.855316,
+-17.256889, -17.256889, -0.000000, -17.256889,
+-18.498844, -18.498844, 0.000000, -18.498844,
+-19.144358, -19.144358, 0.000000, -19.144358,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-20.030119, -20.030119, -20.030119, 0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.518318, -1.650989, -1.228402, -1.216383,
+-1.156200, -1.788296, -1.501707, -1.219676,
+-1.085152, -1.409175, -2.928513, -1.009700,
+-1.359202, -1.665857, -1.398529, -1.180433,
+-1.588399, -1.610113, -1.532489, -0.967886,
+-1.195646, -1.629226, -1.437924, -1.331844,
+-1.045906, -1.402171, -2.878129, -1.060378,
+-1.304552, -1.549151, -1.341843, -1.366829,
+-1.600224, -1.601107, -1.285358, -1.139672,
+-1.175097, -1.835931, -1.409790, -1.246354,
+-1.099655, -1.419533, -2.840918, -1.002897,
+-1.366974, -1.666650, -1.447375, -1.136135,
+-1.574024, -1.622230, -1.372203, -1.073574,
+-1.158644, -1.718377, -1.426295, -1.322253,
+-0.994002, -1.441001, -2.797775, -1.101803,
+-1.285864, -1.604010, -1.342650, -1.341906,
+-1.551531, -1.705969, -1.221082, -1.166133,
+-1.173539, -1.749182, -1.426353, -1.285074,
+-1.020789, -1.462937, -2.878790, -1.044319,
+-1.337747, -1.600078, -1.440283, -1.207906,
+-1.534570, -1.623598, -1.400524, -1.076472,
+-1.155850, -1.752763, -1.407940, -1.319500,
+-1.056948, -1.422322, -2.716046, -1.063643,
+-1.242005, -1.579490, -1.397563, -1.355118,
+-1.566043, -1.664274, -1.253461, -1.151105,
+-1.178903, -1.767702, -1.417641, -1.275203,
+-1.017385, -1.426898, -2.732214, -1.098612,
+-1.337898, -1.663610, -1.408090, -1.192244,
+-1.532042, -1.609303, -1.347952, -1.126681,
+-1.170419, -1.682621, -1.438948, -1.321631,
+-1.011322, -1.395969, -2.827050, -1.110233,
+-1.257048, -1.544841, -1.414122, -1.350840,
+-1.561129, -1.634762, -1.267677, -1.159581},
+/*, don[1][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, don[2][]=..., */
+{-1.127750, 0.000100, 0.000100, 0.000100,
+-1.544002, 0.000100, 0.000100, 0.000100,
+-1.618277, 0.000100, 0.000100, 0.000100,
+-1.330011, 0.000100, 0.000100, 0.000100,
+-0.951229, -1.629670, -1.539015, -1.593824,
+-1.003500, -1.345906, -2.201571, -1.337608,
+-0.923325, -1.405163, -1.728389, -1.715317,
+-1.479855, -1.370656, -1.427408, -1.278485,
+-0.959851, -1.258190, -1.581417, -2.062003,
+-0.747872, -1.308530, -2.617591, -1.695911,
+-0.855382, -1.230075, -1.695132, -2.312292,
+-1.441921, -1.047820, -1.459016, -1.712796,
+-0.827904, -1.335001, -1.662214, -2.205829,
+-0.520725, -1.510318, -2.659481, -2.162079,
+-0.685304, -1.251451, -1.848454, -2.947063,
+-1.273415, -1.251192, -1.355332, -1.736699,
+-0.557657, -1.910278, -16.137899, -1.275069,
+-0.837769, -2.018499, -15.532907, -0.833628,
+-0.658662, -1.757273, -15.045153, -1.171372,
+-1.232144, -1.529395, -14.690981, -0.709955,
+-16.187622, -16.187622, -0.000000, -16.187622,
+-15.027452, -15.027452, -0.000001, -15.027452,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.831747, -15.831747, -0.000000, -15.831747,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.887741, -16.887741, -16.887741, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.442502, -5.038336, -1.076170, -4.586354,
+-0.196115, -3.322874, -3.045243, -2.359701,
+-0.559632, -1.540440, -11.849426, -1.540440,
+-0.061645, -4.036274, -4.036274, -3.710854,
+-0.788466, -2.397863, -1.299284, -1.704741,
+-4.264841, -5.913483, -0.022412, -5.220346,
+-2.022278, -2.610056, -0.307489, -2.833194,
+-2.833198, -3.749459, -0.125166, -3.344010,
+-2.970405, -1.553348, -0.349377, -3.440401,
+-0.926765, -2.484890, -1.386294, -1.306252,
+-2.420352, -0.280308, -3.806571, -2.014895,
+-2.528913, -2.659819, -3.100874, -0.216561,
+-1.011607, -1.299284, -1.704741, -1.704741,
+-0.873410, -1.771350, -1.630271, -1.530188,
+-1.188058, -1.542229, -2.286667, -0.968429,
+-0.977777, -1.257361, -1.918757, -1.646824,
+-0.915692, -2.017303, -1.312957, -1.620887,
+-1.537694, -1.438396, -1.758094, -0.979649,
+-1.212187, -1.380178, -2.742371, -0.950616,
+-1.504077, -1.287854, -1.548137, -1.240388,
+-1.796079, -1.534714, -1.524764, -0.914138,
+-1.304949, -1.447449, -1.592631, -1.236998,
+-1.243320, -1.113909, -2.899902, -1.113909,
+-1.448900, -1.362825, -1.228006, -1.530817,
+-1.762735, -1.456568, -1.429466, -1.032954,
+-1.164345, -1.656242, -1.576199, -1.236984,
+-1.316000, -1.215286, -2.441458, -1.055165,
+-1.442202, -1.536512, -1.227091, -1.365241,
+-1.762345, -1.616928, -1.331128, -1.006020,
+-1.171985, -1.716286, -1.399769, -1.332328,
+-1.164384, -1.268283, -2.524701, -1.119360,
+-1.564220, -1.435966, -1.107462, -1.502977,
+-1.909182, -1.533276, -1.437966, -0.919895,
+-1.277291, 0.000100, 0.000100, 0.000100,
+-1.598323, 0.000100, 0.000100, 0.000100,
+-1.608086, 0.000100, 0.000100, 0.000100,
+-1.143497, 0.000100, 0.000100, 0.000100,
+-1.165809, -1.660611, -1.464417, -1.320155,
+-1.052982, -1.381888, -2.799833, -1.081225,
+-1.326450, -1.481961, -1.433251, -1.313571,
+-1.558245, -1.601781, -1.260439, -1.189321,
+-1.142379, -1.779521, -1.433138, -1.295858,
+-1.044407, -1.476549, -2.864414, -1.014294,
+-1.373199, -1.697648, -1.510690, -1.070511,
+-1.522434, -1.671029, -1.363123, -1.084988,
+-1.120639, -1.402516, -1.528047, -1.555932,
+-1.063424, -1.073184, -2.579814, -1.439602,
+-1.324647, -1.259512, -1.442013, -1.542435,
+-1.646956, -1.349078, -1.252099, -1.339483,
+-0.846615, -2.468286, -18.492071, -0.720712,
+-0.848006, -2.652355, -18.499954, -0.690656,
+-0.984568, -2.450274, -18.256395, -0.615946,
+-1.103110, -2.543618, -18.330427, -0.528343,
+-18.855316, -18.855316, 0.000000, -18.855316,
+-17.256889, -17.256889, -0.000000, -17.256889,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-19.144358, -19.144358, 0.000000, -19.144358,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-19.786440, -19.786440, -19.786440, 0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.503070, -1.656781, -1.237191, -1.215359,
+-1.161706, -1.788209, -1.518696, -1.201338,
+-1.092477, -1.408889, -2.945510, -1.000691,
+-1.369898, -1.660583, -1.481816, -1.112871,
+-1.566330, -1.624434, -1.559759, -0.957165,
+-1.188177, -1.643856, -1.449267, -1.319579,
+-1.037927, -1.418877, -2.862330, -1.059322,
+-1.289732, -1.546512, -1.390807, -1.336394,
+-1.594081, -1.631040, -1.269707, -1.138606,
+-1.176095, -1.811145, -1.444429, -1.230414,
+-1.107634, -1.429539, -2.923137, -0.976900,
+-1.393215, -1.665943, -1.507297, -1.075275,
+-1.559653, -1.603848, -1.422090, -1.057104,
+-1.140343, -1.705881, -1.455260, -1.326748,
+-1.000762, -1.420308, -2.853456, -1.099263,
+-1.304924, -1.589897, -1.383572, -1.293975,
+-1.544923, -1.725755, -1.207386, -1.172295,
+-1.153519, -1.758848, -1.456136, -1.276196,
+-1.007714, -1.473459, -2.933208, -1.042450,
+-1.329920, -1.616170, -1.483927, -1.170868,
+-1.528306, -1.635882, -1.436988, -1.047898,
+-1.125503, -1.769071, -1.440820, -1.315701,
+-1.052560, -1.440463, -2.739929, -1.051069,
+-1.229160, -1.574676, -1.437955, -1.335698,
+-1.563993, -1.669438, -1.252803, -1.149972,
+-1.159321, -1.775033, -1.432411, -1.279810,
+-1.014332, -1.447764, -2.758530, -1.082123,
+-1.374357, -1.662961, -1.441048, -1.137120,
+-1.516585, -1.625889, -1.361823, -1.115932,
+-1.151594, -1.669701, -1.484179, -1.313506,
+-1.006378, -1.410411, -2.831643, -1.104126,
+-1.254718, -1.548698, -1.437382, -1.328892,
+-1.547701, -1.672813, -1.258016, -1.154143},
+/*, don[3][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, don[4][]=..., */
+{-1.084913, 0.000100, 0.000100, 0.000100,
+-1.549219, 0.000100, 0.000100, 0.000100,
+-1.861738, 0.000100, 0.000100, 0.000100,
+-1.223316, 0.000100, 0.000100, 0.000100,
+-1.050795, -1.537275, -1.423721, -1.637026,
+-0.872875, -1.291378, -2.669827, -1.435084,
+-0.735450, -1.368698, -1.664162, -2.564947,
+-1.325670, -1.405222, -1.319805, -1.505641,
+-0.859737, -0.978275, -1.998415, -2.730028,
+-0.835375, -0.784300, -3.010997, -2.803358,
+-0.833135, -0.880889, -1.921437, -5.402628,
+-1.246842, -0.827697, -1.520844, -2.864576,
+-0.167783, -3.276570, -3.189559, -2.583425,
+-0.096419, -4.097666, -3.809986, -2.934520,
+-0.101907, -4.280108, -3.027364, -3.363833,
+-0.665998, -2.494116, -1.600305, -1.600305,
+-16.747238, -16.747238, -0.000000, -16.747238,
+-13.321220, -13.321220, -0.000005, -13.321220,
+-13.732133, -13.732133, -0.000003, -13.732133,
+-14.193950, -14.193950, -0.000002, -14.193950,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.895124, -16.895124, -0.000000, -16.895124,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.895124, -16.895124, -16.895124, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.151090, -4.465904, -2.179452, -4.158421,
+-0.554549, -1.557343, -2.650891, -1.935272,
+-0.820987, -1.272967, -3.218792, -1.427116,
+-0.399387, -1.535039, -2.940376, -2.797276,
+-1.734596, -1.734596, -1.223777, -1.041457,
+-0.760663, -1.431905, -16.346027, -1.224891,
+-0.884203, -1.369053, -15.341568, -1.100789,
+-0.773191, -1.288656, -14.260199, -1.336284,
+-1.159856, -1.539345, -14.924074, -0.750889,
+-1.399088, -1.880313, -1.218914, -1.187166,
+-1.145405, -1.572050, -2.104514, -1.043043,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.699803, -2.128798, -1.125893, -0.983666,
+-1.050633, -1.727693, -1.607549, -1.301175,
+-0.968053, -1.585976, -2.323573, -1.147254,
+-1.280235, -1.657791, -1.621751, -1.096866,
+-1.236612, -1.939148, -1.319384, -1.208960,
+-1.337921, -1.554144, -1.794679, -1.021490,
+-1.155771, -1.290503, -2.469156, -1.122710,
+-1.311232, -1.482050, -1.379635, -1.379635,
+-1.426361, -1.612463, -1.560791, -1.048528,
+-1.261436, -1.517115, -1.651772, -1.185153,
+-1.130161, -1.392525, -2.624666, -1.032522,
+-1.351723, -1.432575, -1.371331, -1.391332,
+-1.681933, -1.448318, -1.581850, -0.985034,
+-1.173274, -1.558594, -1.492454, -1.364941,
+-1.210530, -1.472237, -2.496739, -0.941106,
+-1.487714, -1.371181, -1.275871, -1.422474,
+-1.681933, -1.623523, -1.528857, -0.916291,
+-1.071511, -1.629790, -1.770019, -1.233715,
+-1.146035, -1.280513, -2.697577, -1.088141,
+-1.297741, -1.306252, -1.493714, -1.463255,
+-1.654771, -1.584813, -1.525779, -0.950814,
+-1.266370, 0.000100, 0.000100, 0.000100,
+-1.491946, 0.000100, 0.000100, 0.000100,
+-1.541319, 0.000100, 0.000100, 0.000100,
+-1.276135, 0.000100, 0.000100, 0.000100,
+-1.283218, -1.642509, -1.262101, -1.401191,
+-1.173902, -1.223035, -2.889842, -1.076104,
+-1.345962, -1.513016, -1.333223, -1.363204,
+-1.613285, -1.490518, -1.302845, -1.191536,
+-1.218639, -1.739363, -1.243837, -1.425202,
+-1.112642, -1.368903, -2.525986, -1.087835,
+-1.259947, -1.459995, -1.413163, -1.424116,
+-1.653757, -1.556271, -1.188394, -1.227450,
+-1.179645, -2.718660, -1.272526, -1.059798,
+-0.790659, -3.028704, -2.778654, -0.830219,
+-1.111371, -2.961830, -1.332975, -1.034296,
+-1.418085, -2.887375, -1.028677, -1.065311,
+-17.125324, -17.125324, -0.000000, -17.125324,
+-15.363074, -15.363074, -0.000001, -15.363074,
+-16.861937, -16.861937, -0.000000, -16.861937,
+-17.244972, -17.244972, -0.000000, -17.244972,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-18.246090, -18.246090, 0.000000, -18.246090,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-18.246090, -18.246090, -18.246090, 0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.569521, -1.634518, -1.243400, -1.176336,
+-1.117089, -1.582905, -1.518193, -1.393183,
+-1.168783, -1.210916, -3.006781, -1.073295,
+-1.323099, -1.459069, -1.232734, -1.561867,
+-1.698206, -1.393268, -1.567844, -1.021008,
+-0.939280, -1.294295, -16.906553, -1.093625,
+-1.020407, -1.290232, -16.837399, -1.009655,
+-0.995359, -1.202025, -16.654589, -1.109195,
+-1.326903, -1.187006, -17.008043, -0.844973,
+-1.171385, -1.934166, -1.290290, -1.308124,
+-1.073367, -1.386707, -2.606201, -1.095319,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.630663, -1.695422, -1.202359, -1.138829,
+-1.251045, -1.707330, -1.377310, -1.272291,
+-0.991367, -1.460693, -2.644101, -1.121514,
+-1.260420, -1.583989, -1.249350, -1.493351,
+-1.545626, -1.638939, -1.337873, -1.107958,
+-1.249782, -1.696255, -1.374846, -1.283068,
+-1.078600, -1.409146, -2.718208, -1.051012,
+-1.376973, -1.540098, -1.340531, -1.303454,
+-1.523039, -1.586829, -1.315751, -1.174072,
+-1.310988, -1.679441, -1.297958, -1.306082,
+-1.069397, -1.347161, -2.716063, -1.106626,
+-1.242621, -1.566901, -1.327945, -1.436910,
+-1.526374, -1.642559, -1.311676, -1.139981,
+-1.232082, -1.765550, -1.362632, -1.268560,
+-1.035108, -1.340604, -2.609841, -1.172545,
+-1.205373, -1.636560, -1.352253, -1.397904,
+-1.555996, -1.585555, -1.310958, -1.156373,
+-1.230524, -1.701391, -1.356071, -1.317666,
+-1.006103, -1.351503, -2.788383, -1.158403,
+-1.273042, -1.501938, -1.403574, -1.379891,
+-1.608615, -1.505108, -1.344353, -1.148433},
+/*, don[5][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, don[6][]=..., */
+{-1.166591, 0.000100, 0.000100, 0.000100,
+-1.526530, 0.000100, 0.000100, 0.000100,
+-1.571106, 0.000100, 0.000100, 0.000100,
+-1.333818, 0.000100, 0.000100, 0.000100,
+-1.215445, -1.511711, -1.302457, -1.555771,
+-0.914930, -1.439454, -2.408852, -1.300192,
+-0.919144, -1.383449, -1.641278, -1.853452,
+-1.679233, -1.269761, -1.192800, -1.473101,
+-1.077916, -1.039555, -1.630572, -2.204857,
+-0.843300, -1.000248, -2.564947, -2.079441,
+-1.077724, -1.101254, -1.426677, -2.441027,
+-1.538635, -0.904137, -1.343735, -2.123892,
+-0.274809, -3.006387, -2.554403, -2.179710,
+-0.171953, -3.161895, -3.475552, -2.468750,
+-0.217724, -3.001958, -2.639054, -2.596495,
+-1.339288, -1.973010, -1.319086, -1.103975,
+-16.345232, -16.345232, -0.000000, -16.345232,
+-13.774693, -13.774693, -0.000003, -13.774693,
+-14.122998, -14.122998, -0.000002, -14.122998,
+-14.518610, -14.518610, -0.000002, -14.518610,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.642233, -16.642233, -0.000000, -16.642233,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.642233, -16.642233, -16.642233, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.463983, -3.227198, -1.420625, -2.408011,
+-16.178249, -2.085107, -0.860886, -0.792042,
+-13.415039, -1.496642, -1.719784, -0.515817,
+-15.221608, -1.667461, -0.543208, -1.467972,
+-14.234223, -2.826648, -0.963438, -0.581231,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.661381, -14.661381, -0.000001, -14.661381,
+-15.838383, -15.838383, -0.000000, -15.838383,
+-15.761421, -15.761421, -0.000000, -15.761421,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.695320, -1.959621, -1.918463, -0.637305,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.193278, -1.517064, -1.317732, -1.562185,
+-1.223776, -1.361397, -2.071072, -1.128466,
+-1.323774, -1.354546, -1.386294, -1.488077,
+-1.230066, -1.896747, -1.048383, -1.574231,
+-1.425759, -1.486914, -1.523615, -1.153019,
+-1.215249, -1.215249, -2.531861, -1.117132,
+-1.555586, -1.292476, -1.378131, -1.338285,
+-1.804900, -1.604229, -1.194445, -1.103848,
+-1.269971, -1.612066, -1.410200, -1.288839,
+-1.432701, -1.201589, -2.300200, -1.020591,
+-1.488077, -1.365017, -1.173962, -1.562185,
+-1.866016, -1.538512, -1.224162, -1.088988,
+-1.171270, -1.534175, -1.383603, -1.497361,
+-1.386294, -1.275069, -1.950823, -1.113428,
+-1.577611, -1.381332, -1.061978, -1.627004,
+-1.708693, -1.595752, -1.193529, -1.161577,
+-1.243874, -1.812378, -1.307823, -1.279914,
+-1.215768, -1.342061, -2.202261, -1.103650,
+-1.689105, -1.432385, -1.026921, -1.521113,
+-1.802809, -1.577002, -1.245100, -1.076873,
+-1.382168, 0.000100, 0.000100, 0.000100,
+-1.545379, 0.000100, 0.000100, 0.000100,
+-1.379818, 0.000100, 0.000100, 0.000100,
+-1.258390, 0.000100, 0.000100, 0.000100,
+-1.207902, -1.563453, -1.298499, -1.519490,
+-1.126704, -1.283708, -2.251290, -1.225439,
+-1.511144, -1.500562, -1.218158, -1.344909,
+-1.632572, -1.687231, -1.076001, -1.278028,
+-1.568616, -1.535826, -1.170586, -1.323493,
+-1.159405, -1.150894, -2.487589, -1.248750,
+-1.441864, -1.432385, -1.205476, -1.490654,
+-1.876316, -1.521113, -1.071944, -1.251638,
+-1.161048, -2.804674, -1.315199, -1.027517,
+-0.827369, -2.981340, -2.560128, -0.832910,
+-1.146403, -2.931190, -1.015498, -1.321756,
+-1.453063, -2.971525, -0.983059, -1.076585,
+-15.516617, -15.516617, -0.000001, -15.516617,
+-13.721204, -13.721204, -0.000003, -13.721204,
+-15.352379, -15.352379, -0.000001, -15.352379,
+-15.588767, -15.588767, -0.000001, -15.588767,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.644598, -16.644598, -0.000000, -16.644598,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.644598, -16.644598, -16.644598, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.599447, -1.892994, -1.006152, -1.266741,
+-15.045153, -3.126755, -0.837599, -0.647425,
+-14.751605, -2.173966, -1.877701, -0.310156,
+-15.638446, -2.666903, -0.644280, -0.902653,
+-15.377857, -2.835309, -0.787619, -0.720779,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.955276, -13.955276, -0.000003, -13.955276,
+-15.802014, -15.802014, -0.000000, -15.802014,
+-15.954400, -15.954400, -0.000000, -15.954400,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.294373, -1.653514, -1.292220, -1.347483,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.149452, -2.114532, -1.191125, -1.352393,
+-0.960462, -1.755391, -2.522644, -1.010059,
+-1.165304, -1.837972, -1.151605, -1.546917,
+-1.742969, -1.452046, -1.116962, -1.333185,
+-1.287854, -1.700699, -1.196563, -1.429824,
+-1.061257, -1.476772, -2.530930, -1.061257,
+-1.454040, -1.521699, -1.171042, -1.435521,
+-1.759183, -1.583292, -1.072467, -1.271705,
+-1.230449, -1.725144, -1.289043, -1.369804,
+-1.201112, -1.344212, -2.271551, -1.092898,
+-1.459255, -1.673266, -1.113650, -1.379918,
+-1.774291, -1.736070, -1.055502, -1.183623,
+-1.420696, -1.583215, -1.289668, -1.280934,
+-1.040457, -1.333804, -2.766615, -1.138312,
+-1.335864, -1.770436, -1.149452, -1.386294,
+-1.793840, -1.413693, -1.199134, -1.241393,
+-1.312867, -1.820747, -1.098613, -1.445135,
+-1.148623, -1.325829, -2.866271, -1.020448,
+-1.189250, -1.726050, -1.151509, -1.602436,
+-1.690179, -1.532787, -1.177084, -1.233043},
+/*, don[7][]=don[8][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, don[9][]=..., */
+{-1.304321, 0.000100, 0.000100, 0.000100,
+-1.532105, 0.000100, 0.000100, 0.000100,
+-1.609438, 0.000100, 0.000100, 0.000100,
+-1.162950, 0.000100, 0.000100, 0.000100,
+-1.132514, -1.686398, -1.167605, -1.706601,
+-0.927987, -1.327587, -2.480263, -1.363305,
+-1.215768, -1.401484, -1.215768, -1.827568,
+-1.505686, -1.409060, -1.185916, -1.477113,
+-1.129072, -1.069054, -1.441103, -2.336484,
+-0.812009, -0.969401, -2.674144, -2.227859,
+-0.991301, -1.114361, -1.434304, -2.772584,
+-1.255182, -0.810931, -1.669157, -2.499501,
+-0.000001, -15.079638, -15.079638, -15.079638,
+-0.000001, -15.115704, -15.115704, -15.115704,
+-0.000002, -14.441451, -14.441451, -14.441451,
+-0.000003, -13.676253, -13.676253, -13.676253,
+-16.113083, -16.113083, -0.000000, -16.113083,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.113083, -16.113083, -0.000000, -16.113083,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.113083, -16.113083, -16.113083, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000000, -16.113083, -16.113083, -16.113083,
+-0.000000, -16.113083, -16.113083, -16.113083,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.113083, -16.113083, -0.000000, -16.113083,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.121999, -1.726593, -1.482208, -1.311756,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.897942, -1.848917, -1.476678, -1.576051,
+-0.845417, -1.620801, -2.778248, -1.168817,
+-1.001695, -1.431551, -1.508512, -1.756973,
+-1.371479, -1.877414, -1.184268, -1.247182,
+-1.104199, -1.772448, -1.318974, -1.464482,
+-1.266672, -1.173146, -2.365280, -1.155446,
+-1.390913, -1.409605, -1.409605, -1.336846,
+-1.665608, -1.643629, -1.181812, -1.168206,
+-1.062112, -1.766069, -1.467577, -1.375204,
+-1.216396, -1.297741, -2.379543, -1.084819,
+-1.271771, -1.398926, -1.317581, -1.584643,
+-1.864448, -1.670293, -1.271386, -0.977147,
+-1.152680, -1.594512, -1.393842, -1.456362,
+-1.180984, -1.243504, -2.598045, -1.107959,
+-1.259543, -1.490654, -1.243795, -1.596014,
+-1.599216, -1.400088, -1.270035, -1.307306,
+-1.128466, -1.511457, -1.431415, -1.528264,
+-1.223776, -1.361397, -2.476535, -1.006363,
+-1.459054, -1.335001, -1.302211, -1.459054,
+-1.657255, -1.720434, -1.179628, -1.130236,
+-1.234746, 0.000100, 0.000100, 0.000100,
+-2.215564, 0.000100, 0.000100, 0.000100,
+-1.368276, 0.000100, 0.000100, 0.000100,
+-1.062896, 0.000100, 0.000100, 0.000100,
+-1.386294, -11.982954, -0.693160, -1.386294,
+-0.405507, -1.791726, -11.002167, -1.791726,
+-1.029628, -1.945889, -1.252766, -1.540440,
+-1.152684, -1.845814, -1.335002, -1.335002,
+-1.335002, -1.335002, -1.558141, -1.335002,
+-0.405507, -1.098629, -11.002167, -11.002167,
+-1.223779, -1.446918, -1.223779, -1.734591,
+-1.466334, -1.178661, -1.466334, -1.466334,
+-0.000018, -12.043577, -12.043577, -12.043577,
+-0.000020, -11.918417, -11.918417, -11.918417,
+-0.000025, -11.695280, -11.695280, -11.695280,
+-0.000027, -11.608273, -11.608273, -11.608273,
+-13.217681, -13.217681, -0.000005, -13.217681,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.217681, -13.217681, -0.000005, -13.217681,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.217681, -13.217681, -13.217681, -0.000005,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000005, -13.217681, -13.217681, -13.217681,
+-0.000005, -13.217681, -13.217681, -13.217681,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.217681, -13.217681, -0.000005, -13.217681,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.442383, -1.442383, -1.368276, -1.299284,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.466334, -11.775321, -0.773204, -1.178661,
+-0.773204, -2.564880, -2.564880, -0.955522,
+-1.029628, -2.638986, -1.540440, -1.029628,
+-2.014880, -2.014880, -1.098619, -0.916301,
+-0.980838, -1.673968, -1.386294, -1.673968,
+-0.693197, -0.693197, -10.596735, -10.596735,
+-0.628623, -1.321758, -2.014880, -2.707977,
+-1.049828, -2.995652, -1.386294, -1.049828,
+-1.056058, -1.526054, -1.189587, -2.036866,
+-0.223171, -11.512965, -11.512965, -1.609428,
+-2.397832, -1.299286, -0.788474, -1.704734,
+-1.704734, -1.704734, -1.011612, -1.299286,
+-1.335002, -2.251263, -0.998536, -1.335002,
+-0.693167, -1.609428, -11.512965, -1.203979,
+-1.163156, -2.079417, -0.980838, -1.673968,
+-1.203979, -1.203979, -1.203979, -2.302525,
+-0.944470, -2.197197, -1.504075, -1.280936,
+-0.810950, -2.197169, -11.407610, -0.810950,
+-1.163156, -1.386294, -0.826689, -11.982954,
+-1.791743, -1.386294, -1.791743, -0.875482},
+/*, don[10][]=..., */
+{-1.256549, 0.000100, 0.000100, 0.000100,
+-1.426557, 0.000100, 0.000100, 0.000100,
+-1.495300, 0.000100, 0.000100, 0.000100,
+-1.382105, 0.000100, 0.000100, 0.000100,
+-1.272123, -1.837015, -0.997687, -1.651613,
+-1.021263, -1.178655, -2.190254, -1.512857,
+-1.012538, -1.412861, -1.353142, -2.003728,
+-1.850295, -1.396379, -0.872130, -1.730151,
+-1.190420, -0.998529, -1.302211, -2.890368,
+-0.755553, -0.859232, -2.593829, -3.441120,
+-1.032473, -0.962947, -1.530053, -3.088192,
+-1.467874, -0.735507, -1.372564, -3.300445,
+-0.000001, -15.228934, -15.228934, -15.228934,
+-0.000001, -15.390358, -15.390358, -15.390358,
+-0.000001, -14.711600, -14.711600, -14.711600,
+-0.000006, -13.161592, -13.161592, -13.161592,
+-16.292889, -16.292889, -0.000000, -16.292889,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.292889, -16.292889, -0.000000, -16.292889,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.292889, -16.292889, -16.292889, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.292889, -1.988798, -0.227140, -2.713100,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000002, -14.304093, -14.304093, -14.304093,
+-0.000000, -16.065750, -16.065750, -16.065750,
+-0.000004, -13.579793, -13.579793, -13.579793,
+-16.292889, -16.292889, -0.000000, -16.292889,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.525720, -1.725962, -1.581290, -0.919234,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.294148, -1.772638, -1.057019, -1.567844,
+-1.066127, -1.549923, -1.549923, -1.464766,
+-1.131811, -1.252763, -1.358124, -2.004749,
+-1.435927, -1.781288, -0.911042, -1.652455,
+-1.211941, -1.486378, -1.374460, -1.499623,
+-1.487479, -1.094437, -2.180625, -1.119755,
+-1.358409, -1.368459, -1.209989, -1.659514,
+-2.246894, -1.512927, -1.094217, -1.081145,
+-1.323556, -1.500487, -1.310133, -1.422928,
+-1.593177, -1.010782, -2.424473, -1.066352,
+-1.298187, -1.482921, -1.111858, -1.765783,
+-2.133507, -1.239691, -0.980830, -1.527373,
+-1.498212, -1.480820, -1.210530, -1.382380,
+-1.255798, -1.245216, -2.215572, -1.145133,
+-1.695299, -1.278406, -1.055262, -1.661397,
+-1.956308, -1.435774, -1.105533, -1.239064,
+-1.347074, -1.609437, -1.044125, -1.671313,
+-1.161862, -1.342123, -2.153052, -1.172224,
+-1.513556, -1.240263, -1.079921, -1.890850,
+-1.757219, -1.415470, -1.181855, -1.280295,
+-1.352718, 0.000100, 0.000100, 0.000100,
+-1.431190, 0.000100, 0.000100, 0.000100,
+-1.279959, 0.000100, 0.000100, 0.000100,
+-1.494368, 0.000100, 0.000100, 0.000100,
+-1.572395, -1.572395, -1.137080, -1.331235,
+-1.058609, -1.493924, -2.793183, -1.001451,
+-0.952011, -1.845823, -1.098614, -2.097134,
+-0.938273, -2.036874, -1.343735, -1.526055,
+-1.128467, -1.734599, -1.128467, -1.734599,
+-0.971864, -0.971864, -2.224609, -2.001471,
+-1.734598, -0.753775, -1.629238, -1.852379,
+-1.326871, -1.326871, -0.947384, -2.505509,
+-0.000005, -13.270790, -13.270790, -13.270790,
+-0.000005, -13.353481, -13.353481, -13.353481,
+-0.000005, -13.217681, -13.217681, -13.217681,
+-0.000010, -12.577650, -12.577650, -12.577650,
+-14.533352, -14.533352, -0.000001, -14.533352,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.533352, -14.533352, -0.000001, -14.533352,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.533352, -14.533352, -14.533352, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.533352, -0.834374, -0.928562, -1.767661,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.000003, -13.698981, -13.698981, -13.698981,
+-0.000004, -13.604795, -13.604795, -13.604795,
+-0.000009, -12.765699, -12.765699, -12.765699,
+-14.533352, -14.533352, -0.000001, -14.533352,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.195876, -1.685423, -1.164127, -1.609437,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.949083, -1.488076, -1.642226, -1.642226,
+-0.641859, -1.691672, -2.251277, -1.691672,
+-0.980831, -1.856294, -1.519825, -1.386294,
+-1.767657, -1.634128, -1.315677, -1.005525,
+-1.321756, -1.919589, -1.021653, -1.484274,
+-0.955515, -1.584118, -12.873912, -0.890977,
+-1.228667, -1.634128, -1.228667, -1.516346,
+-1.832577, -1.714795, -1.203974, -1.021654,
+-1.116963, -2.215564, -1.174121, -1.368276,
+-0.639086, -2.197211, -2.890333, -1.185626,
+-0.944464, -2.379534, -1.791756, -1.044547,
+-1.529394, -1.696447, -1.049824, -1.386294,
+-1.372308, -1.623621, -1.777771, -0.966845,
+-1.178658, -1.871793, -2.564915, -0.773197,
+-1.326871, -1.183772, -1.406914, -1.694593,
+-1.679640, -1.369487, -1.880309, -0.899486,
+-0.967587, -1.832577, -1.078812, -2.120255,
+-0.875472, -1.568614, -2.772564, -1.163152,
+-1.609435, -1.358124, -0.916295, -1.945902,
+-1.711715, -1.018571, -1.280934, -1.711715},
+/*, don[11][]=don[12][]=[13][]=don[14][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+/*, don[15][]=..., */
+{-1.326871, 0.000100, 0.000100, 0.000100,
+-1.372427, 0.000100, 0.000100, 0.000100,
+-1.380938, 0.000100, 0.000100, 0.000100,
+-1.470283, 0.000100, 0.000100, 0.000100,
+-1.398515, -1.725198, -1.127362, -1.382254,
+-1.133099, -1.438480, -2.630614, -0.997924,
+-1.164862, -1.344447, -1.447988, -1.648658,
+-1.931988, -1.255102, -1.117481, -1.414732,
+-1.018889, -1.873878, -1.514505, -1.325914,
+-1.033230, -1.641477, -2.224621, -1.071944,
+-1.112218, -1.139998, -1.765090, -1.713797,
+-2.286453, -1.330944, -1.050985, -1.256837,
+-14.786291, -1.747307, -1.121602, -0.693148,
+-14.585621, -1.443453, -1.546637, -0.596156,
+-14.513647, -1.453157, -1.144422, -0.803496,
+-14.731803, -1.783790, -0.967585, -0.794074,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.436090, -14.436090, -0.000002, -14.436090,
+-14.883665, -14.883665, -0.000001, -14.883665,
+-15.328439, -15.328439, -0.000001, -15.328439,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.046600, -16.046600, -0.000000, -16.046600,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.046600, -16.046600, -16.046600, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.824992, -3.468961, -0.685657, -3.617380,
+-0.000001, -15.221608, -15.221608, -15.221608,
+-0.000010, -12.577650, -12.577650, -12.577650,
+-0.000001, -15.360944, -15.360944, -15.360944,
+-0.000012, -12.429232, -12.429232, -12.429232,
+-16.046600, -16.046600, -0.000000, -16.046600,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-16.046600, -16.046600, -16.046600, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.456362, -1.832313, -0.975473, -1.470283,
+-1.354546, -1.595707, -1.508696, -1.145791,
+-1.132746, -1.393028, -2.364884, -1.091924,
+-1.454067, -1.429969, -1.245666, -1.429969,
+-1.782456, -1.494775, -1.559313, -0.923325,
+-1.275543, -1.432385, -1.243795, -1.641477,
+-1.283755, -1.353959, -1.993995, -1.112797,
+-1.466337, -1.178655, -1.226665, -1.782189,
+-1.901759, -1.293513, -1.345472, -1.152101,
+-1.229452, -1.745668, -1.424761, -1.229452,
+-1.207965, -1.248787, -2.124254, -1.221388,
+-1.720149, -1.373279, -1.002906, -1.605080,
+-1.738793, -1.487479, -1.302076, -1.119755,
+-1.282754, -1.609437, -1.219241, -1.481604,
+-1.223776, -1.390830, -2.220107, -1.054358,
+-1.800492, -1.120592, -1.203973, -1.566878,
+-1.710533, -1.417547, -1.243194, -1.243194,
+-1.494775, 0.000100, 0.000100, 0.000100,
+-1.305533, 0.000100, 0.000100, 0.000100,
+-1.728388, 0.000100, 0.000100, 0.000100,
+-1.117482, 0.000100, 0.000100, 0.000100,
+-1.098616, -1.568613, -1.568613, -1.386294,
+-0.882395, -1.980990, -3.367210, -0.882395,
+-2.251263, -1.335002, -1.152684, -1.152684,
+-1.358124, -1.763583, -1.157455, -1.358124,
+-2.047681, -1.131405, -1.131405, -1.488076,
+-1.897107, -1.049828, -2.995652, -0.798517,
+-2.036866, -1.189587, -1.749192, -0.938276,
+-1.704744, -1.550595, -1.299284, -1.098615,
+-11.982954, -2.772514, -0.693160, -0.826689,
+-12.644341, -2.047681, -1.236764, -0.543623,
+-12.388411, -2.484873, -0.780166, -0.780166,
+-12.793871, -2.197211, -0.944466, -0.693153,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-11.608273, -11.608273, -0.000027, -11.608273,
+-12.948019, -12.948019, -0.000007, -12.948019,
+-13.199332, -13.199332, -0.000006, -13.199332,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.883173, -13.883173, -0.000003, -13.883173,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.883173, -13.883173, -13.883173, -0.000003,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.453953, -1.305533, -1.305533, -1.494775,
+-0.000012, -12.429232, -12.429232, -12.429232,
+-0.000010, -12.577650, -12.577650, -12.577650,
+-0.000010, -12.577650, -12.577650, -12.577650,
+-0.000012, -12.388411, -12.388411, -12.388411,
+-13.883173, -13.883173, -0.000003, -13.883173,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-13.883173, -13.883173, -13.883173, -0.000003,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.677095, -2.274928, -0.801629, -1.340624,
+-0.916298, -1.609433, -1.609433, -1.609433,
+-1.299286, -1.299286, -1.299286, -1.704734,
+-1.163152, -2.079433, -1.568614, -1.037990,
+-1.945899, -0.847304, -1.722761, -1.386294,
+-1.321757, -1.098616, -1.098616, -2.708014,
+-1.609434, -1.139438, -1.832572, -1.139438,
+-1.299284, -1.704741, -1.704741, -1.011607,
+-2.014891, -1.098616, -1.321757, -1.321757,
+-1.526054, -1.749192, -1.749192, -0.832916,
+-1.163153, -1.386294, -1.856290, -1.268513,
+-0.773197, -2.564915, -1.466336, -1.466336,
+-1.648654, -1.178658, -1.466336, -1.312187,
+-1.856290, -1.268513, -0.900791, -1.856290,
+-1.011607, -1.145136, -3.090961, -1.299284,
+-1.435084, -1.945896, -0.847306, -1.658222,
+-1.268513, -1.673972, -1.067844, -1.673972},
+/*, don[16][]=..., */
+{-1.390956, 0.000100, 0.000100, 0.000100,
+-1.318853, 0.000100, 0.000100, 0.000100,
+-1.354255, 0.000100, 0.000100, 0.000100,
+-1.489074, 0.000100, 0.000100, 0.000100,
+-1.191589, -1.395684, -1.376992, -1.628306,
+-1.007263, -1.343735, -2.302583, -1.294945,
+-1.168571, -1.033230, -1.713797, -1.876316,
+-1.866660, -1.108976, -1.124724, -1.656940,
+-1.013620, -1.402555, -1.752228, -1.524444,
+-0.800246, -1.434304, -2.654802, -1.418043,
+-0.798509, -1.301137, -1.825660, -2.148432,
+-1.559566, -1.258461, -1.219241, -1.559566,
+-14.987994, -1.500986, -1.095521, -0.814808,
+-14.608505, -1.569521, -1.636962, -0.515362,
+-14.166170, -1.218158, -1.004584, -1.084627,
+-14.369398, -1.603707, -0.896376, -0.939548,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-14.483342, -14.483342, -0.000002, -14.483342,
+-14.823470, -14.823470, -0.000001, -14.823470,
+-15.179049, -15.179049, -0.000001, -15.179049,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.967274, -15.967274, -0.000000, -15.967274,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.967274, -15.967274, -15.967274, -0.000000,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.593618, -4.191976, -0.848850, -5.370614,
+-0.000001, -15.373656, -15.373656, -15.373656,
+-0.000023, -11.775321, -11.775321, -11.775321,
+-0.000001, -15.118424, -15.118424, -15.118424,
+-0.000075, -10.596735, -10.596735, -10.596735,
+-15.967274, -15.967274, -0.000000, -15.967274,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.957840, -1.219598, -1.136532, -15.967273,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.098613, -1.828800, -1.135654, -1.688218,
+-0.864506, -1.426460, -2.103345, -1.530001,
+-0.849717, -1.343735, -1.493266, -2.442344,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.383314, -1.208961, -1.336794, -1.670995,
+-1.255455, -1.082613, -2.281305, -1.293921,
+-1.443157, -1.182431, -1.604425, -1.361479,
+-1.945908, -1.722765, -0.847299, -1.386294,
+-1.143733, -1.489478, -1.511457, -1.446919,
+-1.283347, -1.123004, -2.426408, -1.173648,
+-1.420696, -1.221845, -1.056053, -2.154663,
+-2.040219, -1.560647, -1.049823, -1.171183,
+-1.475906, -1.562918, -1.252763, -1.286665,
+-1.218572, -1.263692, -2.393554, -1.107346,
+-1.786985, -1.581134, -0.864998, -1.581134,
+-2.173802, -1.480657, -0.982411, -1.257514,
+-1.232961, -1.380596, -1.358124, -1.609437,
+-1.366492, -1.220780, -1.864329, -1.220780,
+-1.402555, -1.524444, -1.002570, -1.775758,
+-1.830473, -1.324926, -1.191394, -1.309177,
+-1.189987, 0.000100, 0.000100, 0.000100,
+-1.623622, 0.000100, 0.000100, 0.000100,
+-1.465398, 0.000100, 0.000100, 0.000100,
+-1.318241, 0.000100, 0.000100, 0.000100,
+-1.463255, -1.504077, -1.248145, -1.349927,
+-1.203974, -1.203974, -3.149855, -1.029621,
+-1.410987, -1.410987, -1.228666, -1.516347,
+-1.558144, -1.914817, -1.057371, -1.221673,
+-1.020362, -2.056448, -1.196252, -1.563975,
+-1.002154, -1.479076, -2.983134, -1.037245,
+-1.291984, -1.677645, -1.178656, -1.466337,
+-1.599387, -1.704747, -1.039773, -1.337023,
+-13.864305, -2.862188, -0.741939, -0.762142,
+-13.369229, -2.212965, -2.549431, -0.207644,
+-13.742944, -2.923146, -0.819029, -0.682454,
+-13.742944, -2.923146, -0.771401, -0.725939,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-12.345852, -12.345852, -0.000013, -12.345852,
+-14.144817, -14.144817, -0.000002, -14.144817,
+-14.473033, -14.473033, -0.000002, -14.473033,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.082459, -15.082459, -0.000001, -15.082459,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-15.082459, -15.082459, -15.082459, -0.000001,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.667425, -1.199289, -1.127186, -1.682463,
+-0.000004, -13.415039, -13.415039, -13.415039,
+-0.000003, -13.883173, -13.883173, -13.883173,
+-0.000003, -13.955276, -13.955276, -13.955276,
+-0.000005, -13.400002, -13.400002, -13.400002,
+-15.082459, -15.082459, -0.000001, -15.082459,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-0.981769, -1.490091, -0.916291, -15.082459,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.201470, -1.799305, -1.040202, -1.712294,
+-0.798510, -1.817075, -2.590255, -1.163152,
+-1.218158, -1.820331, -1.105680, -1.554629,
+-1.386294, -1.386294, -1.386294, -1.386294,
+-1.159767, -2.131624, -1.133099, -1.403389,
+-0.969403, -1.662545, -2.961804, -0.969403,
+-1.108664, -1.609437, -1.139435, -1.897117,
+-1.536234, -1.425009, -1.191395, -1.425009,
+-1.053763, -1.918757, -1.290151, -1.472472,
+-0.863049, -1.519825, -3.465692, -1.114362,
+-1.439216, -1.483668, -1.108976, -1.578978,
+-2.044752, -1.446919, -1.041455, -1.264598,
+-1.098613, -1.887067, -1.459625, -1.262916,
+-0.780161, -1.791757, -4.276572, -1.018571,
+-1.211091, -1.904235, -1.176000, -1.407800,
+-1.609437, -1.666595, -1.167606, -1.203974,
+-1.317302, -2.233588, -0.957300, -1.422662,
+-1.170073, -1.227231, -2.961804, -1.064713,
+-1.349927, -1.349927, -1.561235, -1.303407,
+-1.508896, -1.553348, -1.147884, -1.386294},
+/*, don[17..24][]=NULL, */
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+};
diff --git a/libsim4/sim4core/sites_donor.H b/libsim4/sim4core/sites_donor.H
new file mode 100644
index 0000000..146943d
--- /dev/null
+++ b/libsim4/sim4core/sites_donor.H
@@ -0,0 +1,11 @@
+#ifndef SITES_DONOR_H
+#define SITES_DONOR_H
+
+/* DO NOT REMOVE or MODIFY !!!! */
+
+#define NUM_MODELS_DON 25   /* first dimension of don[][]: valid row indices are 0..24 */
+#define NUM_VALUES_DON 928  /* second dimension of don[][]: number of doubles stored per row */
+
+extern double don[NUM_MODELS_DON][NUM_VALUES_DON];  /* declaration only; storage is defined outside this header. NOTE(review): presumably the defining initializer must match these dimensions -- confirm before changing either macro */
+
+#endif /* SITES_DONOR_H */
diff --git a/libsim4/sim4core/sites_score.C b/libsim4/sim4core/sites_score.C
new file mode 100644
index 0000000..28e673d
--- /dev/null
+++ b/libsim4/sim4core/sites_score.C
@@ -0,0 +1,2572 @@
+#include "sim4.H"
+
+/* DO NOT REMOVE or MODIFY !!!! */
+
+double score_ex_acc[NUM_VALUES_SCORES] = {
+-1.335601, 0.000100, 0.000100, 0.000100,
+-1.971125, 0.000100, 0.000100, 0.000100,
+-0.730849, 0.000100, 0.000100, 0.000100,
+-2.152442, 0.000100, 0.000100, 0.000100,
+-1.384016, -1.765491, -1.670181, -0.941316,
+-1.355893, -1.392776, -2.624094, -0.864190,
+-1.345398, -1.730732, -1.729560, -0.954391,
+-2.123424, -1.682619, -0.996441, -1.123004,
+-1.065167, -1.683004, -1.656570, -1.277539,
+-1.150894, -1.393289, -2.468421, -1.047909,
+-1.235876, -1.477038, -1.500624, -1.354346,
+-1.750241, -1.494114, -1.103043, -1.309478,
+-1.210955, -1.456643, -1.668048, -1.271341,
+-1.206852, -1.229877, -2.389307, -1.149356,
+-1.280156, -1.436382, -1.367875, -1.471539,
+-1.900284, -1.328006, -1.151218, -1.312237,
+-1.143372, -1.554471, -1.672254, -1.265377,
+-1.180706, -1.344646, -2.311599, -1.099003,
+-1.071127, -1.494756, -1.523684, -1.536342,
+-1.845461, -1.444636, -1.186455, -1.200931,
+-1.234519, -1.538041, -1.343193, -1.455803,
+-1.159987, -1.274190, -2.230314, -1.206116,
+-1.275844, -1.464866, -1.346847, -1.471256,
+-1.854969, -1.410742, -1.038832, -1.403594,
+-1.186122, -1.694813, -1.220024, -1.533681,
+-1.062437, -1.260450, -2.195157, -1.348922,
+-1.156730, -1.458416, -1.401923, -1.576199,
+-1.828613, -1.491434, -0.984148, -1.424800,
+-1.192180, -1.609438, -1.330244, -1.460895,
+-1.069538, -1.468074, -2.133364, -1.177571,
+-1.067348, -1.498477, -1.562900, -1.500212,
+-1.915968, -1.360152, -1.186611, -1.234690,
+-1.190480, -1.676417, -1.238583, -1.518284,
+-1.077172, -1.353662, -2.376455, -1.176777,
+-1.208407, -1.452132, -1.411385, -1.498562,
+-1.977461, -1.423799, -1.011984, -1.357570,
+-1.253323, -1.614677, -1.158607, -1.601630,
+-1.108317, -1.231266, -2.217624, -1.312688,
+-1.138353, -1.494677, -1.305072, -1.691845,
+-1.863840, -1.414671, -1.023563, -1.416317,
+-1.230936, -1.592661, -1.327380, -1.429510,
+-1.138540, -1.337023, -2.174752, -1.192535,
+-1.071345, -1.557398, -1.441982, -1.559172,
+-1.962656, -1.361470, -1.098164, -1.310265,
+-1.200985, -1.614838, -1.205770, -1.605854,
+-1.129803, -1.301737, -2.173714, -1.234123,
+-1.214022, -1.466760, -1.364150, -1.529103,
+-1.836928, -1.439748, -0.999938, -1.444753,
+-1.220956, -1.681434, -1.208440, -1.512845,
+-1.138898, -1.273579, -2.212360, -1.235952,
+-1.117383, -1.493132, -1.369361, -1.640269,
+-1.915643, -1.392717, -0.986107, -1.463921,
+-1.159009, -1.639743, -1.295528, -1.521335,
+-1.113767, -1.455226, -2.176386, -1.124294,
+-1.091340, -1.459191, -1.506578, -1.559971,
+-1.815721, -1.420894, -1.168590, -1.255349,
+-1.262891, -1.574610, -1.245838, -1.503383,
+-1.123450, -1.295129, -2.189671, -1.241233,
+-1.207579, -1.398026, -1.392795, -1.581747,
+-1.963610, -1.369669, -1.005964, -1.428092,
+-1.277933, -1.569773, -1.202048, -1.547794,
+-1.089207, -1.209741, -2.155384, -1.388762,
+-1.195776, -1.443453, -1.370980, -1.572326,
+-1.888331, -1.477326, -0.951149, -1.451870,
+-1.182852, -1.643904, -1.297299, -1.482349,
+-1.122438, -1.418616, -2.104960, -1.169200,
+-1.065810, -1.458160, -1.525526, -1.582889,
+-1.798883, -1.443818, -1.085828, -1.343735,
+-1.295488, -1.631455, -1.188992, -1.486942,
+-1.079254, -1.308955, -2.221961, -1.267081,
+-1.229971, -1.438389, -1.328760, -1.581902,
+-1.940308, -1.461267, -0.939430, -1.454370,
+-1.229625, -1.588928, -1.214000, -1.577713,
+-1.103814, -1.213966, -2.208966, -1.341072,
+-1.128032, -1.511629, -1.317179, -1.671899,
+-1.788370, -1.349624, -1.049337, -1.499419,
+-1.184555, -1.584789, -1.335215, -1.487110,
+-1.102734, -1.384240, -2.140066, -1.204384,
+-1.080200, -1.448178, -1.446532, -1.660263,
+-1.888446, -1.298977, -1.120866, -1.386753,
+-1.257125, -1.594577, -1.209326, -1.541142,
+-1.120647, -1.277587, -2.190125, -1.261181,
+-1.143122, -1.499606, -1.406558, -1.546581,
+-1.874724, -1.458448, -0.977085, -1.437171,
+-1.237747, -1.567878, -1.247977, -1.539953,
+-1.067333, -1.325089, -2.162207, -1.290165,
+-1.171568, -1.424301, -1.258353, -1.799866,
+-1.961282, -1.384601, -1.057113, -1.341548,
+-1.219381, -1.569864, -1.269042, -1.535199,
+-1.125630, -1.395079, -2.138194, -1.171600,
+-1.154665, -1.476627, -1.399043, -1.562569,
+-1.892284, -1.352070, -1.121797, -1.328500,
+-1.273174, -1.577905, -1.180112, -1.577905,
+-1.151614, -1.281105, -2.195094, -1.221474,
+-1.197549, -1.459232, -1.345588, -1.583452,
+-1.890951, -1.487270, -0.960853, -1.424966,
+-1.230290, -1.671687, -1.193083, -1.529807,
+-1.132470, -1.286841, -2.025846, -1.310298,
+-1.164790, -1.456458, -1.331953, -1.655532,
+-1.802441, -1.479312, -0.948904, -1.513471,
+-1.234540, -1.613452, -1.356870, -1.376871,
+-1.079244, -1.383257, -2.142677, -1.230841,
+-1.099347, -1.514228, -1.403705, -1.603214,
+-1.910664, -1.345006, -1.051746, -1.418188,
+-1.232042, -1.624815, -1.189179, -1.575595,
+-1.141172, -1.274703, -2.270131, -1.211524,
+-1.238230, -1.446198, -1.316615, -1.576981,
+-1.967337, -1.303777, -1.028560, -1.464708,
+-1.275878, -1.554886, -1.227546, -1.529750,
+-1.087590, -1.247032, -2.241284, -1.311952,
+-1.149444, -1.505604, -1.361816, -1.584215,
+-1.890155, -1.479980, -0.986981, -1.391873,
+-1.205927, -1.612003, -1.319745, -1.452885,
+-1.154726, -1.309772, -2.180861, -1.197227,
+-1.054008, -1.504077, -1.491871, -1.588222,
+-1.922668, -1.377356, -1.103326, -1.310173,
+-1.290627, -1.580910, -1.184747, -1.545100,
+-1.120620, -1.303989, -2.160765, -1.247442,
+-1.205392, -1.404328, -1.353642, -1.626606,
+-1.845827, -1.400752, -1.009410, -1.464213,
+-1.199175, -1.618591, -1.258856, -1.530622,
+-1.111575, -1.293897, -2.191425, -1.255259,
+-1.133596, -1.475954, -1.370924, -1.631207,
+-1.873750, -1.450602, -0.992430, -1.421509,
+-1.206243, -1.619514, -1.287183, -1.484401,
+-1.175333, -1.297996, -2.227741, -1.169785,
+-1.135417, -1.415790, -1.458772, -1.592304,
+-1.882364, -1.357552, -1.108091, -1.345953,
+-1.312815, -1.592356, -1.176732, -1.517733,
+-1.129410, -1.324668, -2.165735, -1.216655,
+-1.214091, -1.415233, -1.324486, -1.638808,
+-1.827456, -1.439690, -0.952614, -1.530419,
+-1.230672, -1.569762, -1.202320, -1.612880,
+-1.120874, -1.305796, -2.183768, -1.236386,
+-1.100460, -1.534065, -1.319174, -1.691360,
+-1.848282, -1.382408, -1.009656, -1.482037,
+-1.192138, -1.600549, -1.333431, -1.465028,
+-1.112761, -1.336876, -2.204917, -1.209237,
+-1.171048, -1.447301, -1.457089, -1.505797,
+-1.964450, -1.340752, -1.128628, -1.292288,
+-1.270576, -1.656773, -1.154380, -1.544945,
+-1.084854, -1.344365, -2.158925, -1.252133,
+-1.142498, -1.604726, -1.280848, -1.598336,
+-1.943552, -1.419084, -0.981092, -1.427248,
+-1.261372, -1.577526, -1.230282, -1.523079,
+-1.154926, -1.264728, -2.117872, -1.264728,
+-1.135020, -1.492831, -1.286715, -1.727814,
+-2.002417, -1.366058, -0.948974, -1.501738,
+-1.179219, -1.592662, -1.325051, -1.499202,
+-1.137299, -1.365658, -2.198894, -1.160985,
+-1.143767, -1.502423, -1.379694, -1.574372,
+-1.753819, -1.421064, -1.167477, -1.293604,
+-1.289000, -1.635497, -1.155149, -1.539037,
+-1.159237, -1.269238, -2.193310, -1.225258,
+-1.192264, -1.462796, -1.356516, -1.573481,
+-1.875383, -1.414488, -0.972655, -1.489099,
+-1.201797, -1.596451, -1.251610, -1.557376,
+-1.086788, -1.345506, -2.132659, -1.259583,
+-1.161896, -1.554457, -1.231563, -1.692996,
+-1.835473, -1.353477, -1.010639, -1.522640,
+-1.200165, -1.544028, -1.386294, -1.446740,
+-1.074299, -1.377696, -2.040487, -1.286129,
+-1.141172, -1.448234, -1.451434, -1.554304,
+-1.895922, -1.393515, -1.119513, -1.290330,
+-1.278099, -1.589413, -1.202082, -1.528686,
+-1.133991, -1.260209, -2.205311, -1.257224,
+-1.209663, -1.363576, -1.343806, -1.686825,
+-1.809300, -1.416258, -1.049302, -1.412930,
+-1.232036, -1.563347, -1.239828, -1.563347,
+-1.142506, -1.268012, -2.052407, -1.305025,
+-1.153453, -1.503497, -1.309649, -1.649278,
+-1.905347, -1.401502, -1.026262, -1.399752,
+-1.268921, -1.569353, -1.282114, -1.455708,
+-1.132240, -1.336090, -2.353680, -1.140062,
+-1.142072, -1.493786, -1.405637, -1.555371,
+-1.903114, -1.356570, -1.131052, -1.306891,
+-1.258505, -1.542294, -1.232830, -1.557960,
+-1.132543, -1.340082, -2.076069, -1.236012,
+-1.208791, -1.443004, -1.352110, -1.577305,
+-1.834336, -1.406362, -1.049210, -1.406362,
+-1.202462, -1.581855, -1.280934, -1.531665,
+-1.143636, -1.378359, -2.103631, -1.179809,
+-1.113755, -1.533405, -1.290459, -1.710739,
+-1.889009, -1.370414, -1.001542, -1.480762,
+-1.170553, -1.576018, -1.343132, -1.505163,
+-1.124063, -1.364014, -2.263498, -1.153836,
+-1.103987, -1.434931, -1.465954, -1.612121,
+-1.806562, -1.406374, -1.123997, -1.325111,
+-1.293344, -1.609088, -1.144132, -1.574661,
+-1.146531, -1.243449, -2.237143, -1.247883,
+-1.280330, -1.423670, -1.249746, -1.636561,
+-1.869786, -1.379057, -1.007272, -1.474848,
+-1.231473, -1.538100, -1.246328, -1.580898,
+-1.067429, -1.358306, -2.190256, -1.247839,
+-1.189061, -1.519773, -1.234607, -1.683179,
+-1.853150, -1.360364, -1.029269, -1.472192,
+-1.160772, -1.650320, -1.338037, -1.459265,
+-1.165275, -1.321014, -2.227810, -1.159847,
+-1.137573, -1.486277, -1.449789, -1.520645,
+-1.786011, -1.433047, -1.155640, -1.276774,
+-1.273726, -1.526127, -1.221862, -1.569467,
+-1.124703, -1.297393, -2.251947, -1.214651,
+-1.184638, -1.402477, -1.378085, -1.629154,
+-1.885832, -1.346835, -1.049159, -1.435472,
+-1.253333, -1.548590, -1.209055, -1.592472,
+-1.097402, -1.366318, -2.159215, -1.218061,
+-1.162369, -1.517260, -1.245751, -1.713583,
+-1.815168, -1.450890, -0.992609, -1.460073,
+-1.191243, -1.508802, -1.325115, -1.564419,
+-1.097743, -1.384122, -2.183401, -1.193444,
+-1.184943, -1.467336, -1.449636, -1.473850,
+-1.916834, -1.387616, -1.119975, -1.283844,
+-1.260960, -1.606178, -1.170434, -1.581176,
+-1.196543, -1.252552, -2.121249, -1.230637,
+-1.272663, -1.474848, -1.349685, -1.461968,
+-1.949495, -1.398697, -0.974619, -1.455003,
+-1.202384, -1.580450, -1.249786, -1.574676,
+-1.130482, -1.275179, -2.250038, -1.229816,
+-1.194145, -1.477760, -1.278271, -1.658784,
+-1.858725, -1.448801, -1.029202, -1.378386,
+-1.227165, -1.538118, -1.269452, -1.555390,
+-1.190249, -1.377870, -2.119136, -1.128235,
+-1.148682, -1.413005, -1.483933, -1.547446,
+-1.819087, -1.369744, -1.154035, -1.315677,
+-1.321019, -1.563689, -1.154709, -1.567216,
+-1.173361, -1.290303, -2.163656, -1.202032,
+-1.192784, -1.419760, -1.323339, -1.668087,
+-1.892492, -1.354595, -1.037684, -1.439787,
+-1.305802, -1.530746, -1.195561, -1.559998,
+-1.105983, -1.283630, -2.251292, -1.248895,
+-1.250128, -1.527339, -1.251445, -1.559087,
+-1.742785, -1.399683, -1.071944, -1.444134,
+-1.193063, -1.585055, -1.331716, -1.479512,
+-1.163570, -1.339203, -2.174835, -1.164913,
+-1.124019, -1.557180, -1.384367, -1.542634,
+-1.748455, -1.361913, -1.180146, -1.337302,
+-1.340001, -1.599512, -1.122041, -1.558056,
+-1.103363, -1.326506, -2.220324, -1.223259,
+-1.228985, -1.431598, -1.361062, -1.550719,
+-1.837148, -1.435005, -1.052543, -1.372132,
+-1.254306, -1.541988, -1.261086, -1.525902,
+-1.117554, -1.319094, -2.046308, -1.286150,
+-1.211171, -1.436218, -1.329430, -1.610949,
+-1.834968, -1.413187, -1.018570, -1.444331,
+-1.206837, -1.571380, -1.310536, -1.498684,
+-1.141625, -1.367710, -2.140900, -1.176157,
+-1.112502, -1.492081, -1.396306, -1.615202,
+-1.834206, -1.329906, -1.203973, -1.288233,
+-1.302269, -1.576013, -1.190351, -1.527059,
+-1.130388, -1.362076, -2.204436, -1.169086,
+-1.221474, -1.475480, -1.298146, -1.592098,
+-1.922147, -1.421787, -0.983639, -1.433483,
+-1.243504, -1.549269, -1.224861, -1.582668,
+-1.098612, -1.287155, -2.220755, -1.265549,
+-1.158097, -1.535490, -1.321383, -1.590615,
+-1.736951, -1.484592, -1.054449, -1.390445,
+-1.225500, -1.572788, -1.353111, -1.425084,
+-1.159183, -1.265156, -2.313011, -1.187118,
+-1.139970, -1.466987, -1.430498, -1.558924,
+-1.770974, -1.352993, -1.177049, -1.334915,
+-1.329297, -1.460681, -1.224210, -1.564136,
+-1.173926, -1.349868, -2.237171, -1.124501,
+-1.312813, -1.406658, -1.307939, -1.534329,
+-1.874969, -1.419242, -1.065185, -1.346770,
+-1.295130, -1.600617, -1.163070, -1.551921,
+-1.120564, -1.309440, -2.224214, -1.218138,
+-1.234378, -1.477862, -1.303851, -1.563628,
+-1.764318, -1.375227, -1.029143, -1.520507,
+-1.226343, -1.571334, -1.319214, -1.463074,
+-1.129765, -1.313388, -2.257403, -1.192831,
+-1.240312, -1.464441, -1.438507, -1.417921,
+-1.746526, -1.383727, -1.192895, -1.303290,
+-1.267040, -1.632153, -1.169949, -1.549073,
+-1.098612, -1.328065, -2.313123, -1.194929,
+-1.250583, -1.473345, -1.291024, -1.562958,
+-1.872377, -1.422208, -1.005675, -1.430189,
+-1.245719, -1.529425, -1.197751, -1.641273,
+-1.075996, -1.352767, -2.179684, -1.246739,
+-1.224076, -1.500329, -1.247333, -1.631459,
+-1.780586, -1.447647, -1.031035, -1.428366,
+-1.170722, -1.518832, -1.359915, -1.540811,
+-1.115529, -1.318535, -2.164227, -1.238245,
+-1.159120, -1.415898, -1.463225, -1.550971,
+-1.787364, -1.388941, -1.141364, -1.332273,
+-1.286750, -1.553734, -1.175390, -1.591217,
+-1.135847, -1.339364, -2.260368, -1.163069,
+-1.276618, -1.450602, -1.337339, -1.495893,
+-1.848166, -1.411771, -1.074574, -1.357436,
+-1.228730, -1.580892, -1.182330, -1.634855,
+-1.096925, -1.237889, -2.256794, -1.304078,
+-1.243246, -1.480356, -1.281871, -1.577555,
+-1.797017, -1.415353, -1.035689, -1.442291,
+-1.187680, -1.535886, -1.348129, -1.513757,
+-1.178655, -1.273560, -2.247971, -1.181361,
+-1.198796, -1.414411, -1.414411, -1.549730,
+-1.771203, -1.384087, -1.159182, -1.325852,
+-1.243145, -1.632309, -1.146801, -1.617494,
+-1.140095, -1.302539, -2.213612, -1.207093,
+-1.301889, -1.451267, -1.324471, -1.479438,
+-1.912758, -1.387901, -1.098211, -1.312072,
+-1.201256, -1.644238, -1.203843, -1.579951,
+-1.168149, -1.306349, -2.174667, -1.188740,
+-1.245053, -1.580450, -1.254506, -1.509705,
+-1.743662, -1.432584, -1.062129, -1.424159,
+-1.207826, -1.608322, -1.315542, -1.458429,
+-1.217778, -1.313388, -2.155218, -1.141537,
+-1.197917, -1.414791, -1.355638, -1.622430,
+-1.781883, -1.448575, -1.104835, -1.326108,
+-1.242346, -1.624130, -1.207970, -1.535223,
+-1.145132, -1.261081, -2.338921, -1.197199,
+-1.322975, -1.449561, -1.242463, -1.559160,
+-1.821286, -1.404393, -1.073035, -1.383508,
+-1.219449, -1.624259, -1.193574, -1.587610,
+-1.061617, -1.356589, -2.172409, -1.263499,
+-1.199487, -1.456113, -1.311691, -1.628894,
+-1.681399, -1.481933, -1.048532, -1.442921,
+-1.245178, -1.547315, -1.267568, -1.524287,
+-1.154946, -1.309561, -2.203378, -1.188896,
+-1.205380, -1.465698, -1.390523, -1.511585,
+-1.763734, -1.361676, -1.202662, -1.302056,
+-1.126271, 0.000100, 0.000100, 0.000100,
+-1.600280, 0.000100, 0.000100, 0.000100,
+-1.440370, 0.000100, 0.000100, 0.000100,
+-1.439358, 0.000100, 0.000100, 0.000100,
+-1.037596, -1.784752, -1.368686, -1.498725,
+-0.969139, -1.430044, -2.783348, -1.141120,
+-1.132712, -1.548982, -1.416992, -1.500874,
+-1.447700, -1.632824, -1.309515, -1.205436,
+-1.040898, -1.799518, -1.386480, -1.463045,
+-0.963590, -1.488005, -2.767785, -1.109100,
+-1.055648, -1.635182, -1.463108, -1.488989,
+-1.426120, -1.636021, -1.327101, -1.204766,
+-1.129552, -1.740917, -1.366506, -1.400531,
+-0.971197, -1.487555, -2.855071, -1.085022,
+-1.193866, -1.526188, -1.376560, -1.482169,
+-1.548347, -1.605485, -1.326118, -1.135993,
+-1.100949, -1.810472, -1.366225, -1.391107,
+-0.983656, -1.435603, -2.817303, -1.113768,
+-1.088187, -1.674189, -1.373706, -1.502620,
+-1.486649, -1.650060, -1.316997, -1.158701,
+-1.122221, -1.787219, -1.319179, -1.428517,
+-0.984662, -1.476402, -2.714434, -1.103423,
+-1.187851, -1.653014, -1.329156, -1.431494,
+-1.466866, -1.649626, -1.339846, -1.154087,
+-1.090029, -1.768602, -1.397145, -1.402749,
+-0.972428, -1.446867, -2.775311, -1.126395,
+-1.155890, -1.654090, -1.350847, -1.448442,
+-1.461769, -1.657078, -1.307973, -1.180544,
+-1.092893, -1.805843, -1.348418, -1.423921,
+-1.006053, -1.478381, -2.720757, -1.077313,
+-1.207902, -1.588394, -1.355462, -1.430894,
+-1.477705, -1.621565, -1.363536, -1.144147,
+-1.085392, -1.777565, -1.383597, -1.416728,
+-0.951430, -1.464386, -2.832186, -1.127914,
+-1.109729, -1.685765, -1.366911, -1.468999,
+-1.514412, -1.633126, -1.334034, -1.135280,
+-1.147771, -1.716052, -1.364878, -1.396465,
+-0.972880, -1.463530, -2.865694, -1.097676,
+-1.174990, -1.626847, -1.360646, -1.434919,
+-1.501165, -1.583283, -1.350575, -1.162388,
+-1.118144, -1.755150, -1.392228, -1.379472,
+-0.983027, -1.461923, -2.825658, -1.094331,
+-1.125699, -1.651206, -1.397985, -1.441125,
+-1.457759, -1.599563, -1.399085, -1.144337,
+-1.129659, -1.750671, -1.333735, -1.428334,
+-0.992917, -1.431679, -2.847736, -1.100747,
+-1.188522, -1.536829, -1.395296, -1.458834,
+-1.510725, -1.620993, -1.316081, -1.160392,
+-1.101488, -1.754472, -1.402788, -1.391449,
+-0.994619, -1.444694, -2.816815, -1.095055,
+-1.152710, -1.615064, -1.396846, -1.435628,
+-1.485824, -1.621041, -1.352133, -1.147896,
+-1.108305, -1.788701, -1.355932, -1.406506,
+-0.994473, -1.453574, -2.819786, -1.088475,
+-1.196367, -1.594841, -1.376259, -1.417797,
+-1.486754, -1.611917, -1.368832, -1.139478,
+-1.132685, -1.734931, -1.375451, -1.391528,
+-1.007601, -1.384571, -2.824122, -1.123579,
+-1.162633, -1.591122, -1.406459, -1.432832,
+-1.486578, -1.579927, -1.353164, -1.172981,
+-1.110632, -1.786596, -1.370098, -1.390152,
+-1.007251, -1.425332, -2.856373, -1.088050,
+-1.204719, -1.582583, -1.362712, -1.432069,
+-1.505046, -1.627606, -1.348084, -1.133650,
+-1.099678, -1.756700, -1.419732, -1.375831,
+-1.009766, -1.425926, -2.825292, -1.090292,
+-1.148516, -1.630048, -1.380470, -1.445992,
+-1.500193, -1.563789, -1.395807, -1.139627,
+-1.119926, -1.748937, -1.379355, -1.394341,
+-0.974565, -1.408163, -2.893751, -1.131085,
+-1.219123, -1.561759, -1.390892, -1.402868,
+-1.500151, -1.636566, -1.331887, -1.144831,
+-1.107160, -1.728876, -1.385630, -1.419421,
+-0.970204, -1.452216, -2.748959, -1.130262,
+-1.181169, -1.621320, -1.362764, -1.429242,
+-1.485533, -1.616611, -1.371334, -1.135444,
+-1.137082, -1.763959, -1.346103, -1.395824,
+-0.998285, -1.427183, -2.817670, -1.103343,
+-1.180518, -1.596952, -1.390117, -1.421625,
+-1.489618, -1.610295, -1.386794, -1.124430,
+-1.116263, -1.745481, -1.405639, -1.375443,
+-1.008551, -1.394440, -2.866718, -1.107465,
+-1.154666, -1.592375, -1.399592, -1.449457,
+-1.471385, -1.621312, -1.360288, -1.151486,
+-1.115641, -1.797624, -1.373824, -1.372527,
+-1.010982, -1.420947, -2.803558, -1.096436,
+-1.210434, -1.563439, -1.365486, -1.438595,
+-1.528921, -1.560112, -1.348835, -1.159715,
+-1.102655, -1.749688, -1.418201, -1.378191,
+-1.000249, -1.423572, -2.820570, -1.103259,
+-1.144354, -1.599328, -1.398657, -1.458393,
+-1.468434, -1.591596, -1.380171, -1.156523,
+-1.117323, -1.757429, -1.356246, -1.415799,
+-1.003464, -1.390065, -2.808274, -1.126936,
+-1.215629, -1.616261, -1.338513, -1.416336,
+-1.488300, -1.606673, -1.360643, -1.148241,
+-1.126311, -1.743883, -1.404856, -1.364414,
+-0.979704, -1.429952, -2.819989, -1.121930,
+-1.163389, -1.603140, -1.353596, -1.478405,
+-1.493600, -1.620567, -1.355430, -1.140015,
+-1.129339, -1.750269, -1.398605, -1.362247,
+-1.015107, -1.408231, -2.826934, -1.097014,
+-1.198853, -1.590029, -1.394071, -1.400487,
+-1.498766, -1.583641, -1.420240, -1.109537,
+-1.111094, -1.732346, -1.420046, -1.377417,
+-1.010322, -1.416869, -2.811313, -1.098711,
+-1.183085, -1.603880, -1.410708, -1.391980,
+-1.511164, -1.616073, -1.349504, -1.135378,
+-1.092299, -1.762656, -1.416958, -1.384230,
+-0.991101, -1.418866, -2.759871, -1.128420,
+-1.222521, -1.578551, -1.382878, -1.392779,
+-1.512551, -1.622017, -1.333280, -1.144002,
+-1.103343, -1.737175, -1.407523, -1.396449,
+-1.021297, -1.383406, -2.829530, -1.108333,
+-1.164309, -1.593690, -1.386855, -1.448905,
+-1.490179, -1.615717, -1.355478, -1.145404,
+-1.100528, -1.728022, -1.398047, -1.416381,
+-1.004054, -1.417066, -2.865305, -1.095959,
+-1.197338, -1.557674, -1.399188, -1.424702,
+-1.480965, -1.593994, -1.354883, -1.166383,
+-1.080517, -1.755001, -1.424867, -1.397831,
+-1.015071, -1.396198, -2.863323, -1.099589,
+-1.157040, -1.589954, -1.416250, -1.431176,
+-1.513473, -1.612226, -1.360713, -1.127214,
+-1.107955, -1.744252, -1.400473, -1.392271,
+-0.997921, -1.409712, -2.866910, -1.107854,
+-1.201282, -1.543529, -1.429959, -1.401420,
+-1.484209, -1.606771, -1.368944, -1.144420,
+-1.080199, -1.763655, -1.423002, -1.394062,
+-0.987412, -1.446748, -2.876851, -1.091197,
+-1.188613, -1.579726, -1.386390, -1.429833,
+-1.479287, -1.607892, -1.356032, -1.157696,
+-1.119333, -1.754193, -1.412146, -1.359318,
+-1.003084, -1.446069, -2.935178, -1.065261,
+-1.189625, -1.616130, -1.393082, -1.391549,
+-1.497491, -1.617884, -1.356836, -1.137819,
+-1.086190, -1.752897, -1.395915, -1.420379,
+-0.991829, -1.433753, -2.842751, -1.101342,
+-1.189068, -1.592662, -1.385344, -1.419338,
+-1.492075, -1.584301, -1.374462, -1.148766,
+-1.108117, -1.755378, -1.366134, -1.419286,
+-1.019912, -1.388676, -2.823840, -1.106874,
+-1.177407, -1.587164, -1.420713, -1.402924,
+-1.490007, -1.602995, -1.381971, -1.132418,
+-1.084284, -1.741287, -1.456347, -1.372488,
+-1.006540, -1.438776, -2.782787, -1.092327,
+-1.182775, -1.610384, -1.410401, -1.387429,
+-1.508961, -1.583511, -1.352623, -1.155036,
+-1.079251, -1.749611, -1.416770, -1.411356,
+-1.005714, -1.437290, -2.877651, -1.077695,
+-1.205867, -1.590660, -1.408968, -1.376814,
+-1.472634, -1.641902, -1.380314, -1.122183,
+-1.089408, -1.733487, -1.441638, -1.384787,
+-0.995813, -1.454324, -2.843641, -1.082329,
+-1.202403, -1.622215, -1.373014, -1.391405,
+-1.497100, -1.586819, -1.381384, -1.138135,
+-1.092886, -1.740050, -1.395978, -1.420264,
+-0.992713, -1.435504, -2.820122, -1.103115,
+-1.218408, -1.586061, -1.371750, -1.402804,
+-1.494981, -1.585067, -1.375374, -1.145490,
+-1.082524, -1.785688, -1.424208, -1.374842,
+-1.010960, -1.425555, -2.893535, -1.077707,
+-1.206546, -1.578786, -1.387988, -1.406624,
+-1.450446, -1.638883, -1.384456, -1.136696,
+-1.106158, -1.739669, -1.404900, -1.393511,
+-0.969419, -1.465799, -2.872858, -1.098814,
+-1.173950, -1.611365, -1.372720, -1.436221,
+-1.489675, -1.586248, -1.352916, -1.166753,
+-1.079295, -1.749216, -1.419433, -1.408934,
+-1.025706, -1.448621, -2.761051, -1.069042,
+-1.213167, -1.568222, -1.391085, -1.404355,
+-1.474529, -1.628472, -1.362878, -1.142686,
+-1.112156, -1.744808, -1.391385, -1.395369,
+-0.986754, -1.412570, -2.837427, -1.123553,
+-1.206402, -1.536012, -1.397607, -1.434215,
+-1.500579, -1.571887, -1.383285, -1.143842,
+-1.119211, -1.746448, -1.414993, -1.362020,
+-1.035625, -1.423103, -2.804184, -1.068642,
+-1.195478, -1.585555, -1.394862, -1.407562,
+-1.484095, -1.608443, -1.385087, -1.130749,
+-1.084609, -1.764792, -1.447404, -1.364283,
+-1.014454, -1.437426, -2.839277, -1.074720,
+-1.183079, -1.590897, -1.400600, -1.412745,
+-1.509046, -1.621280, -1.345687, -1.136710,
+-1.073018, -1.727659, -1.469052, -1.385567,
+-0.992790, -1.452865, -2.919631, -1.074099,
+-1.183375, -1.596519, -1.418848, -1.389646,
+-1.488610, -1.613039, -1.376595, -1.131354,
+-1.111813, -1.772183, -1.389889, -1.378431,
+-0.978005, -1.449443, -2.820122, -1.109755,
+-1.168341, -1.598226, -1.388257, -1.438207,
+-1.496772, -1.623791, -1.383255, -1.114018,
+-1.113110, -1.718779, -1.432014, -1.372977,
+-1.005968, -1.405631, -2.846268, -1.105526,
+-1.168550, -1.573256, -1.403542, -1.443547,
+-1.482778, -1.605955, -1.394000, -1.126349,
+-1.069216, -1.788267, -1.437410, -1.378602,
+-1.000628, -1.434713, -2.841763, -1.091106,
+-1.201371, -1.625782, -1.383885, -1.378892,
+-1.476222, -1.621212, -1.382028, -1.130807,
+-1.080710, -1.760117, -1.425596, -1.393296,
+-0.998660, -1.419294, -2.893939, -1.095463,
+-1.189364, -1.612721, -1.396954, -1.390738,
+-1.498562, -1.633154, -1.363871, -1.122187,
+-1.105207, -1.756754, -1.417132, -1.371031,
+-1.029781, -1.409988, -2.903988, -1.067217,
+-1.191096, -1.606963, -1.404178, -1.386104,
+-1.504216, -1.596247, -1.382963, -1.126029,
+-1.069367, -1.757270, -1.451256, -1.386361,
+-0.969500, -1.432663, -2.910444, -1.115952,
+-1.174583, -1.592871, -1.414729, -1.407642,
+-1.487322, -1.640764, -1.352461, -1.134484,
+-1.080790, -1.772033, -1.401940, -1.408267,
+-0.996846, -1.402447, -2.980034, -1.096096,
+-1.202589, -1.590676, -1.405514, -1.384074,
+-1.492060, -1.607198, -1.390227, -1.122013,
+-1.093764, -1.746183, -1.447642, -1.364595,
+-0.986490, -1.436341, -2.861504, -1.102205,
+-1.204614, -1.577734, -1.391693, -1.406105,
+-1.495258, -1.588796, -1.371615, -1.145890,
+-1.088841, -1.734206, -1.436496, -1.389925,
+-0.995666, -1.427561, -2.823377, -1.104958,
+-1.199838, -1.571801, -1.410977, -1.397633,
+-1.511537, -1.603247, -1.365508, -1.130300,
+-1.097292, -1.753322, -1.421877, -1.379249,
+-1.007028, -1.420077, -2.814561, -1.099401,
+-1.191675, -1.599676, -1.416958, -1.378821,
+-1.517077, -1.587958, -1.357937, -1.142204,
+-1.079666, -1.729765, -1.444051, -1.398388,
+-1.008808, -1.401458, -2.825029, -1.109263,
+-1.183541, -1.602676, -1.386294, -1.416959,
+-1.502306, -1.612852, -1.368579, -1.128257,
+-1.085608, -1.731580, -1.453419, -1.380196,
+-1.005278, -1.414121, -2.867149, -1.096443,
+-1.175017, -1.617817, -1.400493, -1.400882,
+-1.487917, -1.637930, -1.387419, -1.108492,
+-1.121119, -1.751412, -1.414239, -1.356946,
+-1.011526, -1.404681, -2.827524, -1.103429,
+-1.212066, -1.614423, -1.383179, -1.375819,
+-1.512540, -1.600635, -1.371552, -1.126489,
+-1.088032, -1.719191, -1.445506, -1.393185,
+-1.019556, -1.400402, -2.808290, -1.101298,
+-1.202394, -1.592986, -1.364357, -1.424356,
+-1.475416, -1.621276, -1.380462, -1.132559,
+-1.103914, -1.743120, -1.447349, -1.353778,
+-0.986356, -1.435104, -2.844398, -1.106220,
+-1.210721, -1.575776, -1.417867, -1.374601,
+-1.518314, -1.589986, -1.375063, -1.126497,
+-1.111996, -1.743615, -1.395376, -1.392429,
+-1.015902, -1.422867, -2.888794, -1.075114,
+-1.222221, -1.592663, -1.382219, -1.382219,
+-1.495377, -1.618006, -1.359279, -1.137261,
+-1.090528, -1.729093, -1.441600, -1.386429,
+-1.025052, -1.437336, -2.809900, -1.068710,
+-1.202026, -1.561214, -1.394436, -1.420632,
+-1.483379, -1.592273, -1.397997, -1.131416,
+-1.085173, -1.716288, -1.436478, -1.407881,
+-1.018268, -1.420209, -2.788804, -1.091770,
+-1.214908, -1.599707, -1.370256, -1.397295,
+-1.509196, -1.610272, -1.400072, -1.101116,
+-1.075000, -1.751899, -1.460100, -1.374188,
+-1.013008, -1.401503, -2.867840, -1.097134,
+-1.220411, -1.570351, -1.385717, -1.399280,
+-1.480249, -1.631263, -1.392959, -1.113531,
+-1.093547, -1.752635, -1.438758, -1.368709,
+-1.009720, -1.442346, -2.847688, -1.074911,
+-1.203518, -1.590309, -1.396885, -1.391772,
+-1.506910, -1.619840, -1.374975, -1.115870,
+-1.093045, -1.766362, -1.396785, -1.400553,
+-0.991178, -1.453055, -2.863656, -1.084868,
+-1.225684, -1.565904, -1.391993, -1.390444,
+-1.472658, -1.620457, -1.381920, -1.133885,
+-1.111509, -1.780463, -1.427281, -1.337819,
+-1.008894, -1.430317, -2.788068, -1.094805,
+-1.214274, -1.581644, -1.412639, -1.370665,
+-1.490178, -1.618602, -1.381363, -1.123143,
+-1.093618, -1.761577, -1.448472, -1.353653,
+-1.015050, -1.394369, -2.919128, -1.091700,
+-1.231212, -1.568539, -1.409596, -1.364649,
+-1.491935, -1.627550, -1.379019, -1.118323,
+-1.091818, -1.756495, -1.438218, -1.368868,
+-0.991826, -1.438063, -2.873564, -1.092978,
+-1.230992, -1.583161, -1.408599, -1.354072,
+-1.528802, -1.574514, -1.394996, -1.113942,
+-1.098813, -1.738982, -1.408348, -1.400445,
+-1.013380, -1.372857, -2.924011, -1.108935,
+-1.213430, -1.593020, -1.392742, -1.381825,
+-1.508419, -1.638652, -1.366635, -1.110072,
+-1.091017, -1.742816, -1.461844, -1.357551,
+-0.997190, -1.418709, -2.911225, -1.094670,
+-1.221318, -1.599664, -1.378222, -1.381633,
+-1.486327, -1.594584, -1.383942, -1.138759,
+-1.076303, -1.755954, -1.436422, -1.391801,
+-0.993697, -1.447438, -2.855403, -1.087406,
+-1.221016, -1.574256, -1.389901, -1.391074,
+-1.498641, -1.629441, -1.367970, -1.121153,
+-1.075410, -1.758174, -1.433135, -1.394637,
+-1.002481, -1.404941, -2.883767, -1.103443,
+-1.242030, -1.584432, -1.404295, -1.344774,
+-1.498074, -1.630279, -1.382150, -1.110098,
+-1.057530, -1.812951, -1.432606, -1.382982,
+-0.995229, -1.455031, -2.806057, -1.089083,
+-1.223332, -1.581554, -1.389968, -1.382249,
+-1.504249, -1.606143, -1.394442, -1.111137,
+-1.081093, -1.756007, -1.485396, -1.340826,
+-1.009673, -1.397732, -2.922146, -1.094559,
+-1.194712, -1.576307, -1.400837, -1.410256,
+-1.491029, -1.657111, -1.362780, -1.114066,
+-1.092304, -1.744072, -1.440955, -1.374174,
+-1.007079, -1.400416, -2.900640, -1.098913,
+-1.234793, -1.549192, -1.403799, -1.382208,
+-1.477759, -1.625122, -1.384035, -1.125783,
+-1.082985, -1.726922, -1.461630, -1.379407,
+-1.005128, -1.425216, -2.868765, -1.088335,
+-1.202197, -1.576070, -1.386585, -1.415710,
+-1.484746, -1.605507, -1.362602, -1.149931,
+-1.094802, -1.772336, -1.406639, -1.384322,
+-1.007059, -1.438318, -2.875229, -1.075936,
+-1.191076, -1.560781, -1.436729, -1.392595,
+-1.501040, -1.618783, -1.386714, -1.111494};
+
+double score_in_acc[NUM_VALUES_SCORES] = {
+-1.298283, 0.000100, 0.000100, 0.000100,
+-1.622017, 0.000100, 0.000100, 0.000100,
+-1.458005, 0.000100, 0.000100, 0.000100,
+-1.214697, 0.000100, 0.000100, 0.000100,
+-1.145865, -1.703467, -1.369585, -1.403288,
+-1.117524, -1.324925, -2.653849, -1.088537,
+-1.348761, -1.632038, -1.240590, -1.363774,
+-1.588764, -1.711509, -1.299845, -1.071031,
+-1.141364, -1.769972, -1.417617, -1.316812,
+-1.161662, -1.357406, -2.658621, -1.022590,
+-1.405044, -1.571298, -1.270193, -1.323918,
+-1.525853, -1.719786, -1.286922, -1.116754,
+-1.172124, -1.799542, -1.339070, -1.336257,
+-1.089115, -1.356116, -2.768078, -1.069898,
+-1.428221, -1.591932, -1.246820, -1.311871,
+-1.512183, -1.698999, -1.303796, -1.123297,
+-1.118658, -1.866122, -1.370615, -1.329504,
+-1.158005, -1.318347, -2.745148, -1.038256,
+-1.294532, -1.622510, -1.297697, -1.364880,
+-1.585332, -1.682929, -1.295613, -1.091950,
+-1.175124, -1.762440, -1.361920, -1.333986,
+-1.148780, -1.361130, -2.802596, -1.005250,
+-1.343147, -1.612972, -1.314835, -1.305031,
+-1.574873, -1.677744, -1.282395, -1.112309,
+-1.204841, -1.785457, -1.348706, -1.298416,
+-1.090608, -1.367881, -2.619658, -1.089115,
+-1.350808, -1.562117, -1.381899, -1.272337,
+-1.542852, -1.632656, -1.290686, -1.152843,
+-1.132323, -1.808473, -1.330078, -1.388876,
+-1.126586, -1.356210, -2.695810, -1.047713,
+-1.406026, -1.599277, -1.307756, -1.264375,
+-1.464895, -1.776886, -1.285941, -1.129041,
+-1.172914, -1.800796, -1.339545, -1.334066,
+-1.116904, -1.383708, -2.902354, -1.001938,
+-1.385423, -1.602487, -1.321320, -1.267253,
+-1.560902, -1.669979, -1.324179, -1.091296,
+-1.142221, -1.740058, -1.335298, -1.417576,
+-1.097597, -1.407337, -2.857893, -1.010205,
+-1.368180, -1.582172, -1.316975, -1.302111,
+-1.529619, -1.665285, -1.314267, -1.122166,
+-1.173501, -1.738112, -1.422474, -1.295543,
+-1.121358, -1.370921, -2.659760, -1.048895,
+-1.331900, -1.661338, -1.257432, -1.340339,
+-1.585292, -1.706759, -1.297611, -1.077412,
+-1.141557, -1.846318, -1.398164, -1.288039,
+-1.080940, -1.453179, -2.684389, -1.025451,
+-1.385411, -1.590607, -1.216727, -1.387179,
+-1.575101, -1.658963, -1.290183, -1.116399,
+-1.129464, -1.839745, -1.368877, -1.333538,
+-1.202765, -1.361236, -2.575360, -1.001938,
+-1.391921, -1.710158, -1.233316, -1.275682,
+-1.567292, -1.629505, -1.347328, -1.092341,
+-1.141873, -1.749575, -1.412487, -1.334102,
+-1.118394, -1.402927, -2.750161, -1.012246,
+-1.376783, -1.661600, -1.269187, -1.284764,
+-1.534313, -1.688213, -1.356004, -1.073359,
+-1.112985, -1.749840, -1.426994, -1.356377,
+-1.135721, -1.342624, -2.804386, -1.029722,
+-1.425389, -1.581417, -1.266159, -1.301993,
+-1.582710, -1.669579, -1.322885, -1.079132,
+-1.174950, -1.805213, -1.353433, -1.315373,
+-1.149802, -1.343735, -2.737633, -1.028040,
+-1.417797, -1.607193, -1.252314, -1.303938,
+-1.500160, -1.724404, -1.306176, -1.115436,
+-1.127993, -1.761055, -1.385552, -1.369364,
+-1.142369, -1.359143, -2.584507, -1.053762,
+-1.415975, -1.592558, -1.262853, -1.305413,
+-1.565542, -1.692378, -1.338019, -1.065432,
+-1.188735, -1.806730, -1.337009, -1.314631,
+-1.157453, -1.403994, -2.715597, -0.983674,
+-1.296644, -1.671192, -1.264946, -1.361505,
+-1.588939, -1.756872, -1.258698, -1.081017,
+-1.159958, -1.725224, -1.407720, -1.333069,
+-1.113047, -1.418660, -2.643312, -1.026440,
+-1.391427, -1.672839, -1.259827, -1.273453,
+-1.481151, -1.720110, -1.286072, -1.148149,
+-1.106656, -1.799803, -1.419618, -1.338649,
+-1.116414, -1.447828, -2.688522, -0.995642,
+-1.404210, -1.604685, -1.262993, -1.306818,
+-1.610432, -1.718930, -1.278948, -1.071160,
+-1.153233, -1.819835, -1.377840, -1.308554,
+-1.149025, -1.325424, -2.743720, -1.041193,
+-1.336838, -1.578454, -1.288128, -1.365532,
+-1.557740, -1.753983, -1.268907, -1.093128,
+-1.152372, -1.830464, -1.409035, -1.275127,
+-1.174060, -1.333467, -2.773612, -1.008152,
+-1.394131, -1.579955, -1.295964, -1.300730,
+-1.530692, -1.763423, -1.259558, -1.113590,
+-1.175127, -1.771869, -1.404313, -1.288552,
+-1.141172, -1.478933, -2.630860, -0.965853,
+-1.427116, -1.589851, -1.222458, -1.341174,
+-1.563006, -1.747181, -1.297560, -1.069930,
+-1.152995, -1.805080, -1.391221, -1.305351,
+-1.116640, -1.492064, -2.739002, -0.959513,
+-1.346907, -1.594432, -1.350235, -1.281020,
+-1.572513, -1.724873, -1.318812, -1.058926,
+-1.184853, -1.739655, -1.399289, -1.302485,
+-1.148460, -1.297347, -2.749599, -1.062282,
+-1.398987, -1.613953, -1.270716, -1.296734,
+-1.559404, -1.733972, -1.312591, -1.067011,
+-1.148216, -1.886465, -1.389703, -1.265701,
+-1.205310, -1.360576, -2.762618, -0.965582,
+-1.377515, -1.615186, -1.231388, -1.358467,
+-1.479472, -1.735464, -1.413514, -1.042317,
+-1.128164, -1.778941, -1.433464, -1.312766,
+-1.205196, -1.391020, -2.734053, -0.950391,
+-1.378593, -1.558031, -1.280661, -1.348369,
+-1.582745, -1.720441, -1.299283, -1.070389,
+-1.158720, -1.753726, -1.368760, -1.352619,
+-1.132777, -1.392932, -2.692898, -1.016575,
+-1.346629, -1.729280, -1.232080, -1.306423,
+-1.579218, -1.722319, -1.292700, -1.076800,
+-1.153483, -1.770845, -1.438427, -1.283830,
+-1.100713, -1.381062, -2.716480, -1.049972,
+-1.453936, -1.639578, -1.223087, -1.280246,
+-1.584690, -1.697130, -1.324770, -1.061559,
+-1.161497, -1.726234, -1.408357, -1.329972,
+-1.122985, -1.352051, -2.756328, -1.042821,
+-1.366639, -1.634903, -1.260068, -1.322639,
+-1.581961, -1.657988, -1.328745, -1.081469,
+-1.206581, -1.722394, -1.346530, -1.339402,
+-1.201495, -1.350707, -2.756344, -0.976373,
+-1.415229, -1.649907, -1.265545, -1.262390,
+-1.559619, -1.710659, -1.300375, -1.088837,
+-1.151589, -1.757319, -1.406914, -1.322526,
+-1.133470, -1.392865, -2.718176, -1.011349,
+-1.343589, -1.630157, -1.328651, -1.278842,
+-1.538740, -1.782208, -1.329705, -1.042628,
+-1.158253, -1.806897, -1.366341, -1.321389,
+-1.179569, -1.359516, -2.602168, -1.016965,
+-1.442838, -1.552482, -1.210506, -1.370518,
+-1.626707, -1.651198, -1.304156, -1.078213,
+-1.216685, -1.744185, -1.481141, -1.198574,
+-1.160668, -1.337291, -2.852797, -1.003800,
+-1.365241, -1.611568, -1.286738, -1.313281,
+-1.586128, -1.712145, -1.378789, -1.013568,
+-1.177988, -1.755419, -1.395878, -1.303233,
+-1.200645, -1.307483, -2.563950, -1.045546,
+-1.443508, -1.624633, -1.253234, -1.268197,
+-1.638091, -1.683333, -1.310915, -1.048902,
+-1.160245, -1.718607, -1.506045, -1.253550,
+-1.220805, -1.345968, -2.682112, -0.977365,
+-1.391116, -1.634677, -1.253419, -1.306891,
+-1.631241, -1.698267, -1.323899, -1.035026,
+-1.189085, -1.730086, -1.447889, -1.261725,
+-1.187804, -1.324379, -2.675646, -1.020875,
+-1.478874, -1.488924, -1.323581, -1.271859,
+-1.607579, -1.696562, -1.308736, -1.060872,
+-1.139975, -1.758488, -1.457200, -1.291056,
+-1.198848, -1.310261, -2.876024, -0.987919,
+-1.445783, -1.571075, -1.322438, -1.237420,
+-1.671989, -1.700829, -1.310975, -1.021547,
+-1.146307, -1.770461, -1.496668, -1.244577,
+-1.121967, -1.402136, -2.635135, -1.031192,
+-1.460009, -1.572059, -1.361069, -1.191510,
+-1.555920, -1.681375, -1.316319, -1.094322,
+-1.177529, -1.710966, -1.473374, -1.265427,
+-1.190159, -1.333896, -2.675406, -1.011978,
+-1.492018, -1.489945, -1.329603, -1.254865,
+-1.494918, -1.662604, -1.391925, -1.086213,
+-1.149385, -1.802479, -1.488023, -1.229427,
+-1.269360, -1.295157, -2.385096, -1.041136,
+-1.474900, -1.605184, -1.348642, -1.168756,
+-1.656966, -1.690639, -1.280618, -1.058384,
+-1.152680, -1.626732, -1.490415, -1.338036,
+-1.164243, -1.307589, -2.748562, -1.040297,
+-1.360435, -1.584459, -1.367502, -1.260016,
+-1.688485, -1.672037, -1.353435, -0.996998,
+-1.193228, -1.680168, -1.476678, -1.265772,
+-1.212720, -1.283884, -2.831789, -1.003086,
+-1.396340, -1.521119, -1.404062, -1.243289,
+-1.633779, -1.533391, -1.383176, -1.084041,
+-1.131207, -1.652185, -1.575601, -1.277109,
+-1.206684, -1.311542, -2.883177, -0.979627,
+-1.381881, -1.599658, -1.377975, -1.221321,
+-1.559634, -1.663175, -1.321991, -1.097673,
+-1.161205, -1.697471, -1.547712, -1.234393,
+-1.172611, -1.307240, -2.799089, -1.024402,
+-1.475968, -1.552583, -1.380856, -1.176426,
+-1.615688, -1.678765, -1.292969, -1.078260,
+-1.206296, -1.690351, -1.491397, -1.233802,
+-1.212325, -1.350475, -2.829859, -0.956184,
+-1.429438, -1.512994, -1.382463, -1.240127,
+-1.703363, -1.591425, -1.310955, -1.064988,
+-1.151942, -1.739983, -1.527910, -1.232834,
+-1.298971, -1.297257, -2.761883, -0.939727,
+-1.414751, -1.483604, -1.462856, -1.208580,
+-1.674384, -1.634474, -1.364554, -1.016037,
+-1.122974, -1.690016, -1.544599, -1.284551,
+-1.235359, -1.303662, -2.799155, -0.975848,
+-1.426116, -1.434484, -1.453573, -1.245532,
+-1.631318, -1.620768, -1.366625, -1.045216,
+-1.165331, -1.633839, -1.562724, -1.260765,
+-1.217656, -1.301576, -2.934865, -0.970651,
+-1.345412, -1.506342, -1.536264, -1.195130,
+-1.637784, -1.551563, -1.414957, -1.047718,
+-1.174527, -1.624557, -1.620659, -1.216492,
+-1.272260, -1.329985, -2.691130, -0.947996,
+-1.366220, -1.474374, -1.502744, -1.225912,
+-1.584006, -1.633979, -1.355266, -1.073476,
+-1.206041, -1.572185, -1.649744, -1.200879,
+-1.267592, -1.336935, -2.749850, -0.936738,
+-1.309161, -1.514739, -1.528958, -1.226469,
+-1.612766, -1.582913, -1.424151, -1.036834,
+-1.152225, -1.645935, -1.664285, -1.197630,
+-1.294772, -1.359531, -2.745825, -0.903712,
+-1.394431, -1.500034, -1.529520, -1.163907,
+-1.602047, -1.636507, -1.385701, -1.039280,
+-1.167465, -1.657543, -1.703770, -1.151245,
+-1.301788, -1.324456, -2.681209, -0.932605,
+-1.403536, -1.473306, -1.545801, -1.164862,
+-1.551844, -1.565656, -1.468956, -1.052608,
+-1.149824, -1.605157, -1.630724, -1.249453,
+-1.290241, -1.264181, -2.943531, -0.942517,
+-1.304290, -1.505188, -1.605008, -1.184918,
+-1.560259, -1.639609, -1.418269, -1.039222,
+-1.142960, -1.591712, -1.800934, -1.163455,
+-1.256629, -1.382217, -2.845545, -0.900752,
+-1.368236, -1.468548, -1.696499, -1.103040,
+-1.599954, -1.551023, -1.520592, -1.001085,
+-1.123103, -1.510434, -1.846211, -1.217116,
+-1.332298, -1.365316, -2.867628, -0.858030,
+-1.350102, -1.465518, -1.732147, -1.099842,
+-1.576476, -1.622477, -1.491624, -0.991873,
+-1.098226, -1.563375, -1.902803, -1.177786,
+-1.364002, -1.356972, -3.182572, -0.808679,
+-1.283400, -1.578517, -1.747200, -1.071879,
+-1.594149, -1.564256, -1.478361, -1.022585,
+-1.089822, -1.526594, -1.951685, -1.189365,
+-1.292020, -1.351816, -3.183077, -0.855532,
+-1.224284, -1.536010, -1.873073, -1.087249,
+-1.572873, -1.590312, -1.469376, -1.025490,
+-1.140688, -1.477160, -1.993420, -1.152370,
+-1.368373, -1.352513, -3.019521, -0.825388,
+-1.306903, -1.476802, -1.836329, -1.074190,
+-1.608595, -1.565977, -1.488318, -1.007323,
+-1.171335, -1.414913, -2.051619, -1.143902,
+-1.291423, -1.328795, -3.117248, -0.876961,
+-1.262329, -1.538477, -1.925760, -1.031328,
+-1.744068, -1.562269, -1.549265, -0.908500,
+-1.189067, -1.438308, -2.101403, -1.090999,
+-1.403285, -1.363658, -3.037634, -0.797309,
+-1.280513, -1.493606, -2.068970, -0.990939,
+-1.740625, -1.503680, -1.477796, -0.983128,
+-1.209863, -1.505547, -2.069425, -1.039568,
+-1.474199, -1.230376, -2.996159, -0.846586,
+-1.419817, -1.521397, -1.871802, -0.951946,
+-1.802024, -1.527524, -1.514342, -0.921278,
+-1.227731, -1.440212, -2.175919, -1.030966,
+-1.528092, -1.297167, -2.938102, -0.783554,
+-1.463417, -1.532827, -1.880613, -0.915953,
+-2.005152, -1.487168, -1.552308, -0.849591,
+-1.275353, -1.398643, -2.207587, -1.011233,
+-1.666028, -1.256129, -3.130884, -0.728629,
+-1.596293, -1.485792, -1.913030, -0.859442,
+-2.064598, -1.512069, -1.531959, -0.828817,
+-1.424035, -1.384118, -2.168237, -0.930553,
+-1.730066, -1.186225, -2.971241, -0.763321,
+-1.693668, -1.501296, -1.913030, -0.808149,
+-2.219761, -1.492997, -1.580384, -0.774855,
+-1.501896, -1.410925, -2.214846, -0.857539,
+-1.711810, -1.214889, -3.039708, -0.744716,
+-1.892247, -1.423981, -1.964220, -0.758756,
+-2.206199, -1.488013, -1.595464, -0.773823,
+-1.417809, -1.367333, -2.659731, -0.837010,
+-1.915395, -1.184710, -3.073095, -0.691941,
+-2.070218, -1.426285, -1.978510, -0.702456,
+-2.328366, -1.427370, -1.648702, -0.754370,
+-1.414676, -1.470765, -2.909702, -0.749180,
+-2.055749, -1.215381, -3.166519, -0.628754,
+-2.138912, -1.429557, -2.075599, -0.659092,
+-2.351480, -1.377296, -1.696583, -0.756719,
+-1.545985, -1.414307, -3.174114, -0.689209,
+-2.158969, -1.218231, -3.244292, -0.598188,
+-2.360033, -1.374866, -2.107753, -0.632619,
+-2.498192, -1.422895, -1.640368, -0.728065,
+-1.572480, -1.467562, -3.496505, -0.631713,
+-2.098699, -1.160430, -3.206880, -0.647123,
+-2.223828, -1.477001, -2.209229, -0.591134,
+-2.623254, -1.423354, -1.709677, -0.682000,
+-1.654411, -1.480458, -3.908202, -0.577704,
+-2.366086, -1.163774, -3.307587, -0.584754,
+-2.532024, -1.372169, -2.332311, -0.562339,
+-2.568229, -1.418198, -1.795426, -0.663351,
+-1.770572, -1.487618, -4.441573, -0.524119,
+-2.399435, -1.207826, -3.559201, -0.541417,
+-2.614654, -1.537096, -2.121484, -0.524335,
+-2.722673, -1.399220, -1.826412, -0.641463,
+-1.749200, -1.357721, -4.321109, -0.587787,
+-2.401038, -1.181602, -3.566789, -0.554529,
+-2.232051, -1.496704, -2.198150, -0.583725,
+-2.594245, -1.367348, -1.836785, -0.671019,
+-1.672136, -1.365517, -5.421616, -0.593327,
+-2.215511, -1.177523, -3.662429, -0.584847,
+-2.479396, -1.445079, -2.178072, -0.567010,
+-2.504500, -1.284897, -1.664075, -0.793545,
+-1.789737, -1.323529, -4.816229, -0.582135,
+-2.029729, -1.142265, -3.478014, -0.656498,
+-2.493205, -1.514879, -2.286191, -0.517737,
+-2.438169, -1.180850, -1.805005, -0.818284,
+-1.571802, -1.290899, -5.028555, -0.671860,
+-2.062680, -1.030025, -3.701808, -0.710883,
+-2.230398, -1.303450, -2.186723, -0.676020,
+-2.509336, -1.168315, -1.880561, -0.786836,
+-1.803530, -1.374661, -4.448059, -0.560967,
+-2.353114, -0.874837, -3.901340, -0.759751,
+-2.401061, -1.480130, -2.301531, -0.541861,
+-2.902595, -1.173849, -2.203739, -0.643300,
+-1.440629, -1.268511, -4.158876, -0.762827,
+-2.306540, -0.910387, -3.742844, -0.745832,
+-1.924677, -1.367867, -2.315543, -0.691718,
+-2.796046, -1.628159, -2.518702, -0.412347,
+-0.709182, -1.565863, -2.505260, -1.526056,
+-1.065038, -1.104807, -2.495496, -1.420659,
+-1.176059, -1.723113, -1.148356, -1.630454,
+-1.817762, -1.496771, -1.271445, -1.098612,
+-2.519205, -0.344118, -5.635175, -1.574743,
+-2.977455, -0.464725, -6.435324, -1.142044,
+-3.092476, -0.304870, -5.161439, -1.552784,
+-3.035229, -0.562993, -5.197109, -0.975771,
+-1.185282, 0.000100, 0.000100, 0.000100,
+-1.629335, 0.000100, 0.000100, 0.000100,
+-1.586308, 0.000100, 0.000100, 0.000100,
+-1.225537, 0.000100, 0.000100, 0.000100,
+-1.068323, -1.765095, -1.447197, -1.386229,
+-1.010964, -1.435962, -2.836073, -1.080016,
+-1.227770, -1.546849, -1.422713, -1.374058,
+-1.471573, -1.655439, -1.386840, -1.109985,
+-1.087415, -1.780367, -1.427841, -1.368412,
+-0.984899, -1.408182, -2.870136, -1.123179,
+-1.220772, -1.551013, -1.393404, -1.407577,
+-1.490451, -1.606219, -1.383691, -1.128765,
+-1.086542, -1.768407, -1.447530, -1.359209,
+-0.997638, -1.467427, -2.826601, -1.074352,
+-1.231911, -1.557713, -1.407346, -1.374936,
+-1.491872, -1.630735, -1.368864, -1.124338,
+-1.083233, -1.781284, -1.455884, -1.347517,
+-0.999942, -1.417129, -2.893504, -1.095691,
+-1.210139, -1.598587, -1.394925, -1.379079,
+-1.502643, -1.611902, -1.385679, -1.115360,
+-1.069250, -1.762012, -1.462922, -1.372485,
+-0.999921, -1.456048, -2.920234, -1.064169,
+-1.196587, -1.588488, -1.394583, -1.404029,
+-1.474635, -1.625410, -1.400741, -1.115070,
+-1.095417, -1.758171, -1.433297, -1.367595,
+-0.985825, -1.453125, -2.785193, -1.104691,
+-1.172592, -1.603082, -1.416870, -1.399625,
+-1.486991, -1.645648, -1.367963, -1.119523,
+-1.086440, -1.788556, -1.442117, -1.351097,
+-0.997773, -1.464743, -2.764173, -1.087273,
+-1.191705, -1.612409, -1.389649, -1.395426,
+-1.530923, -1.603852, -1.368676, -1.114436,
+-1.088908, -1.740367, -1.446936, -1.375671,
+-0.962156, -1.454487, -2.965130, -1.100040,
+-1.202579, -1.608677, -1.389725, -1.385153,
+-1.457992, -1.614451, -1.396346, -1.136946,
+-1.070214, -1.740863, -1.452347, -1.395636,
+-1.008765, -1.397822, -2.875205, -1.103248,
+-1.230409, -1.577385, -1.401677, -1.365987,
+-1.472914, -1.619215, -1.386155, -1.131173,
+-1.072389, -1.777718, -1.465439, -1.355533,
+-1.001679, -1.439101, -2.841342, -1.086932,
+-1.253103, -1.547728, -1.382221, -1.383771,
+-1.461387, -1.622813, -1.409659, -1.119275,
+-1.080096, -1.805542, -1.434282, -1.355706,
+-0.993512, -1.443300, -2.814878, -1.097613,
+-1.221483, -1.574932, -1.394665, -1.385216,
+-1.456980, -1.613141, -1.381719, -1.149945,
+-1.081344, -1.751019, -1.426309, -1.398085,
+-0.974742, -1.432800, -2.872219, -1.116276,
+-1.205927, -1.605319, -1.381793, -1.391781,
+-1.490724, -1.637619, -1.372736, -1.117994,
+-1.095671, -1.751723, -1.434574, -1.370448,
+-1.018717, -1.460333, -2.790822, -1.063016,
+-1.192923, -1.581267, -1.417638, -1.391642,
+-1.478188, -1.630044, -1.372491, -1.131486,
+-1.094195, -1.764900, -1.437956, -1.360336,
+-0.986257, -1.443898, -2.886446, -1.092880,
+-1.226947, -1.580121, -1.399821, -1.369545,
+-1.464939, -1.632832, -1.411593, -1.109336,
+-1.086497, -1.732911, -1.434755, -1.395694,
+-0.994503, -1.449570, -2.856268, -1.084891,
+-1.213900, -1.591372, -1.406183, -1.369474,
+-1.510584, -1.617526, -1.368700, -1.119654,
+-1.071576, -1.783697, -1.463664, -1.354297,
+-1.017993, -1.410677, -2.800665, -1.096815,
+-1.203909, -1.579766, -1.374661, -1.422825,
+-1.514059, -1.598054, -1.357135, -1.138504,
+-1.080167, -1.761482, -1.450171, -1.369867,
+-0.987816, -1.435841, -2.898561, -1.094840,
+-1.206588, -1.553140, -1.432450, -1.384385,
+-1.467937, -1.630367, -1.375785, -1.136000,
+-1.078253, -1.783372, -1.435202, -1.371710,
+-1.020853, -1.432572, -2.871679, -1.065909,
+-1.203028, -1.592183, -1.397207, -1.390508,
+-1.446590, -1.616457, -1.384979, -1.152968,
+-1.074445, -1.808446, -1.453168, -1.344095,
+-0.998673, -1.413492, -2.813142, -1.113756,
+-1.201433, -1.582950, -1.390649, -1.406680,
+-1.461042, -1.610131, -1.401928, -1.133135,
+-1.079562, -1.753512, -1.474717, -1.353837,
+-0.996560, -1.394687, -2.849046, -1.123798,
+-1.207408, -1.572000, -1.436658, -1.363835,
+-1.496758, -1.647149, -1.364427, -1.114681,
+-1.096113, -1.785524, -1.405910, -1.374447,
+-1.012822, -1.408867, -2.820343, -1.100208,
+-1.239593, -1.591115, -1.346494, -1.399789,
+-1.488028, -1.619519, -1.375906, -1.128315,
+-1.062328, -1.749116, -1.470891, -1.383492,
+-0.996609, -1.429360, -2.864612, -1.095406,
+-1.192497, -1.573677, -1.383390, -1.433266,
+-1.489476, -1.578559, -1.389978, -1.142074,
+-1.095617, -1.747685, -1.441456, -1.366856,
+-1.017171, -1.426723, -2.874666, -1.073371,
+-1.197630, -1.599553, -1.398839, -1.389416,
+-1.493797, -1.613382, -1.392075, -1.115627,
+-1.077575, -1.751836, -1.432862, -1.396345,
+-0.969419, -1.480507, -2.856330, -1.091552,
+-1.184660, -1.596671, -1.388731, -1.418040,
+-1.479430, -1.649741, -1.371548, -1.119574,
+-1.088476, -1.748081, -1.439194, -1.378151,
+-0.989244, -1.467803, -2.817096, -1.084911,
+-1.209765, -1.587848, -1.384668, -1.398525,
+-1.484660, -1.612368, -1.406629, -1.111582,
+-1.069751, -1.744277, -1.462069, -1.384782,
+-0.997336, -1.407862, -2.914204, -1.101939,
+-1.198716, -1.559326, -1.412473, -1.408128,
+-1.472936, -1.662203, -1.368991, -1.118822,
+-1.125386, -1.745862, -1.405659, -1.363465,
+-1.003919, -1.460663, -2.845668, -1.068912,
+-1.227344, -1.576786, -1.416130, -1.356189,
+-1.494146, -1.678019, -1.357637, -1.103982,
+-1.097068, -1.734444, -1.466561, -1.351175,
+-0.992039, -1.447684, -2.908687, -1.080226,
+-1.206476, -1.577944, -1.404401, -1.390957,
+-1.462419, -1.660108, -1.385144, -1.114989,
+-1.096901, -1.726793, -1.433294, -1.387415,
+-0.985276, -1.457335, -2.849275, -1.090877,
+-1.195846, -1.587540, -1.410480, -1.389910,
+-1.498122, -1.585828, -1.361086, -1.154262,
+-1.100595, -1.718642, -1.442999, -1.379124,
+-0.987017, -1.405579, -2.920000, -1.114271,
+-1.180930, -1.579562, -1.411188, -1.414331,
+-1.498636, -1.601425, -1.373160, -1.134288,
+-1.085937, -1.762471, -1.417822, -1.392098,
+-0.987958, -1.430462, -2.864892, -1.104200,
+-1.170190, -1.591433, -1.411181, -1.417977,
+-1.457221, -1.651592, -1.400481, -1.112017,
+-1.083080, -1.755803, -1.415168, -1.403255,
+-0.991776, -1.405179, -2.869457, -1.117703,
+-1.244684, -1.574032, -1.400185, -1.353987,
+-1.484967, -1.641068, -1.374730, -1.118379,
+-1.093401, -1.765453, -1.399008, -1.398472,
+-1.006048, -1.391903, -2.846789, -1.115636,
+-1.214680, -1.591011, -1.398288, -1.376519,
+-1.474014, -1.611108, -1.391033, -1.131618,
+-1.089404, -1.749134, -1.425169, -1.389538,
+-0.991063, -1.417723, -2.835198, -1.115211,
+-1.195297, -1.591661, -1.418677, -1.379259,
+-1.488467, -1.610557, -1.372475, -1.136232,
+-1.074679, -1.744452, -1.459403, -1.380404,
+-0.979969, -1.409815, -2.853890, -1.130508,
+-1.187401, -1.572414, -1.439346, -1.384852,
+-1.488576, -1.623657, -1.370690, -1.129489,
+-1.100565, -1.720610, -1.436295, -1.384086,
+-0.989515, -1.405693, -2.941571, -1.107865,
+-1.193296, -1.624427, -1.372395, -1.401383,
+-1.499162, -1.611677, -1.375934, -1.125388,
+-1.113428, -1.740532, -1.424953, -1.364055,
+-1.001107, -1.444199, -2.838078, -1.084548,
+-1.233987, -1.542371, -1.392664, -1.400050,
+-1.467197, -1.624325, -1.371282, -1.143809,
+-1.116950, -1.756062, -1.418765, -1.354845,
+-0.978586, -1.452234, -2.929783, -1.088556,
+-1.194238, -1.585914, -1.421809, -1.382192,
+-1.483117, -1.618702, -1.369178, -1.137564,
+-1.070821, -1.745083, -1.442211, -1.401456,
+-1.030765, -1.412444, -2.843003, -1.074500,
+-1.210780, -1.594312, -1.382732, -1.393944,
+-1.498071, -1.601824, -1.362079, -1.143246,
+-1.101202, -1.782059, -1.417887, -1.358632,
+-1.006935, -1.443214, -2.810413, -1.083777,
+-1.236254, -1.587548, -1.372181, -1.380113,
+-1.510728, -1.629736, -1.358294, -1.120335,
+-1.101096, -1.741419, -1.432259, -1.372576,
+-0.938922, -1.466371, -2.915596, -1.126957,
+-1.204036, -1.561474, -1.404951, -1.407259,
+-1.498785, -1.578828, -1.365024, -1.155159,
+-1.100448, -1.760863, -1.415070, -1.376451,
+-1.003930, -1.435045, -2.800009, -1.094671,
+-1.227060, -1.557857, -1.395543, -1.392065,
+-1.457093, -1.656519, -1.384392, -1.121443,
+-1.090723, -1.718840, -1.432632, -1.402129,
+-1.000990, -1.422882, -2.849650, -1.097808,
+-1.169944, -1.604002, -1.369417, -1.451279,
+-1.468368, -1.614640, -1.373850, -1.146942,
+-1.086284, -1.767317, -1.426594, -1.379852,
+-1.008632, -1.426126, -2.756270, -1.104078,
+-1.193780, -1.579812, -1.417316, -1.392116,
+-1.449603, -1.629447, -1.367723, -1.156414,
+-1.094903, -1.758106, -1.415869, -1.384926,
+-1.019079, -1.408412, -2.806677, -1.096207,
+-1.212910, -1.592647, -1.378006, -1.397550,
+-1.519645, -1.620523, -1.346242, -1.129487,
+-1.093277, -1.757898, -1.409089, -1.393880,
+-1.020721, -1.430347, -2.799278, -1.080031,
+-1.187791, -1.616589, -1.398558, -1.387983,
+-1.477309, -1.621138, -1.356175, -1.150650,
+-1.079913, -1.787939, -1.405754, -1.394778,
+-0.978014, -1.436253, -2.820010, -1.119263,
+-1.205578, -1.619586, -1.409686, -1.353709,
+-1.514185, -1.631254, -1.367408, -1.109990,
+-1.091992, -1.771125, -1.416811, -1.379021,
+-1.005039, -1.422991, -2.801270, -1.101884,
+-1.209013, -1.598402, -1.405737, -1.370019,
+-1.496393, -1.654361, -1.332362, -1.136326,
+-1.111231, -1.786994, -1.391350, -1.367887,
+-0.985037, -1.437139, -2.789389, -1.116239,
+-1.227753, -1.583652, -1.365631, -1.399963,
+-1.497745, -1.600315, -1.381792, -1.128850,
+-1.107796, -1.715785, -1.434618, -1.379578,
+-0.987207, -1.439693, -2.846959, -1.101528,
+-1.176950, -1.585149, -1.410613, -1.415221,
+-1.477711, -1.625874, -1.369576, -1.136669,
+-1.105330, -1.743701, -1.411151, -1.385638,
+-0.999267, -1.420223, -2.828953, -1.105287,
+-1.194124, -1.577011, -1.397406, -1.413864,
+-1.488727, -1.631518, -1.379806, -1.117549,
+-1.114402, -1.759949, -1.409633, -1.364134,
+-1.005595, -1.418573, -2.807751, -1.103303,
+-1.196177, -1.582880, -1.395691, -1.408099,
+-1.483558, -1.601802, -1.367107, -1.149502,
+-1.086994, -1.782704, -1.387158, -1.407295,
+-0.998784, -1.430728, -2.930251, -1.081299,
+-1.194510, -1.608775, -1.379212, -1.405525,
+-1.486790, -1.621603, -1.377111, -1.126967,
+-1.096250, -1.773016, -1.393282, -1.395136,
+-0.992432, -1.485132, -2.804879, -1.071952,
+-1.188665, -1.608208, -1.375283, -1.417338,
+-1.511397, -1.626613, -1.334945, -1.140586,
+-1.104778, -1.779344, -1.393791, -1.378984,
+-1.009809, -1.427732, -2.770541, -1.098920,
+-1.193156, -1.567267, -1.413940, -1.406744,
+-1.477026, -1.619703, -1.359516, -1.149037,
+-1.106785, -1.753777, -1.422890, -1.365533,
+-0.990630, -1.423033, -2.855709, -1.108177,
+-1.170207, -1.600293, -1.387326, -1.434991,
+-1.460456, -1.618395, -1.383070, -1.143048,
+-1.125408, -1.759008, -1.359626, -1.400363,
+-0.989906, -1.446169, -2.882103, -1.087966,
+-1.195905, -1.576685, -1.415023, -1.394364,
+-1.479064, -1.637438, -1.351836, -1.142856,
+-1.101998, -1.756600, -1.417116, -1.375354,
+-1.013341, -1.409959, -2.810391, -1.100630,
+-1.191760, -1.602000, -1.376543, -1.417288,
+-1.488197, -1.623192, -1.353697, -1.143602,
+-1.107866, -1.754748, -1.411458, -1.374372,
+-1.009154, -1.441162, -2.820299, -1.081071,
+-1.186623, -1.611036, -1.379088, -1.413620,
+-1.470225, -1.619304, -1.388541, -1.131185,
+-1.099338, -1.791891, -1.394642, -1.376970,
+-0.992346, -1.443399, -2.900394, -1.084211,
+-1.176422, -1.587610, -1.414169, -1.410264,
+-1.467101, -1.614450, -1.374834, -1.147198,
+-1.127134, -1.762249, -1.353614, -1.402113,
+-0.993282, -1.411392, -2.980173, -1.093466,
+-1.191545, -1.588426, -1.380787, -1.424533,
+-1.487080, -1.626410, -1.341038, -1.152773,
+-1.082979, -1.785398, -1.407119, -1.390959,
+-0.975375, -1.470581, -2.791575, -1.103124,
+-1.156639, -1.669129, -1.382059, -1.402479,
+-1.472611, -1.612913, -1.404009, -1.121602,
+-1.108330, -1.739751, -1.381791, -1.413880,
+-0.999430, -1.447772, -2.865772, -1.079168,
+-1.196625, -1.583179, -1.386867, -1.416303,
+-1.479805, -1.620810, -1.377331, -1.132184,
+-1.093596, -1.760044, -1.405332, -1.395668,
+-1.016169, -1.421357, -2.823545, -1.086962,
+-1.170519, -1.563237, -1.402911, -1.450483,
+-1.476513, -1.626313, -1.360955, -1.144138,
+-1.115420, -1.786197, -1.388857, -1.365449,
+-1.003248, -1.435264, -2.831224, -1.089692,
+-1.245460, -1.567123, -1.366905, -1.391700,
+-1.509856, -1.617887, -1.350735, -1.134163,
+-1.094713, -1.788263, -1.394818, -1.385352,
+-0.978021, -1.424792, -2.860294, -1.120444,
+-1.157419, -1.625471, -1.416799, -1.400817,
+-1.486921, -1.590718, -1.382908, -1.141608,
+-1.119936, -1.746875, -1.372134, -1.403171,
+-0.989115, -1.409298, -2.807191, -1.128850,
+-1.192997, -1.592504, -1.378028, -1.422136,
+-1.513578, -1.623527, -1.331310, -1.143988,
+-1.079942, -1.778989, -1.407630, -1.398960,
+-0.979436, -1.446844, -2.832715, -1.107715,
+-1.157895, -1.650956, -1.369995, -1.427621,
+-1.455552, -1.632335, -1.374406, -1.144868,
+-1.133511, -1.771919, -1.336068, -1.405619,
+-1.010087, -1.407684, -2.901113, -1.090224,
+-1.196883, -1.583153, -1.356886, -1.447840,
+-1.490490, -1.628256, -1.352837, -1.139558,
+-1.093003, -1.771083, -1.409187, -1.385090,
+-0.976427, -1.416014, -2.877815, -1.125752,
+-1.148944, -1.581737, -1.421165, -1.444014,
+-1.496056, -1.624307, -1.361140, -1.131431,
+-1.112600, -1.790462, -1.360535, -1.394768,
+-1.015007, -1.423373, -2.828524, -1.085894,
+-1.182168, -1.578802, -1.408195, -1.416422,
+-1.481590, -1.641107, -1.348956, -1.141163,
+-1.098083, -1.767501, -1.372948, -1.417281,
+-0.971022, -1.461993, -2.833112, -1.106542,
+-1.140910, -1.607953, -1.400981, -1.453207,
+-1.467110, -1.634745, -1.334187, -1.167851,
+-1.135532, -1.768113, -1.320708, -1.422344,
+-0.994936, -1.451187, -2.793616, -1.094337,
+-1.178470, -1.664466, -1.323399, -1.440834,
+-1.461162, -1.677032, -1.315166, -1.162589,
+-1.079590, -1.764612, -1.391389, -1.425940,
+-0.957182, -1.451390, -2.859752, -1.125472,
+-1.110050, -1.684761, -1.382335, -1.452533,
+-1.458989, -1.660278, -1.336684, -1.156050,
+-1.113595, -1.796199, -1.317462, -1.435984,
+-0.982847, -1.447128, -2.786660, -1.112049,
+-1.155749, -1.650657, -1.341304, -1.462095,
+-1.480705, -1.615331, -1.352770, -1.154628,
+-1.072712, -1.752780, -1.401619, -1.433676,
+-0.982481, -1.385882, -2.812998, -1.153625,
+-1.126271, -1.646859, -1.382823, -1.459998,
+-1.464692, -1.599751, -1.375744, -1.157558,
+-1.163584, -1.710982, -1.311768, -1.437152,
+-1.042536, -1.434227, -2.937676, -1.032397,
+-1.204850, -1.548493, -1.379367, -1.444105,
+-1.479569, -1.601626, -1.391807, -1.132987,
+-1.049473, -1.928282, -1.260908, -1.509215,
+-0.905215, -1.496922, -2.818569, -1.164637,
+-1.076601, -1.832951, -1.279956, -1.508427,
+-1.381168, -1.719323, -1.269913, -1.242466,
+-1.104792, -1.399977, -1.367211, -1.787962,
+-1.091861, -0.939227, -2.731868, -1.568386,
+-1.217584, -1.262440, -1.355713, -1.811986,
+-1.512844, -1.240443, -1.308673, -1.512844};
+
+double score_ex_don[NUM_VALUES_SCORES] = {
+-1.385894, 0.000100, 0.000100, 0.000100,
+-1.414282, 0.000100, 0.000100, 0.000100,
+-1.446894, 0.000100, 0.000100, 0.000100,
+-1.303793, 0.000100, 0.000100, 0.000100,
+-1.335241, -1.584170, -1.183264, -1.489392,
+-1.204384, -1.325258, -2.223904, -1.120235,
+-1.335341, -1.336958, -1.328900, -1.562937,
+-1.993044, -1.394802, -1.147292, -1.209513,
+-1.287013, -1.601473, -1.214976, -1.489040,
+-1.120482, -1.347074, -2.250884, -1.175802,
+-1.307538, -1.408008, -1.272852, -1.585463,
+-1.945910, -1.407572, -1.158650, -1.208998,
+-1.253559, -1.581985, -1.304998, -1.436437,
+-1.221762, -1.289535, -2.199747, -1.142377,
+-1.246984, -1.374375, -1.444723, -1.496909,
+-1.882237, -1.356374, -1.263781, -1.179040,
+-1.341174, -1.537163, -1.162126, -1.557553,
+-1.210456, -1.247727, -2.176045, -1.199674,
+-1.301487, -1.361505, -1.341096, -1.560459,
+-1.951814, -1.327012, -1.141460, -1.296956,
+-1.257347, -1.486356, -1.235654, -1.615694,
+-1.207496, -1.300586, -2.042237, -1.207496,
+-1.264663, -1.392624, -1.287263, -1.643938,
+-1.881247, -1.464472, -1.163471, -1.190728,
+-1.254729, -1.516129, -1.314321, -1.484437,
+-1.198137, -1.356129, -2.109703, -1.141580,
+-1.233054, -1.425260, -1.391083, -1.517100,
+-1.894316, -1.377111, -1.291823, -1.131643,
+-1.400390, -1.583343, -1.149251, -1.463878,
+-1.202461, -1.311185, -2.165140, -1.154194,
+-1.326569, -1.454477, -1.336486, -1.434125,
+-1.932369, -1.335296, -1.158592, -1.279461,
+-1.328337, -1.521580, -1.238268, -1.483553,
+-1.162328, -1.270617, -2.150199, -1.237449,
+-1.227946, -1.401725, -1.325000, -1.634081,
+-1.855396, -1.372851, -1.130313, -1.319409,
+-1.248651, -1.516914, -1.344782, -1.456185,
+-1.244395, -1.261202, -2.150322, -1.164352,
+-1.214949, -1.375174, -1.381513, -1.613039,
+-1.956996, -1.346665, -1.211761, -1.200847,
+-1.329661, -1.511983, -1.215854, -1.520951,
+-1.185875, -1.256712, -2.155158, -1.224012,
+-1.344383, -1.341072, -1.339421, -1.533695,
+-2.061970, -1.385528, -1.118346, -1.218121,
+-1.268418, -1.595679, -1.173108, -1.577217,
+-1.160320, -1.255128, -2.126186, -1.264844,
+-1.272510, -1.441249, -1.281093, -1.582276,
+-1.798000, -1.377006, -1.137056, -1.342181,
+-1.312737, -1.506586, -1.287276, -1.455728,
+-1.225574, -1.285764, -2.168645, -1.153156,
+-1.203062, -1.383565, -1.426531, -1.565886,
+-1.923623, -1.353451, -1.199531, -1.223358,
+-1.288693, -1.569247, -1.233045, -1.492707,
+-1.210494, -1.311561, -2.125450, -1.161004,
+-1.328422, -1.306021, -1.333288, -1.606442,
+-2.041220, -1.348073, -1.066661, -1.323381,
+-1.344774, -1.501587, -1.259226, -1.457784,
+-1.148714, -1.252962, -2.143847, -1.272625,
+-1.223071, -1.469081, -1.231082, -1.695205,
+-1.851393, -1.411185, -1.130847, -1.286022,
+-1.265278, -1.516002, -1.321950, -1.462587,
+-1.211323, -1.293873, -2.083518, -1.192378,
+-1.175999, -1.422399, -1.422399, -1.564370,
+-1.943855, -1.331470, -1.219588, -1.212649,
+-1.328149, -1.576760, -1.181944, -1.506398,
+-1.193201, -1.302095, -2.097657, -1.197177,
+-1.267077, -1.427619, -1.247394, -1.654360,
+-1.975623, -1.357439, -1.081805, -1.327938,
+-1.268149, -1.514720, -1.227038, -1.581731,
+-1.100665, -1.362783, -2.160578, -1.216908,
+-1.240000, -1.405749, -1.267798, -1.692918,
+-1.854267, -1.434190, -1.075299, -1.332077,
+-1.228006, -1.621955, -1.290402, -1.450506,
+-1.193881, -1.262186, -2.177005, -1.202168,
+-1.175372, -1.375904, -1.468919, -1.567765,
+-1.950576, -1.349322, -1.228101, -1.185662,
+-1.280631, -1.502751, -1.235290, -1.566041,
+-1.171693, -1.290947, -2.194797, -1.191495,
+-1.274402, -1.340609, -1.312623, -1.662826,
+-1.967650, -1.303773, -1.144584, -1.308026,
+-1.238674, -1.626367, -1.196804, -1.553857,
+-1.129809, -1.340694, -2.062479, -1.244439,
+-1.236807, -1.382682, -1.292338, -1.691963,
+-1.835764, -1.405753, -1.079123, -1.364797,
+-1.323445, -1.493448, -1.288839, -1.454163,
+-1.215349, -1.361060, -2.018891, -1.157719,
+-1.155703, -1.357054, -1.472780, -1.617175,
+-1.882063, -1.316876, -1.173670, -1.307397,
+-1.315221, -1.492384, -1.212882, -1.563461,
+-1.214255, -1.273706, -2.192850, -1.165785,
+-1.313993, -1.414489, -1.286171, -1.551741,
+-1.961482, -1.392654, -1.011403, -1.397450,
+-1.335823, -1.485556, -1.205618, -1.555068,
+-1.133775, -1.333475, -2.149755, -1.210287,
+-1.214825, -1.405137, -1.258111, -1.750343,
+-1.926924, -1.389562, -1.019072, -1.409398,
+-1.311909, -1.557272, -1.275130, -1.424662,
+-1.192664, -1.327152, -2.024904, -1.206141,
+-1.198954, -1.409969, -1.430266, -1.536679,
+-1.931305, -1.389328, -1.124030, -1.270003,
+-1.314019, -1.492302, -1.240221, -1.527455,
+-1.251524, -1.285326, -2.107530, -1.152542,
+-1.300381, -1.395389, -1.324847, -1.541818,
+-1.898156, -1.352703, -1.090108, -1.364788,
+-1.259778, -1.567958, -1.225586, -1.540719,
+-1.178854, -1.303931, -2.001480, -1.252564,
+-1.203059, -1.435712, -1.308999, -1.651878,
+-1.905552, -1.329180, -1.095817, -1.377140,
+-1.253340, -1.506193, -1.303005, -1.509666,
+-1.183445, -1.379405, -2.137629, -1.126655,
+-1.143417, -1.417405, -1.415807, -1.628620,
+-1.880313, -1.351320, -1.170636, -1.278733,
+-1.378113, -1.541940, -1.164197, -1.505946,
+-1.152680, -1.316425, -2.201758, -1.185773,
+-1.341049, -1.380789, -1.264509, -1.586352,
+-1.892528, -1.397002, -1.022309, -1.417954,
+-1.279542, -1.557940, -1.164217, -1.613954,
+-1.082378, -1.355672, -2.029950, -1.301604,
+-1.163300, -1.356178, -1.347533, -1.771825,
+-1.835875, -1.351161, -1.102318, -1.388352,
+-1.146963, -1.524594, -1.379366, -1.546685,
+-1.208717, -1.314682, -2.129774, -1.158375,
+-1.182817, -1.419290, -1.424094, -1.556022,
+-1.934235, -1.344714, -1.121917, -1.312315,
+-1.308775, -1.613864, -1.107114, -1.608334,
+-1.239839, -1.269777, -2.090889, -1.183851,
+-1.305755, -1.379864, -1.224980, -1.694691,
+-1.976807, -1.379114, -1.046739, -1.352446,
+-1.232734, -1.577741, -1.207558, -1.593870,
+-1.125126, -1.277180, -2.128886, -1.281588,
+-1.182010, -1.428445, -1.260578, -1.770978,
+-1.977632, -1.363765, -0.989588, -1.451677,
+-1.217143, -1.504825, -1.350675, -1.501464,
+-1.191802, -1.359712, -2.067271, -1.161357,
+-1.137430, -1.382941, -1.430169, -1.664688,
+-1.926072, -1.253446, -1.187115, -1.333090,
+-1.286474, -1.488182, -1.192377, -1.637451,
+-1.170953, -1.257732, -2.152635, -1.239763,
+-1.245443, -1.323486, -1.343159, -1.687610,
+-1.959891, -1.360895, -1.006611, -1.438684,
+-1.273022, -1.555102, -1.189528, -1.587271,
+-1.118625, -1.380870, -2.162857, -1.181183,
+-1.228220, -1.398602, -1.257361, -1.738175,
+-1.903167, -1.349486, -1.082423, -1.375327,
+-1.221820, -1.544594, -1.301544, -1.515052,
+-1.165304, -1.352944, -2.057355, -1.197652,
+-1.158052, -1.352031, -1.434697, -1.666004,
+-1.864785, -1.271503, -1.188305, -1.347369,
+-1.332790, -1.538602, -1.205572, -1.504815,
+-1.235042, -1.237779, -2.133729, -1.201460,
+-1.250648, -1.368246, -1.308502, -1.666522,
+-2.041594, -1.361308, -1.073626, -1.301636,
+-1.241334, -1.567631, -1.180710, -1.632695,
+-1.114944, -1.348686, -2.109522, -1.233782,
+-1.191864, -1.390971, -1.262138, -1.805067,
+-1.930587, -1.378604, -0.999284, -1.448749,
+-1.184943, -1.550223, -1.352443, -1.498665,
+-1.138147, -1.319696, -2.225242, -1.189643,
+-1.161145, -1.406201, -1.379253, -1.660555,
+-1.992129, -1.294027, -1.176061, -1.269616,
+-1.335732, -1.551955, -1.161545, -1.550232,
+-1.143119, -1.261060, -2.160425, -1.263909,
+-1.236042, -1.374689, -1.263193, -1.749849,
+-1.964725, -1.326528, -1.080145, -1.367032,
+-1.314958, -1.498754, -1.190488, -1.589472,
+-1.109372, -1.295064, -2.177212, -1.262321,
+-1.154009, -1.430919, -1.278129, -1.789864,
+-1.911490, -1.409475, -0.978958, -1.460303,
+-1.257805, -1.535490, -1.331208, -1.443015,
+-1.204381, -1.293993, -2.097141, -1.193555,
+-1.086273, -1.414530, -1.476800, -1.654848,
+-1.953186, -1.345019, -1.128507, -1.294161,
+-1.295965, -1.542663, -1.193443, -1.563606,
+-1.136555, -1.287349, -2.128798, -1.258445,
+-1.204402, -1.367173, -1.353795, -1.676410,
+-1.953570, -1.360134, -1.032193, -1.404941,
+-1.252570, -1.570009, -1.212777, -1.566298,
+-1.103838, -1.284952, -2.117861, -1.303972,
+-1.175205, -1.395554, -1.291124, -1.780305,
+-1.869861, -1.444978, -0.994159, -1.426829,
+-1.272390, -1.494225, -1.313812, -1.484469,
+-1.117812, -1.376013, -2.048106, -1.232912,
+-1.118877, -1.425666, -1.400782, -1.678782,
+-1.949080, -1.313091, -1.159180, -1.291477,
+-1.373569, -1.501337, -1.145812, -1.579875,
+-1.083641, -1.346729, -2.114707, -1.269826,
+-1.179178, -1.447547, -1.277759, -1.722265,
+-1.860929, -1.394579, -1.071749, -1.369927,
+-1.232469, -1.607163, -1.216965, -1.551900,
+-1.105273, -1.321237, -2.131390, -1.260887,
+-1.140045, -1.423428, -1.309988, -1.774990,
+-1.866370, -1.378914, -0.964264, -1.550224,
+-1.192733, -1.592270, -1.287449, -1.527067,
+-1.123608, -1.368874, -2.105725, -1.208166,
+-1.159472, -1.317628, -1.447954, -1.695390,
+-1.959774, -1.291360, -1.121356, -1.353342,
+-1.344762, -1.522366, -1.157585, -1.575234,
+-1.143925, -1.280531, -2.039570, -1.296602,
+-1.167426, -1.404442, -1.291783, -1.780530,
+-1.902910, -1.320489, -1.086018, -1.401191,
+-1.244550, -1.568846, -1.256606, -1.518566,
+-1.102203, -1.315206, -2.114703, -1.277356,
+-1.220229, -1.357242, -1.300165, -1.743077,
+-1.860241, -1.344257, -1.005582, -1.524180,
+-1.226580, -1.607230, -1.307670, -1.444711,
+-1.184415, -1.324177, -2.221705, -1.140594,
+-1.127270, -1.357307, -1.502724, -1.628583,
+-1.962710, -1.297184, -1.153205, -1.307118,
+-1.241912, -1.614629, -1.177454, -1.588939,
+-1.131039, -1.256202, -2.190382, -1.270488,
+-1.332897, -1.281022, -1.351370, -1.611195,
+-1.976682, -1.358736, -1.015939, -1.417066,
+-1.286079, -1.496800, -1.217086, -1.591053,
+-1.173725, -1.297439, -2.076677, -1.230281,
+-1.186318, -1.511356, -1.193846, -1.766136,
+-1.870724, -1.395831, -1.019831, -1.436725,
+-1.203594, -1.525526, -1.344367, -1.506544,
+-1.152749, -1.354442, -2.127900, -1.180732,
+-1.099725, -1.364654, -1.438427, -1.747163,
+-1.983889, -1.328545, -1.142600, -1.277922,
+-1.264054, -1.567240, -1.192330, -1.582989,
+-1.190102, -1.317033, -2.080248, -1.194084,
+-1.218647, -1.416745, -1.300140, -1.664029,
+-1.909439, -1.377665, -1.013977, -1.440132,
+-1.272303, -1.541736, -1.165010, -1.640488,
+-1.122468, -1.327667, -2.067111, -1.262738,
+-1.226633, -1.384418, -1.299659, -1.694846,
+-1.756831, -1.355510, -1.050128, -1.515491,
+-1.191069, -1.529253, -1.368276, -1.492273,
+-1.113337, -1.360246, -2.079036, -1.238253,
+-1.084030, -1.407228, -1.422732, -1.737871,
+-1.832353, -1.318674, -1.196532, -1.308470,
+-1.260968, -1.563898, -1.147420, -1.661537,
+-1.094947, -1.360120, -2.130928, -1.237579,
+-1.197458, -1.414129, -1.300737, -1.700617,
+-2.037443, -1.321966, -1.026429, -1.407545,
+-1.277906, -1.474202, -1.233987, -1.603014,
+-1.093261, -1.361431, -2.172361, -1.221879,
+-1.161734, -1.416973, -1.317476, -1.732738,
+-1.884504, -1.343809, -1.019047, -1.485865,
+-1.271815, -1.534963, -1.313182, -1.447102,
+-1.110041, -1.335651, -2.153549, -1.233072,
+-1.116229, -1.416630, -1.454124, -1.627845,
+-1.877874, -1.278426, -1.160106, -1.365876,
+-1.318412, -1.431471, -1.220522, -1.617435,
+-1.133459, -1.277234, -2.123088, -1.274348,
+-1.204404, -1.397124, -1.326506, -1.674407,
+-1.954158, -1.311541, -1.053643, -1.426090,
+-1.298379, -1.547059, -1.180596, -1.574761,
+-1.070732, -1.281244, -2.207345, -1.310970,
+-1.293351, -1.382035, -1.239656, -1.686865,
+-1.799749, -1.359254, -1.078914, -1.436146,
+-1.203973, -1.622633, -1.316154, -1.450483,
+-1.180461, -1.383891, -2.138063, -1.125844,
+-1.154160, -1.367559, -1.346569, -1.773024,
+-2.026101, -1.303002, -1.145578, -1.278709,
+-1.307400, -1.483397, -1.216489, -1.578214,
+-1.131774, -1.270256, -2.251085, -1.232734,
+-1.196219, -1.239918, -1.415920, -1.798454,
+-1.949115, -1.271538, -1.060161, -1.466204,
+-1.255666, -1.451374, -1.291582, -1.579733,
+-1.044565, -1.425441, -2.129500, -1.241828,
+-1.171273, -1.378714, -1.302224, -1.794567,
+-1.770930, -1.370742, -1.042873, -1.498495,
+-1.189383, -1.504077, -1.344499, -1.547880,
+-1.098204, -1.361672, -2.153644, -1.223367,
+-1.142846, -1.387898, -1.379905, -1.715422,
+-1.817285, -1.305906, -1.212409, -1.312572,
+-1.294710, -1.518819, -1.239627, -1.525281,
+-1.101437, -1.293258, -2.211018, -1.260042,
+-1.186253, -1.394982, -1.311322, -1.729711,
+-2.024442, -1.251498, -1.093562, -1.399033,
+-1.267367, -1.530952, -1.186700, -1.625204,
+-1.098612, -1.330312, -2.082973, -1.281169,
+-1.121176, -1.437797, -1.352605, -1.725479,
+-1.855953, -1.334138, -1.066742, -1.443148,
+-1.158898, -1.536001, -1.368044, -1.530980,
+-1.134798, -1.365275, -2.129703, -1.189642,
+-1.107623, -1.305412, -1.516044, -1.720274,
+-1.936491, -1.241783, -1.182719, -1.345238,
+-1.258447, -1.543638, -1.242367, -1.543638,
+-1.125164, -1.304118, -2.301412, -1.191150,
+-1.148520, -1.400237, -1.355942, -1.722320,
+-1.883632, -1.267998, -1.131486, -1.410525,
+-1.262612, -1.513926, -1.241912, -1.569208,
+-1.007244, -1.363919, -2.251887, -1.294926,
+-1.181868, -1.379694, -1.373136, -1.669571,
+-1.793031, -1.322709, -1.129583, -1.410733,
+-1.176794, -1.577667, -1.459502, -1.374656,
+-1.185036, -1.331292, -2.261440, -1.121076,
+-1.121170, -1.376839, -1.485661, -1.632841,
+-1.793512, -1.365496, -1.233897, -1.247523,
+-1.252226, -1.550393, -1.266089, -1.513965,
+-1.070613, -1.349927, -2.178875, -1.256061,
+-1.213108, -1.417831, -1.357549, -1.593572,
+-1.856071, -1.411138, -1.111148, -1.306862,
+-1.219528, -1.572057, -1.301152, -1.492568,
+-0.993210, -1.443255, -2.295896, -1.228316,
+-1.112406, -1.452069, -1.423229, -1.628556,
+-1.763656, -1.478352, -1.059877, -1.370139,
+-1.122953, -1.642972, -1.471964, -1.379106,
+-1.051087, -1.404152, -2.400110, -1.157855,
+-1.030281, -1.375599, -1.646510, -1.621085,
+-1.724012, -1.518701, -1.236717, -1.163732,
+-1.117603, -1.599586, -1.315587, -1.596323,
+-0.968354, -1.333230, -2.393915, -1.326530,
+-0.973824, -1.341693, -1.554107, -1.899903,
+-1.563394, -1.337397, -1.211732, -1.468084,
+-0.998394, -1.139395, -1.632676, -2.153211,
+-0.818638, -1.064793, -2.641517, -1.945501,
+-0.946117, -1.071815, -1.639264, -2.587007,
+-1.473391, -0.957669, -1.381619, -1.995872,
+-0.454442, -2.214835, -2.129729, -1.986628,
+-0.293988, -2.510470, -2.991967, -2.093170,
+-0.434752, -2.092543, -2.115746, -2.219543,
+-1.262582, -1.757518, -1.302403, -1.299283,
+-2.384259, -3.736880, -0.175496, -3.101671,
+-1.513713, -2.694443, -0.710651, -1.509572,
+-1.858238, -2.956849, -0.358565, -2.370948,
+-3.019029, -3.316280, -0.183299, -2.496840,
+-1.242298, 0.000100, 0.000100, 0.000100,
+-1.578327, 0.000100, 0.000100, 0.000100,
+-1.574650, 0.000100, 0.000100, 0.000100,
+-1.211065, 0.000100, 0.000100, 0.000100,
+-1.123866, -1.720387, -1.462557, -1.330522,
+-1.061131, -1.365952, -2.800505, -1.084654,
+-1.277848, -1.565415, -1.413511, -1.312766,
+-1.546281, -1.581237, -1.370970, -1.116633,
+-1.121998, -1.731069, -1.444373, -1.341770,
+-1.072902, -1.392056, -2.773738, -1.058449,
+-1.251255, -1.582163, -1.417443, -1.324290,
+-1.546320, -1.602778, -1.323177, -1.141483,
+-1.115748, -1.771768, -1.417554, -1.347106,
+-1.027126, -1.426483, -2.812371, -1.073672,
+-1.255814, -1.589083, -1.401257, -1.328946,
+-1.538923, -1.620794, -1.357019, -1.107999,
+-1.135400, -1.732635, -1.423153, -1.343483,
+-1.024090, -1.400305, -2.784571, -1.100817,
+-1.252002, -1.569625, -1.407929, -1.342103,
+-1.533882, -1.617371, -1.356280, -1.113930,
+-1.113697, -1.743936, -1.445264, -1.342689,
+-1.046225, -1.395464, -2.825490, -1.073702,
+-1.244452, -1.593912, -1.398987, -1.339697,
+-1.507638, -1.608896, -1.385753, -1.113755,
+-1.105669, -1.761565, -1.449055, -1.337722,
+-1.054503, -1.410613, -2.827225, -1.054220,
+-1.262306, -1.589100, -1.416800, -1.307848,
+-1.502097, -1.610379, -1.359177, -1.137395,
+-1.111209, -1.742705, -1.471572, -1.323399,
+-1.014036, -1.397978, -2.794367, -1.111703,
+-1.280201, -1.545798, -1.429274, -1.311580,
+-1.519826, -1.601706, -1.373000, -1.119780,
+-1.106236, -1.761038, -1.446936, -1.339252,
+-1.028121, -1.397359, -2.841623, -1.088439,
+-1.261246, -1.559904, -1.400853, -1.346473,
+-1.540602, -1.622975, -1.322681, -1.133152,
+-1.118192, -1.734510, -1.485008, -1.308806,
+-1.038139, -1.393620, -2.800799, -1.087820,
+-1.287914, -1.550760, -1.397762, -1.328334,
+-1.526613, -1.623579, -1.375097, -1.100446,
+-1.103879, -1.737740, -1.456957, -1.348782,
+-1.050339, -1.405937, -2.801846, -1.066109,
+-1.268099, -1.557169, -1.425492, -1.318590,
+-1.538100, -1.615235, -1.324717, -1.137907,
+-1.104633, -1.760179, -1.452246, -1.337087,
+-0.996166, -1.448341, -2.850608, -1.084891,
+-1.282216, -1.549764, -1.403366, -1.329869,
+-1.551146, -1.580816, -1.345634, -1.133783,
+-1.123336, -1.724506, -1.454437, -1.335548,
+-1.060020, -1.387279, -2.826546, -1.065429,
+-1.260617, -1.575648, -1.382175, -1.352412,
+-1.521082, -1.616763, -1.369807, -1.112220,
+-1.124388, -1.737787, -1.458988, -1.321376,
+-1.042676, -1.386099, -2.818764, -1.085416,
+-1.255847, -1.549402, -1.433181, -1.330891,
+-1.533449, -1.604620, -1.382278, -1.101905,
+-1.108604, -1.721001, -1.486421, -1.328354,
+-1.016683, -1.416133, -2.804428, -1.093580,
+-1.264356, -1.579386, -1.387669, -1.340095,
+-1.529165, -1.618806, -1.345768, -1.124519,
+-1.109669, -1.714631, -1.478223, -1.338422,
+-1.076822, -1.359895, -2.818472, -1.070204,
+-1.278541, -1.593057, -1.396585, -1.306326,
+-1.514716, -1.600112, -1.401508, -1.102529,
+-1.123227, -1.737167, -1.440377, -1.339737,
+-1.020192, -1.388811, -2.905761, -1.092445,
+-1.270186, -1.547523, -1.424512, -1.324935,
+-1.519133, -1.601961, -1.371069, -1.121588,
+-1.118991, -1.737281, -1.440285, -1.345028,
+-1.049255, -1.389887, -2.818301, -1.075900,
+-1.259663, -1.580311, -1.441641, -1.295357,
+-1.498808, -1.598859, -1.390409, -1.122383,
+-1.108109, -1.741473, -1.444166, -1.352425,
+-1.054054, -1.393443, -2.880565, -1.057981,
+-1.262425, -1.545494, -1.445718, -1.315837,
+-1.539910, -1.609304, -1.352250, -1.118054,
+-1.128252, -1.759558, -1.460806, -1.301103,
+-1.025305, -1.389807, -2.813540, -1.102027,
+-1.287016, -1.544535, -1.429386, -1.305489,
+-1.545225, -1.611647, -1.365258, -1.103034,
+-1.123786, -1.754212, -1.437073, -1.330719,
+-1.023151, -1.414845, -2.811653, -1.086277,
+-1.258383, -1.559998, -1.414365, -1.336855,
+-1.538351, -1.607586, -1.372032, -1.104721,
+-1.128465, -1.722574, -1.449851, -1.334615,
+-1.043194, -1.382463, -2.793372, -1.092138,
+-1.256340, -1.597638, -1.388420, -1.333844,
+-1.554425, -1.633045, -1.353610, -1.093478,
+-1.120322, -1.735018, -1.434417, -1.350254,
+-1.022833, -1.408785, -2.694076, -1.113583,
+-1.269074, -1.544047, -1.404887, -1.347081,
+-1.537799, -1.639127, -1.349640, -1.103634,
+-1.138117, -1.760881, -1.420579, -1.323843,
+-1.032299, -1.430335, -2.765336, -1.074030,
+-1.288399, -1.587817, -1.396279, -1.300482,
+-1.502892, -1.626594, -1.382236, -1.108967,
+-1.107639, -1.735777, -1.445745, -1.355462,
+-1.046808, -1.382197, -2.772889, -1.092324,
+-1.265892, -1.568253, -1.402187, -1.333531,
+-1.536628, -1.620842, -1.335678, -1.126448,
+-1.134182, -1.736283, -1.429190, -1.336978,
+-1.008206, -1.427035, -2.891742, -1.079904,
+-1.274805, -1.533251, -1.437623, -1.319795,
+-1.544505, -1.616730, -1.364998, -1.100651,
+-1.135816, -1.717987, -1.427807, -1.348695,
+-1.037170, -1.395939, -2.869820, -1.075187,
+-1.281652, -1.563178, -1.391645, -1.330722,
+-1.561785, -1.564969, -1.351566, -1.132243,
+-1.135276, -1.753692, -1.446481, -1.308830,
+-1.049466, -1.397293, -2.844794, -1.065767,
+-1.267686, -1.575997, -1.412350, -1.316223,
+-1.525381, -1.640799, -1.342801, -1.116196,
+-1.151836, -1.735897, -1.440763, -1.305803,
+-1.043935, -1.348271, -2.878841, -1.102485,
+-1.274314, -1.572029, -1.376429, -1.346015,
+-1.558161, -1.611757, -1.320351, -1.130417,
+-1.133673, -1.740466, -1.428679, -1.335269,
+-1.039697, -1.437934, -2.813282, -1.052647,
+-1.248541, -1.588230, -1.433751, -1.307938,
+-1.525019, -1.626636, -1.361401, -1.110204,
+-1.108592, -1.740549, -1.441143, -1.355201,
+-1.044865, -1.402472, -2.798538, -1.074778,
+-1.254115, -1.562304, -1.428485, -1.326715,
+-1.503085, -1.654462, -1.358160, -1.111003,
+-1.130063, -1.710850, -1.461723, -1.330146,
+-1.049879, -1.394269, -2.794267, -1.076325,
+-1.267423, -1.546161, -1.443419, -1.312077,
+-1.530486, -1.598548, -1.357101, -1.127092,
+-1.132145, -1.701283, -1.449039, -1.345483,
+-1.040908, -1.413072, -2.816953, -1.068033,
+-1.264140, -1.586210, -1.403335, -1.320349,
+-1.536345, -1.619063, -1.349468, -1.116660,
+-1.138421, -1.714675, -1.452341, -1.325645,
+-1.047573, -1.395043, -2.787677, -1.079322,
+-1.279992, -1.570790, -1.419219, -1.301303,
+-1.531744, -1.629160, -1.357032, -1.107682,
+-1.131961, -1.693852, -1.454590, -1.345947,
+-1.055820, -1.387077, -2.857468, -1.064574,
+-1.279489, -1.571223, -1.394330, -1.324145,
+-1.565953, -1.593316, -1.341349, -1.119729,
+-1.141971, -1.716747, -1.439104, -1.331727,
+-1.048025, -1.405601, -2.764073, -1.075531,
+-1.250373, -1.618283, -1.394657, -1.318841,
+-1.548085, -1.614809, -1.367391, -1.097675,
+-1.132050, -1.744872, -1.436953, -1.326855,
+-1.013179, -1.411746, -2.870563, -1.089001,
+-1.275670, -1.576496, -1.396809, -1.321732,
+-1.518169, -1.600788, -1.366269, -1.126724,
+-1.141804, -1.711171, -1.434892, -1.339555,
+-1.053648, -1.398157, -2.769449, -1.074150,
+-1.288803, -1.567119, -1.428879, -1.286714,
+-1.526594, -1.633157, -1.348661, -1.115273,
+-1.110227, -1.721046, -1.450828, -1.357665,
+-1.025323, -1.417910, -2.856881, -1.073963,
+-1.267067, -1.564645, -1.414618, -1.323647,
+-1.519019, -1.621279, -1.333260, -1.140025,
+-1.121248, -1.710170, -1.457343, -1.345390,
+-1.049402, -1.411537, -2.786622, -1.065791,
+-1.269033, -1.583067, -1.403392, -1.317551,
+-1.537491, -1.607340, -1.381091, -1.098544,
+-1.116858, -1.754235, -1.451849, -1.326077,
+-1.034520, -1.408438, -2.783995, -1.083861,
+-1.250146, -1.583102, -1.410356, -1.331262,
+-1.539492, -1.599903, -1.338173, -1.135475,
+-1.129279, -1.722656, -1.448744, -1.334547,
+-1.049119, -1.409215, -2.815054, -1.062713,
+-1.294202, -1.551007, -1.433424, -1.289683,
+-1.525423, -1.604666, -1.368136, -1.117999,
+-1.123871, -1.737322, -1.470037, -1.312774,
+-1.040044, -1.438515, -2.792138, -1.055578,
+-1.280080, -1.579120, -1.402545, -1.309859,
+-1.509734, -1.614094, -1.345595, -1.140762,
+-1.120660, -1.754441, -1.444765, -1.327535,
+-1.074713, -1.373294, -2.828861, -1.060586,
+-1.276879, -1.568597, -1.427469, -1.298866,
+-1.536595, -1.602742, -1.372685, -1.108304,
+-1.137033, -1.730855, -1.448616, -1.319747,
+-1.060341, -1.415689, -2.805606, -1.048583,
+-1.282428, -1.600270, -1.391641, -1.301483,
+-1.532328, -1.611555, -1.362008, -1.113996,
+-1.147518, -1.719185, -1.428671, -1.332803,
+-1.044867, -1.409073, -2.849344, -1.061289,
+-1.288718, -1.581987, -1.426637, -1.277638,
+-1.528099, -1.640736, -1.349071, -1.109469,
+-1.134894, -1.705976, -1.423231, -1.362521,
+-1.045581, -1.389165, -2.805240, -1.082532,
+-1.278257, -1.601620, -1.384382, -1.311448,
+-1.557634, -1.638720, -1.337812, -1.100459,
+-1.129551, -1.752663, -1.429178, -1.331791,
+-1.056237, -1.414732, -2.765237, -1.060489,
+-1.225204, -1.631949, -1.423710, -1.309573,
+-1.518164, -1.617299, -1.353250, -1.126848,
+-1.132538, -1.719471, -1.437950, -1.342429,
+-1.060783, -1.400616, -2.817022, -1.056741,
+-1.268395, -1.596352, -1.385245, -1.324934,
+-1.516963, -1.626723, -1.353563, -1.121681,
+-1.132348, -1.693098, -1.445234, -1.354467,
+-1.044467, -1.397903, -2.830356, -1.072902,
+-1.318915, -1.544277, -1.443369, -1.262387,
+-1.528711, -1.603032, -1.363463, -1.120471,
+-1.133361, -1.706959, -1.459906, -1.330383,
+-1.028045, -1.432302, -2.770285, -1.076191,
+-1.284631, -1.553543, -1.405709, -1.322169,
+-1.512762, -1.627493, -1.322521, -1.149437,
+-1.127699, -1.761013, -1.432091, -1.325979,
+-1.060534, -1.397629, -2.818619, -1.058838,
+-1.314872, -1.598540, -1.423470, -1.243327,
+-1.542966, -1.632017, -1.353518, -1.101449,
+-1.142571, -1.732257, -1.430842, -1.327987,
+-1.037607, -1.426054, -2.793440, -1.066437,
+-1.266382, -1.618981, -1.395934, -1.300307,
+-1.514827, -1.623753, -1.335079, -1.139874,
+-1.104185, -1.764759, -1.475759, -1.314213,
+-1.065700, -1.396333, -2.793175, -1.059054,
+-1.293279, -1.608780, -1.414928, -1.263880,
+-1.524227, -1.647857, -1.351876, -1.105654,
+-1.158217, -1.711439, -1.431770, -1.322520,
+-1.064138, -1.408448, -2.711574, -1.067055,
+-1.297836, -1.592969, -1.369635, -1.311718,
+-1.550784, -1.668413, -1.336807, -1.088595,
+-1.139800, -1.754740, -1.437111, -1.310995,
+-1.041886, -1.425671, -2.804109, -1.060440,
+-1.313742, -1.580650, -1.395545, -1.281399,
+-1.526701, -1.641460, -1.366684, -1.096312,
+-1.128676, -1.720953, -1.439682, -1.344615,
+-1.044461, -1.415129, -2.770190, -1.071267,
+-1.274958, -1.572709, -1.370952, -1.350118,
+-1.547719, -1.608459, -1.328143, -1.132885,
+-1.124899, -1.776905, -1.456387, -1.298002,
+-1.046609, -1.458791, -2.842561, -1.026998,
+-1.300217, -1.628569, -1.425220, -1.234908,
+-1.519043, -1.618752, -1.383119, -1.102205,
+-1.129110, -1.712391, -1.451658, -1.339168,
+-1.037982, -1.409258, -2.805602, -1.075766,
+-1.306689, -1.538553, -1.349563, -1.365551,
+-1.534368, -1.621379, -1.332399, -1.130293,
+-1.126573, -1.765379, -1.452495, -1.306559,
+-1.050783, -1.390100, -2.806970, -1.076172,
+-1.311387, -1.588396, -1.393596, -1.279703,
+-1.554273, -1.630875, -1.354944, -1.093813,
+-1.125414, -1.766618, -1.428970, -1.327964,
+-1.019173, -1.403413, -2.748562, -1.110695,
+-1.293193, -1.602554, -1.358945, -1.319408,
+-1.537744, -1.635890, -1.295300, -1.150305,
+-1.139669, -1.727977, -1.435710, -1.329970,
+-1.047735, -1.410837, -2.836927, -1.059226,
+-1.315044, -1.616241, -1.443033, -1.215091,
+-1.509565, -1.601313, -1.377765, -1.123250,
+-1.147084, -1.706149, -1.415484, -1.354537,
+-1.019558, -1.411698, -2.764523, -1.101084,
+-1.269213, -1.574792, -1.387169, -1.338971,
+-1.559372, -1.646411, -1.301203, -1.124584,
+-1.147703, -1.729602, -1.436708, -1.318370,
+-1.055289, -1.385604, -2.780210, -1.079663,
+-1.351219, -1.531050, -1.461460, -1.228028,
+-1.549385, -1.617898, -1.361534, -1.099496,
+-1.151407, -1.754948, -1.411830, -1.319776,
+-1.050915, -1.401599, -2.688128, -1.090135,
+-1.241243, -1.573631, -1.409243, -1.349553,
+-1.545278, -1.646472, -1.275865, -1.155725,
+-1.115330, -1.725386, -1.467504, -1.333365,
+-1.013765, -1.454776, -2.790033, -1.072132,
+-1.297666, -1.591738, -1.465940, -1.229635,
+-1.566106, -1.635796, -1.326129, -1.106112,
+-1.132382, -1.694966, -1.435779, -1.361797,
+-1.035082, -1.415765, -2.750601, -1.084196,
+-1.263016, -1.542865, -1.379261, -1.379645,
+-1.538440, -1.652647, -1.277014, -1.155586,
+-1.158534, -1.717264, -1.426202, -1.323216,
+-1.035753, -1.442324, -2.874427, -1.043514,
+-1.320681, -1.606422, -1.415923, -1.238768,
+-1.559786, -1.633414, -1.371349, -1.076480,
+-1.132807, -1.742767, -1.395414, -1.366075,
+-1.040805, -1.433209, -2.716573, -1.072399,
+-1.302183, -1.590220, -1.350175, -1.328075,
+-1.528791, -1.664106, -1.280650, -1.152057,
+-1.155969, -1.720721, -1.472503, -1.283883,
+-1.041538, -1.462711, -2.761848, -1.043255,
+-1.365532, -1.606216, -1.425608, -1.191558,
+-1.565445, -1.628702, -1.355364, -1.087768,
+-1.183324, -1.735393, -1.405177, -1.301636,
+-1.029218, -1.411087, -2.845431, -1.076675,
+-1.293637, -1.628141, -1.329458, -1.328729,
+-1.570520, -1.639295, -1.281771, -1.138183,
+-1.184483, -1.772223, -1.389960, -1.290782,
+-1.045807, -1.419826, -2.801157, -1.061039,
+-1.367126, -1.625682, -1.400528, -1.197602,
+-1.553913, -1.626613, -1.335963, -1.111449,
+-1.147511, -1.738507, -1.419002, -1.328644,
+-1.028389, -1.418593, -2.796172, -1.080812,
+-1.237031, -1.594943, -1.390488, -1.355255,
+-1.577765, -1.673013, -1.259595, -1.132848,
+-1.188812, -1.715508, -1.409654, -1.304466,
+-1.065507, -1.413570, -2.871109, -1.034124,
+-1.355146, -1.621385, -1.426527, -1.189662,
+-1.563022, -1.578627, -1.369474, -1.108665,
+-1.172417, -1.736817, -1.379024, -1.337588,
+-1.007130, -1.421078, -2.740301, -1.112520,
+-1.264406, -1.579098, -1.332746, -1.395617,
+-1.567721, -1.652838, -1.233003, -1.175845,
+-1.203550, -1.698913, -1.401067, -1.306903,
+-1.042003, -1.405527, -2.856929, -1.065459,
+-1.329379, -1.580363, -1.514550, -1.172633,
+-1.533140, -1.591912, -1.366855, -1.121780,
+-1.189564, -1.653730, -1.417380, -1.339552,
+-1.076494, -1.348020, -2.793821, -1.083908,
+-1.335665, -1.489445, -1.400451, -1.327900,
+-1.564922, -1.589900, -1.259134, -1.193858,
+-1.165974, -1.762895, -1.387537, -1.319968,
+-1.060081, -1.440125, -2.778800, -1.036874,
+-1.356823, -1.631915, -1.476295, -1.144179,
+-1.552356, -1.644394, -1.317984, -1.116427,
+-1.128674, -1.567696, -1.470539, -1.434596,
+-0.991671, -1.306884, -2.611551, -1.255403,
+-1.266786, -1.487478, -1.395204, -1.408377,
+-1.601340, -1.509013, -1.205371, -1.281340,
+-1.131920, -2.753591, -1.393460, -1.006017,
+-0.901514, -2.705863, -2.954560, -0.744164,
+-1.265302, -2.731009, -1.407433, -0.896680,
+-1.460255, -2.900763, -1.202876, -0.885489};
+
+double score_in_don[NUM_VALUES_SCORES] =
+{-0.478520, 0.000100, 0.000100, 0.000100,
+-3.470547, 0.000100, 0.000100, 0.000100,
+-1.146963, 0.000100, 0.000100, 0.000100,
+-3.454598, 0.000100, 0.000100, 0.000100,
+-0.415031, -2.375713, -2.259474, -1.949628,
+-0.328148, -2.520914, -3.174837, -1.847972,
+-0.183708, -3.100533, -2.491223, -3.219190,
+-0.935461, -2.922524, -1.451677, -1.140622,
+-2.447088, -3.145141, -0.202256, -2.927916,
+-1.341329, -1.830612, -0.989187, -1.578387,
+-2.560945, -3.098087, -0.182121, -3.120559,
+-2.420583, -2.470345, -0.358279, -2.058100,
+-1.370294, -1.902511, -1.226400, -1.192613,
+-1.203973, -1.382792, -2.171248, -1.093363,
+-1.716096, -2.018802, -1.959097, -0.604343,
+-1.670320, -2.090491, -1.139263, -0.999314,
+-1.061786, -1.735801, -1.364941, -1.502794,
+-1.011423, -1.508000, -2.117894, -1.221927,
+-1.121546, -1.432701, -1.521532, -1.527011,
+-1.171592, -1.915998, -1.144701, -1.493377,
+-1.360640, -1.488907, -1.564893, -1.175145,
+-1.235830, -1.235830, -2.437877, -1.104227,
+-1.434473, -1.351348, -1.381335, -1.379815,
+-1.703031, -1.583154, -1.339306, -1.048351,
+-1.244275, -1.529555, -1.482686, -1.316055,
+-1.292946, -1.202040, -2.470091, -1.077688,
+-1.391529, -1.361451, -1.216870, -1.615149,
+-1.828850, -1.450746, -1.320862, -1.084409,
+-1.227998, -1.585702, -1.417540, -1.347041,
+-1.264398, -1.287256, -2.297413, -1.075741,
+-1.543335, -1.419175, -1.116076, -1.528520,
+-1.754513, -1.551330, -1.286488, -1.082335,
+-1.185089, -1.636288, -1.392199, -1.382098,
+-1.196416, -1.309063, -2.332659, -1.106804,
+-1.515828, -1.347139, -1.162641, -1.571509,
+-1.763248, -1.524356, -1.332962, -1.058267,
+-1.240804, -1.586059, -1.412742, -1.336961,
+-1.214265, -1.328283, -2.226224, -1.108042,
+-1.498936, -1.398395, -1.172126, -1.514440,
+-1.773840, -1.670736, -1.297199, -0.997189,
+-1.269479, -1.582050, -1.406810, -1.314735,
+-1.290538, -1.268065, -2.425517, -1.035443,
+-1.530734, -1.437208, -1.178973, -1.433643,
+-1.763087, -1.526391, -1.403493, -1.006682,
+-1.243740, -1.555666, -1.404148, -1.366212,
+-1.256408, -1.207976, -2.514448, -1.091786,
+-1.503633, -1.421132, -1.193211, -1.456749,
+-1.745093, -1.608793, -1.326772, -1.021902,
+-1.259102, -1.638934, -1.326612, -1.360513,
+-1.268417, -1.248940, -2.424632, -1.068847,
+-1.447361, -1.453016, -1.188763, -1.485680,
+-1.712074, -1.579692, -1.415243, -0.992622,
+-1.264235, -1.647912, -1.350045, -1.324767,
+-1.299575, -1.237346, -2.416001, -1.055779,
+-1.544733, -1.567819, -1.077587, -1.436679,
+-1.729687, -1.591537, -1.316842, -1.046777,
+-1.231284, -1.622855, -1.382004, -1.348331,
+-1.277873, -1.307432, -2.381351, -1.026019,
+-1.508645, -1.506774, -1.163648, -1.407224,
+-1.717651, -1.672881, -1.375180, -0.967550,
+-1.184668, -1.671036, -1.454626, -1.299704,
+-1.270001, -1.253687, -2.391908, -1.072175,
+-1.531260, -1.531260, -1.103382, -1.445082,
+-1.821439, -1.620947, -1.293472, -1.004632,
+-1.247869, -1.592098, -1.386294, -1.349439,
+-1.285058, -1.285058, -2.422428, -1.027153,
+-1.577637, -1.420008, -1.169465, -1.421745,
+-1.678523, -1.571602, -1.419905, -1.010872,
+-1.226678, -1.685121, -1.385442, -1.305269,
+-1.277200, -1.244964, -2.422109, -1.065668,
+-1.614304, -1.503195, -1.127262, -1.367929,
+-1.658876, -1.550967, -1.359779, -1.076204,
+-1.276047, -1.606486, -1.405050, -1.291291,
+-1.339533, -1.190471, -2.533962, -1.036737,
+-1.521923, -1.602131, -1.119097, -1.371733,
+-1.681636, -1.615341, -1.350955, -1.032501,
+-1.175866, -1.686227, -1.453386, -1.300286,
+-1.244745, -1.261973, -2.624847, -1.031589,
+-1.568703, -1.556306, -1.078886, -1.423802,
+-1.676537, -1.560465, -1.356065, -1.063433,
+-1.256683, -1.703513, -1.360277, -1.284565,
+-1.292962, -1.301247, -2.324564, -1.034159,
+-1.614676, -1.499083, -1.086496, -1.425811,
+-1.667153, -1.636538, -1.341347, -1.035440,
+-1.241680, -1.714656, -1.301285, -1.351336,
+-1.232077, -1.294495, -2.448871, -1.055879,
+-1.524567, -1.503870, -1.171130, -1.386294,
+-1.691123, -1.745093, -1.287411, -1.007655,
+-1.265136, -1.682488, -1.349867, -1.299700,
+-1.204612, -1.356103, -2.345644, -1.059171,
+-1.483668, -1.571011, -1.124089, -1.425400,
+-1.682649, -1.553702, -1.317474, -1.094089,
+-1.223528, -1.620625, -1.345522, -1.395816,
+-1.205999, -1.341092, -2.454092, -1.040987,
+-1.488605, -1.520002, -1.120181, -1.472374,
+-1.641671, -1.554229, -1.385961, -1.064477,
+-1.240167, -1.667425, -1.369139, -1.318241,
+-1.250047, -1.285477, -2.415732, -1.056449,
+-1.541859, -1.535936, -1.135734, -1.387991,
+-1.688436, -1.629647, -1.323538, -1.041284,
+-1.295957, -1.648076, -1.370182, -1.273382,
+-1.162504, -1.300053, -2.605001, -1.076232,
+-1.562899, -1.535119, -1.146096, -1.358030,
+-1.695322, -1.629894, -1.248039, -1.098286,
+-1.270045, -1.623324, -1.353815, -1.332997,
+-1.228781, -1.376632, -2.537588, -0.978520,
+-1.528723, -1.521075, -1.151155, -1.392556,
+-1.608123, -1.683116, -1.322907, -1.056752,
+-1.207401, -1.720060, -1.348813, -1.337798,
+-1.269988, -1.263254, -2.572090, -1.021493,
+-1.495005, -1.561435, -1.171580, -1.362419,
+-1.628337, -1.661785, -1.326772, -1.053796,
+-1.220525, -1.688599, -1.349927, -1.343667,
+-1.204761, -1.314689, -2.348542, -1.090138,
+-1.472608, -1.533930, -1.182606, -1.392280,
+-1.621516, -1.673760, -1.309502, -1.064484,
+-1.234769, -1.661025, -1.421215, -1.281093,
+-1.265666, -1.342627, -2.400272, -1.004857,
+-1.550538, -1.628446, -1.105751, -1.344502,
+-1.679551, -1.590604, -1.304970, -1.083057,
+-1.228979, -1.694131, -1.355383, -1.325005,
+-1.210006, -1.317491, -2.476396, -1.049958,
+-1.448170, -1.508462, -1.143348, -1.491655,
+-1.662343, -1.643839, -1.295533, -1.069100,
+-1.199949, -1.693521, -1.384683, -1.329853,
+-1.167471, -1.309646, -2.483970, -1.092073,
+-1.494983, -1.556070, -1.117054, -1.437413,
+-1.653371, -1.620581, -1.337152, -1.054920,
+-1.227113, -1.743366, -1.341687, -1.307381,
+-1.206638, -1.305565, -2.476022, -1.062178,
+-1.491702, -1.630339, -1.100725, -1.400322,
+-1.581649, -1.657893, -1.343288, -1.070497,
+-1.194240, -1.712510, -1.325530, -1.382337,
+-1.202348, -1.366014, -2.466539, -1.023007,
+-1.473154, -1.602804, -1.109066, -1.428782,
+-1.632520, -1.686399, -1.271455, -1.081888,
+-1.266135, -1.699409, -1.363557, -1.274610,
+-1.201668, -1.252269, -2.464348, -1.113560,
+-1.426111, -1.577752, -1.188665, -1.391539,
+-1.647515, -1.658060, -1.272821, -1.087899,
+-1.219364, -1.781363, -1.343802, -1.289816,
+-1.236605, -1.274721, -2.411203, -1.077588,
+-1.426147, -1.611935, -1.151088, -1.410696,
+-1.670858, -1.658479, -1.265393, -1.080701,
+-1.176248, -1.772666, -1.361700, -1.326284,
+-1.145826, -1.372794, -2.567866, -1.044290,
+-1.489247, -1.513124, -1.195684, -1.379360,
+-1.580593, -1.613437, -1.359714, -1.084086,
+-1.179716, -1.760252, -1.329693, -1.362289,
+-1.204934, -1.281510, -2.464934, -1.085727,
+-1.446594, -1.590971, -1.184088, -1.366976,
+-1.622109, -1.607121, -1.311619, -1.100602,
+-1.203973, -1.807508, -1.370790, -1.265737,
+-1.232484, -1.239041, -2.519317, -1.083951,
+-1.423066, -1.512298, -1.236110, -1.393880,
+-1.706609, -1.604045, -1.272387, -1.086853,
+-1.228766, -1.655022, -1.320652, -1.389108,
+-1.258834, -1.217094, -2.451936, -1.097217,
+-1.410517, -1.553348, -1.227029, -1.381179,
+-1.616413, -1.542544, -1.343845, -1.118673,
+-1.185551, -1.734468, -1.365009, -1.337314,
+-1.173233, -1.285471, -2.532632, -1.094626,
+-1.522534, -1.558683, -1.178763, -1.332680,
+-1.633058, -1.612188, -1.309982, -1.092452,
+-1.171691, -1.657834, -1.403769, -1.370669,
+-1.174450, -1.300220, -2.536791, -1.080519,
+-1.441655, -1.584336, -1.159452, -1.407632,
+-1.663487, -1.683008, -1.279935, -1.059430,
+-1.189139, -1.703385, -1.357474, -1.362030,
+-1.169648, -1.307571, -2.593446, -1.066283,
+-1.406878, -1.566242, -1.225135, -1.376159,
+-1.666063, -1.639769, -1.324782, -1.046614,
+-1.218194, -1.691620, -1.386683, -1.308916,
+-1.161347, -1.315245, -2.528106, -1.082589,
+-1.398978, -1.575681, -1.187166, -1.422131,
+-1.638318, -1.655501, -1.296748, -1.075059,
+-1.190472, -1.789820, -1.340375, -1.319816,
+-1.217771, -1.332328, -2.504212, -1.025803,
+-1.517203, -1.571270, -1.168552, -1.339114,
+-1.651736, -1.690817, -1.261485, -1.076808,
+-1.230710, -1.662657, -1.375505, -1.325882,
+-1.182748, -1.266972, -2.586972, -1.088760,
+-1.452929, -1.586965, -1.209498, -1.334753,
+-1.551249, -1.616300, -1.293979, -1.154526,
+-1.210366, -1.722064, -1.367357, -1.315145,
+-1.171574, -1.374252, -2.559373, -1.022376,
+-1.417919, -1.611572, -1.220455, -1.335172,
+-1.578029, -1.646082, -1.301095, -1.112972,
+-1.190627, -1.721002, -1.354930, -1.350475,
+-1.205775, -1.263126, -2.544059, -1.081094,
+-1.378114, -1.591443, -1.189522, -1.427333,
+-1.614376, -1.659954, -1.231159, -1.142930,
+-1.164000, -1.761725, -1.372115, -1.337706,
+-1.158192, -1.270851, -2.690048, -1.086261,
+-1.425400, -1.574987, -1.218511, -1.359040,
+-1.611861, -1.691182, -1.290104, -1.076047,
+-1.192362, -1.705701, -1.370427, -1.343798,
+-1.156881, -1.304133, -2.623218, -1.074238,
+-1.381117, -1.533262, -1.278170, -1.369141,
+-1.570153, -1.680341, -1.294176, -1.103969,
+-1.215735, -1.721485, -1.356197, -1.320248,
+-1.225687, -1.319372, -2.636673, -1.000918,
+-1.430189, -1.599409, -1.187458, -1.371505,
+-1.543365, -1.642620, -1.316136, -1.124862,
+-1.191911, -1.707851, -1.402695, -1.312408,
+-1.169159, -1.323569, -2.482012, -1.079947,
+-1.415080, -1.547348, -1.237193, -1.370089,
+-1.666285, -1.696509, -1.254419, -1.071517,
+-1.184525, -1.722064, -1.398463, -1.315145,
+-1.154293, -1.399927, -2.584481, -1.014264,
+-1.359783, -1.543435, -1.232083, -1.435682,
+-1.656493, -1.651308, -1.290503, -1.072179,
+-1.217448, -1.722690, -1.341061, -1.332328,
+-1.203485, -1.371278, -2.489308, -1.013066,
+-1.362713, -1.640277, -1.240046, -1.344241,
+-1.596653, -1.618053, -1.260879, -1.152845,
+-1.249875, -1.703592, -1.302373, -1.348825,
+-1.136674, -1.334117, -2.518384, -1.093222,
+-1.393040, -1.534580, -1.264597, -1.371282,
+-1.642395, -1.611780, -1.306663, -1.089960,
+-1.198532, -1.718877, -1.377757, -1.320685,
+-1.170544, -1.394829, -2.487743, -1.024909,
+-1.458147, -1.469177, -1.273391, -1.357368,
+-1.619607, -1.609438, -1.291444, -1.117336,
+-1.269075, -1.653270, -1.317379, -1.348424,
+-1.209810, -1.311310, -2.517036, -1.045315,
+-1.343662, -1.577716, -1.288453, -1.358801,
+-1.616082, -1.612755, -1.301272, -1.109265,
+-1.180224, -1.806762, -1.359997, -1.302161,
+-1.198376, -1.325505, -2.567759, -1.032993,
+-1.372981, -1.521742, -1.278602, -1.386727,
+-1.677430, -1.615126, -1.305867, -1.069024,
+-1.261931, -1.705137, -1.335934, -1.301097,
+-1.148696, -1.433362, -2.451005, -1.026559,
+-1.462535, -1.551791, -1.220966, -1.341174,
+-1.601954, -1.624906, -1.327323, -1.089509,
+-1.211231, -1.751174, -1.350265, -1.311454,
+-1.188331, -1.308918, -2.563487, -1.055200,
+-1.408767, -1.648659, -1.198291, -1.341174,
+-1.635473, -1.596188, -1.296143, -1.112040,
+-1.182386, -1.710843, -1.385902, -1.336963,
+-1.279484, -1.281224, -2.583997, -0.997907,
+-1.400794, -1.552656, -1.219045, -1.400794,
+-1.616082, -1.663873, -1.334670, -1.053614,
+-1.257500, -1.634968, -1.357948, -1.333742,
+-1.189991, -1.231398, -2.635740, -1.101940,
+-1.456475, -1.599431, -1.248835, -1.279798,
+-1.597124, -1.568551, -1.314285, -1.137846,
+-1.180544, -1.671728, -1.452397, -1.305793,
+-1.161614, -1.294320, -2.536414, -1.097256,
+-1.388047, -1.559898, -1.313689, -1.303964,
+-1.646252, -1.651474, -1.318999, -1.055384,
+-1.221098, -1.658899, -1.404459, -1.312132,
+-1.147145, -1.301788, -2.688082, -1.071771,
+-1.357766, -1.578680, -1.243163, -1.394134,
+-1.593334, -1.571217, -1.372110, -1.092462,
+-1.187524, -1.771086, -1.267879, -1.412850,
+-1.165925, -1.333248, -2.515851, -1.067253,
+-1.447831, -1.558372, -1.229349, -1.339567,
+-1.640178, -1.603137, -1.335068, -1.073968,
+-1.183615, -1.701648, -1.338188, -1.389793,
+-1.262881, -1.269684, -2.531016, -1.030963,
+-1.391844, -1.557187, -1.280168, -1.336876,
+-1.667305, -1.593831, -1.297945, -1.093605,
+-1.225439, -1.652703, -1.392248, -1.323043,
+-1.138797, -1.377988, -2.512151, -1.059534,
+-1.453866, -1.538860, -1.291811, -1.283967,
+-1.617459, -1.547796, -1.362609, -1.099945,
+-1.186863, -1.690065, -1.380473, -1.351485,
+-1.170482, -1.404489, -2.709438, -0.973612,
+-1.453283, -1.515241, -1.252084, -1.345019,
+-1.641302, -1.670389, -1.306277, -1.057647,
+-1.245380, -1.705138, -1.366216, -1.289709,
+-1.228141, -1.314402, -2.698895, -0.990883,
+-1.466782, -1.544899, -1.263487, -1.297063,
+-1.650327, -1.548813, -1.332632, -1.103432,
+-1.220773, -1.797056, -1.375383, -1.250125,
+-1.229243, -1.384448, -2.538050, -0.972851,
+-1.386294, -1.581847, -1.275443, -1.327657,
+-1.599243, -1.659670, -1.354466, -1.050783,
+-1.251980, -1.749200, -1.348249, -1.271344,
+-1.207467, -1.340998, -2.620789, -1.003166,
+-1.422861, -1.586367, -1.320298, -1.247391,
+-1.646544, -1.656284, -1.297003, -1.069767,
+-1.175943, -1.715872, -1.408255, -1.320148,
+-1.099103, -1.391213, -2.513778, -1.087390,
+-1.368057, -1.556940, -1.357783, -1.282337,
+-1.582868, -1.609125, -1.305144, -1.128807,
+-1.189052, -1.731376, -1.347905, -1.352219,
+-1.199483, -1.336449, -2.646801, -1.007871,
+-1.387193, -1.607643, -1.276631, -1.306044,
+-1.611761, -1.585530, -1.337457, -1.099275,
+-1.173668, -1.760708, -1.386294, -1.313616,
+-1.168154, -1.397877, -2.644832, -0.991765,
+-1.473749, -1.497233, -1.243282, -1.351891,
+-1.645721, -1.550411, -1.321429, -1.114096,
+-1.197093, -1.758327, -1.349090, -1.323810,
+-1.188033, -1.340118, -2.607426, -1.022650,
+-1.411782, -1.551544, -1.320102, -1.282790,
+-1.609438, -1.582039, -1.340309, -1.100575,
+-1.226956, -1.769545, -1.325442, -1.306669,
+-1.217049, -1.229973, -2.595349, -1.087740,
+-1.364081, -1.534176, -1.249623, -1.418246,
+-1.631705, -1.580080, -1.354424, -1.077782,
+-1.230299, -1.688045, -1.363447, -1.319995,
+-1.209053, -1.330314, -2.501159, -1.035255,
+-1.433944, -1.471472, -1.277428, -1.373211,
+-1.628974, -1.677764, -1.312377, -1.055844,
+-1.183717, -1.700598, -1.355487, -1.372528,
+-1.156130, -1.309038, -2.551419, -1.086938,
+-1.335265, -1.532412, -1.281550, -1.413498,
+-1.663939, -1.633480, -1.390534, -1.003983,
+-1.234816, -1.732189, -1.298394, -1.349981,
+-1.210419, -1.308098, -2.665255, -1.016077,
+-1.416106, -1.543009, -1.311864, -1.293576,
+-1.650438, -1.641899, -1.310541, -1.064941,
+-1.174096, -1.728264, -1.353851, -1.366235,
+-1.215146, -1.332528, -2.722439, -0.984009,
+-1.366741, -1.561881, -1.347985, -1.288952,
+-1.641852, -1.599500, -1.305855, -1.098290,
+-1.518318, 0.000100, 0.000100, 0.000100,
+-1.650989, 0.000100, 0.000100, 0.000100,
+-1.228402, 0.000100, 0.000100, 0.000100,
+-1.216383, 0.000100, 0.000100, 0.000100,
+-1.156200, -1.788296, -1.501707, -1.219676,
+-1.085152, -1.409175, -2.928513, -1.009700,
+-1.359202, -1.665857, -1.398529, -1.180433,
+-1.588399, -1.610113, -1.532489, -0.967886,
+-1.195646, -1.629226, -1.437924, -1.331844,
+-1.045906, -1.402171, -2.878129, -1.060378,
+-1.304552, -1.549151, -1.341843, -1.366829,
+-1.600224, -1.601107, -1.285358, -1.139672,
+-1.175097, -1.835931, -1.409790, -1.246354,
+-1.099655, -1.419533, -2.840918, -1.002897,
+-1.366974, -1.666650, -1.447375, -1.136135,
+-1.574024, -1.622230, -1.372203, -1.073574,
+-1.158644, -1.718377, -1.426295, -1.322253,
+-0.994002, -1.441001, -2.797775, -1.101803,
+-1.285864, -1.604010, -1.342650, -1.341906,
+-1.551531, -1.705969, -1.221082, -1.166133,
+-1.173539, -1.749182, -1.426353, -1.285074,
+-1.020789, -1.462937, -2.878790, -1.044319,
+-1.337747, -1.600078, -1.440283, -1.207906,
+-1.534570, -1.623598, -1.400524, -1.076472,
+-1.155850, -1.752763, -1.407940, -1.319500,
+-1.056948, -1.422322, -2.716046, -1.063643,
+-1.242005, -1.579490, -1.397563, -1.355118,
+-1.566043, -1.664274, -1.253461, -1.151105,
+-1.178903, -1.767702, -1.417641, -1.275203,
+-1.017385, -1.426898, -2.732214, -1.098612,
+-1.337898, -1.663610, -1.408090, -1.192244,
+-1.532042, -1.609303, -1.347952, -1.126681,
+-1.170419, -1.682621, -1.438948, -1.321631,
+-1.011322, -1.395969, -2.827050, -1.110233,
+-1.257048, -1.544841, -1.414122, -1.350840,
+-1.561129, -1.634762, -1.267677, -1.159581,
+-1.181508, -1.744166, -1.413459, -1.290645,
+-1.047779, -1.452279, -2.863266, -1.026760,
+-1.353832, -1.608056, -1.445722, -1.184502,
+-1.569288, -1.631044, -1.341875, -1.094442,
+-1.157210, -1.695319, -1.385427, -1.378802,
+-1.019484, -1.460449, -2.739130, -1.071600,
+-1.299219, -1.553907, -1.398654, -1.313394,
+-1.571478, -1.611526, -1.256183, -1.177849,
+-1.173777, -1.681616, -1.470781, -1.291042,
+-1.062049, -1.393770, -2.872693, -1.051055,
+-1.333589, -1.613382, -1.449718, -1.195274,
+-1.517493, -1.630471, -1.375772, -1.101846,
+-1.151812, -1.722895, -1.421518, -1.331653,
+-1.069317, -1.440340, -2.777490, -1.028017,
+-1.310492, -1.545073, -1.396636, -1.310859,
+-1.573239, -1.645737, -1.280487, -1.133653,
+-1.163424, -1.716389, -1.419252, -1.324347,
+-1.054260, -1.390561, -2.805094, -1.072614,
+-1.304788, -1.619053, -1.426564, -1.235976,
+-1.556064, -1.579214, -1.349738, -1.128279,
+-1.134084, -1.752204, -1.418147, -1.336631,
+-1.046328, -1.371239, -2.738557, -1.107697,
+-1.262807, -1.569511, -1.399226, -1.338624,
+-1.561983, -1.634813, -1.261913, -1.164177,
+-1.149102, -1.730105, -1.435239, -1.317684,
+-1.048281, -1.411939, -2.773963, -1.068930,
+-1.344283, -1.624986, -1.400823, -1.217514,
+-1.558488, -1.576909, -1.360161, -1.119896,
+-1.143615, -1.740395, -1.407704, -1.342547,
+-1.024624, -1.417717, -2.743840, -1.095181,
+-1.273648, -1.580273, -1.391559, -1.325821,
+-1.551185, -1.647419, -1.288679, -1.139989,
+-1.158992, -1.702199, -1.438096, -1.322245,
+-1.023658, -1.402571, -2.801916, -1.096422,
+-1.304857, -1.576896, -1.427636, -1.264792,
+-1.521862, -1.661267, -1.343430, -1.106098,
+-1.138200, -1.741679, -1.427852, -1.329704,
+-1.047245, -1.390151, -2.798486, -1.081292,
+-1.285624, -1.535460, -1.407041, -1.334466,
+-1.545106, -1.636580, -1.317879, -1.125890,
+-1.176447, -1.702330, -1.425002, -1.313561,
+-1.063059, -1.399984, -2.892036, -1.042596,
+-1.309093, -1.619154, -1.422325, -1.235408,
+-1.547213, -1.593763, -1.348497, -1.125871,
+-1.132469, -1.789614, -1.419485, -1.313414,
+-1.019486, -1.410987, -2.779834, -1.098807,
+-1.279151, -1.548954, -1.395255, -1.341371,
+-1.575319, -1.631627, -1.310876, -1.115158,
+-1.153088, -1.737641, -1.409965, -1.330849,
+-1.056724, -1.368546, -2.866388, -1.075795,
+-1.298011, -1.601275, -1.388999, -1.287493,
+-1.528927, -1.623465, -1.369626, -1.103176,
+-1.128337, -1.735637, -1.420318, -1.352845,
+-1.008316, -1.398550, -2.865152, -1.104919,
+-1.251055, -1.556816, -1.404598, -1.356637,
+-1.541305, -1.628064, -1.304234, -1.145047,
+-1.175293, -1.672557, -1.429697, -1.331338,
+-1.016490, -1.432441, -2.816684, -1.079964,
+-1.287691, -1.621387, -1.433402, -1.244891,
+-1.524094, -1.624874, -1.376315, -1.100400,
+-1.158464, -1.704643, -1.434707, -1.324223,
+-1.050512, -1.430522, -2.728158, -1.062132,
+-1.249473, -1.557418, -1.434639, -1.330052,
+-1.585114, -1.580638, -1.321346, -1.131790,
+-1.164100, -1.728435, -1.402215, -1.331113,
+-1.059808, -1.372555, -2.796179, -1.081825,
+-1.300133, -1.559798, -1.450537, -1.262727,
+-1.550550, -1.617361, -1.348561, -1.109167,
+-1.154306, -1.698963, -1.438007, -1.330101,
+-1.007274, -1.425422, -2.771039, -1.103274,
+-1.289683, -1.534921, -1.398789, -1.338369,
+-1.521240, -1.633265, -1.298097, -1.160858,
+-1.159115, -1.712770, -1.421435, -1.329901,
+-1.042185, -1.415047, -2.789384, -1.070190,
+-1.257303, -1.572847, -1.430407, -1.313361,
+-1.566241, -1.601882, -1.334236, -1.119954,
+-1.152726, -1.704643, -1.405726, -1.357905,
+-1.049265, -1.398059, -2.819276, -1.069792,
+-1.279642, -1.585487, -1.385914, -1.320806,
+-1.544011, -1.637666, -1.306899, -1.135113,
+-1.161998, -1.691435, -1.434283, -1.329543,
+-1.055501, -1.391015, -2.890764, -1.056629,
+-1.308657, -1.611665, -1.400201, -1.259785,
+-1.552974, -1.599023, -1.364183, -1.106546,
+-1.137087, -1.730976, -1.404839, -1.359731,
+-1.048713, -1.406299, -2.866579, -1.056502,
+-1.301594, -1.613486, -1.362064, -1.299828,
+-1.565280, -1.617398, -1.295468, -1.143234,
+-1.152845, -1.701040, -1.439481, -1.329089,
+-1.041800, -1.414177, -2.781930, -1.072546,
+-1.294359, -1.615712, -1.366353, -1.301431,
+-1.523239, -1.588825, -1.381001, -1.119316,
+-1.153405, -1.744755, -1.396023, -1.338754,
+-1.035950, -1.395026, -2.850257, -1.080414,
+-1.247154, -1.600895, -1.389541, -1.340277,
+-1.523609, -1.631927, -1.309119, -1.150540,
+-1.132256, -1.747000, -1.444114, -1.318847,
+-1.050389, -1.407550, -2.727585, -1.078576,
+-1.276440, -1.564233, -1.411729, -1.316764,
+-1.518904, -1.645848, -1.372724, -1.094271,
+-1.128775, -1.765789, -1.397534, -1.353583,
+-1.030513, -1.369269, -2.794565, -1.115529,
+-1.282375, -1.556538, -1.402843, -1.324784,
+-1.544759, -1.628896, -1.305972, -1.140743,
+-1.119645, -1.730341, -1.433179, -1.355451,
+-1.061603, -1.398643, -2.750161, -1.069297,
+-1.257066, -1.591139, -1.432208, -1.298145,
+-1.561296, -1.586714, -1.361100, -1.111209,
+-1.138677, -1.770062, -1.397005, -1.339034,
+-1.054093, -1.406581, -2.746608, -1.071872,
+-1.280665, -1.596271, -1.351932, -1.344167,
+-1.526404, -1.610039, -1.334293, -1.141123,
+-1.142144, -1.671001, -1.464105, -1.341236,
+-1.049070, -1.395982, -2.712133, -1.091383,
+-1.306748, -1.563136, -1.359548, -1.335425,
+-1.540016, -1.595523, -1.375589, -1.108271,
+-1.131113, -1.759395, -1.440197, -1.315664,
+-1.030385, -1.402887, -2.793080, -1.090608,
+-1.247253, -1.560457, -1.393368, -1.368732,
+-1.522937, -1.618093, -1.338765, -1.134808,
+-1.141393, -1.776498, -1.426213, -1.304974,
+-1.065803, -1.371063, -2.785661, -1.078740,
+-1.308561, -1.582943, -1.397689, -1.282821,
+-1.519913, -1.613985, -1.351621, -1.129001,
+-1.147093, -1.701519, -1.430650, -1.343688,
+-1.043280, -1.397598, -2.758296, -1.087337,
+-1.278849, -1.569831, -1.353456, -1.365644,
+-1.563026, -1.579930, -1.328687, -1.140420,
+-1.147734, -1.734766, -1.425837, -1.324650,
+-1.015115, -1.428704, -2.838127, -1.080326,
+-1.277021, -1.601112, -1.404764, -1.294495,
+-1.546907, -1.601277, -1.374686, -1.101005,
+-1.136711, -1.730971, -1.422518, -1.343584,
+-1.052933, -1.396624, -2.832028, -1.064898,
+-1.266755, -1.593073, -1.372877, -1.341021,
+-1.546281, -1.616671, -1.335252, -1.122974,
+-1.124485, -1.750968, -1.437326, -1.331760,
+-1.009070, -1.425936, -2.796789, -1.096155,
+-1.292527, -1.549514, -1.407414, -1.315660,
+-1.513669, -1.615169, -1.368900, -1.118849,
+-1.136501, -1.756182, -1.431777, -1.318737,
+-1.047698, -1.432997, -2.782393, -1.053334,
+-1.261736, -1.590033, -1.393524, -1.329083,
+-1.508654, -1.633979, -1.349313, -1.126325,
+-1.139590, -1.768739, -1.423105, -1.314766,
+-1.044402, -1.420913, -2.811372, -1.059924,
+-1.268332, -1.590288, -1.403405, -1.312769,
+-1.535330, -1.559744, -1.386494, -1.125545,
+-1.126691, -1.762346, -1.415594, -1.341424,
+-1.035708, -1.437582, -2.731503, -1.071768,
+-1.262056, -1.571996, -1.388213, -1.347929,
+-1.527746, -1.603740, -1.364278, -1.120038,
+-1.128189, -1.740604, -1.439334, -1.332256,
+-1.037474, -1.384311, -2.851044, -1.086572,
+-1.264251, -1.581786, -1.419796, -1.308673,
+-1.536749, -1.609101, -1.377915, -1.100364,
+-1.129624, -1.745882, -1.423683, -1.341211,
+-1.077492, -1.360869, -2.718186, -1.087318,
+-1.292844, -1.560837, -1.393002, -1.319675,
+-1.542755, -1.599767, -1.350726, -1.123275,
+-1.122939, -1.719909, -1.438148, -1.353912,
+-1.019076, -1.408987, -2.799644, -1.097066,
+-1.264904, -1.574877, -1.430926, -1.303377,
+-1.528319, -1.582163, -1.360264, -1.136386,
+-1.127941, -1.754431, -1.435112, -1.327246,
+-1.057786, -1.354909, -2.791271, -1.098225,
+-1.277865, -1.523990, -1.429649, -1.331297,
+-1.528063, -1.592185, -1.356935, -1.132845,
+-1.118198, -1.734416, -1.456629, -1.333292,
+-1.054306, -1.392925, -2.771918, -1.076823,
+-1.243939, -1.563312, -1.436878, -1.329371,
+-1.538914, -1.588965, -1.373533, -1.114625,
+-1.114174, -1.762750, -1.413050, -1.359296,
+-1.030937, -1.386101, -2.806270, -1.100066,
+-1.279643, -1.572263, -1.376749, -1.339828,
+-1.539308, -1.620469, -1.332956, -1.127110,
+-1.132293, -1.756575, -1.430088, -1.325073,
+-1.044530, -1.405092, -2.829869, -1.067760,
+-1.263073, -1.542070, -1.441361, -1.321732,
+-1.569515, -1.584682, -1.364866, -1.104350,
+-1.126733, -1.735999, -1.422784, -1.352304,
+-1.035509, -1.425352, -2.728281, -1.081169,
+-1.266783, -1.567128, -1.427648, -1.310268,
+-1.526242, -1.615034, -1.346828, -1.127934,
+-1.135978, -1.717888, -1.439778, -1.337629,
+-1.031355, -1.456076, -2.831441, -1.045832,
+-1.264621, -1.579378, -1.408499, -1.320355,
+-1.544289, -1.589774, -1.365707, -1.116681,
+-1.113647, -1.739169, -1.452574, -1.339383,
+-1.036056, -1.380603, -2.801541, -1.099597,
+-1.240759, -1.580230, -1.399468, -1.354101,
+-1.536242, -1.649324, -1.323536, -1.119510,
+-1.145446, -1.737165, -1.406948, -1.343198,
+-1.042674, -1.384714, -2.848554, -1.081267,
+-1.282963, -1.565872, -1.415031, -1.305778,
+-1.544226, -1.589703, -1.367956, -1.115016,
+-1.138113, -1.731687, -1.420056, -1.343653,
+-1.045275, -1.402423, -2.824428, -1.069844,
+-1.295123, -1.544398, -1.388588, -1.334581,
+-1.526150, -1.596662, -1.348254, -1.138296,
+-1.130550, -1.726069, -1.440548, -1.338023,
+-1.046438, -1.398475, -2.814218, -1.073270,
+-1.255229, -1.590107, -1.428286, -1.304286,
+-1.526205, -1.620987, -1.340161, -1.129688,
+-1.150000, -1.738616, -1.424460, -1.320651,
+-1.051213, -1.411066, -2.821005, -1.058266,
+-1.309351, -1.514259, -1.400509, -1.333466,
+-1.508507, -1.618113, -1.363914, -1.124456,
+-1.134824, -1.757423, -1.426076, -1.325076,
+-1.043255, -1.417321, -2.803214, -1.065032,
+-1.279713, -1.532647, -1.383815, -1.365321,
+-1.519064, -1.587143, -1.349879, -1.147900,
+-1.131152, -1.736176, -1.439274, -1.331632,
+-1.036649, -1.380280, -2.888622, -1.084123,
+-1.293743, -1.515937, -1.391231, -1.357177,
+-1.523053, -1.627413, -1.370635, -1.103910,
+-1.108869, -1.748558, -1.431883, -1.357962,
+-1.061592, -1.384073, -2.915935, -1.051578,
+-1.237868, -1.599547, -1.396187, -1.345264,
+-1.518417, -1.614892, -1.366809, -1.117458,
+-1.135170, -1.743547, -1.433209, -1.327303,
+-1.032677, -1.404282, -2.782984, -1.089006,
+-1.276197, -1.584373, -1.355692, -1.354570,
+-1.537405, -1.581161, -1.348853, -1.140098,
+-1.119314, -1.766928, -1.440377, -1.325008,
+-1.031262, -1.397101, -2.773566, -1.097538,
+-1.247002, -1.611318, -1.417301, -1.306953,
+-1.558331, -1.618170, -1.352569, -1.100584,
+-1.127350, -1.745691, -1.426519, -1.341543,
+-1.013821, -1.410808, -2.826955, -1.096522,
+-1.246982, -1.565019, -1.406036, -1.353121,
+-1.546611, -1.609172, -1.353099, -1.113097,
+-1.106069, -1.768911, -1.425146, -1.354210,
+-1.041831, -1.416955, -2.763873, -1.073840,
+-1.267899, -1.586226, -1.391059, -1.327764,
+-1.528048, -1.581214, -1.378274, -1.122999,
+-1.097651, -1.755814, -1.460312, -1.341643,
+-1.042227, -1.411974, -2.852335, -1.061431,
+-1.257354, -1.559004, -1.404205, -1.348278,
+-1.539860, -1.633473, -1.350901, -1.104635,
+-1.117918, -1.750674, -1.448205, -1.330325,
+-1.037544, -1.390667, -2.754117, -1.099307,
+-1.252474, -1.609727, -1.405165, -1.313288,
+-1.544736, -1.598525, -1.341162, -1.130426,
+-1.115304, -1.712051, -1.448541, -1.359550,
+-1.043741, -1.401096, -2.844219, -1.068980,
+-1.271704, -1.585130, -1.402577, -1.313925,
+-1.537842, -1.601314, -1.357195, -1.120420,
+-1.118869, -1.711025, -1.448926, -1.355384,
+-1.055699, -1.397169, -2.892420, -1.051787,
+-1.268116, -1.602063, -1.395080, -1.311761,
+-1.534456, -1.595880, -1.366850, -1.118442,
+-1.122673, -1.746898, -1.441286, -1.333118,
+-1.041974, -1.407369, -2.817838, -1.070839,
+-1.288822, -1.542899, -1.387650, -1.343299,
+-1.511921, -1.608170, -1.333747, -1.152735,
+-1.115062, -1.751051, -1.457413, -1.325479,
+-1.023172, -1.415943, -2.803849, -1.086859,
+-1.247188, -1.612570, -1.393999, -1.327148,
+-1.533530, -1.625640, -1.360522, -1.105894,
+-1.116745, -1.757243, -1.436452, -1.338012,
+-1.045930, -1.409458, -2.877161, -1.055365,
+-1.263699, -1.558217, -1.401522, -1.344537,
+-1.537866, -1.617753, -1.335388, -1.127749,
+-1.108063, -1.732413, -1.449533, -1.353768,
+-1.042162, -1.399918, -2.825833, -1.074606,
+-1.284279, -1.552279, -1.421440, -1.309263,
+-1.542621, -1.605882, -1.357213, -1.114466,
+-1.119742, -1.716693, -1.439762, -1.358708,
+-1.044607, -1.383184, -2.799425, -1.089021,
+-1.257743, -1.590896, -1.395404, -1.330938,
+-1.526378, -1.632356, -1.339031, -1.123590,
+-1.126907, -1.782094, -1.445095, -1.302111,
+-1.023461, -1.422535, -2.841483, -1.075257,
+-1.252573, -1.602432, -1.389148, -1.333558,
+-1.510365, -1.626385, -1.363120, -1.118804,
+-1.121348, -1.743086, -1.448594, -1.330747,
+-1.053977, -1.423663, -2.819205, -1.047062,
+-1.260617, -1.553619, -1.412063, -1.341705,
+-1.532467, -1.615983, -1.347069, -1.123004,
+-1.118920, -1.773352, -1.416841, -1.342753,
+-1.036011, -1.408534, -2.776839, -1.083538,
+-1.241985, -1.598424, -1.402658, -1.335479,
+-1.557092, -1.603839, -1.371762, -1.095209};
+
diff --git a/libsim4/sim4core/sites_score.H b/libsim4/sim4core/sites_score.H
new file mode 100644
index 0000000..1e3924a
--- /dev/null
+++ b/libsim4/sim4core/sites_score.H
@@ -0,0 +1,13 @@
+#ifndef SITES_SCORE_H
+#define SITES_SCORE_H
+
+/* DO NOT REMOVE or MODIFY !!!! */
+
+/* Number of entries in each of the score tables declared below. */
+#define NUM_VALUES_SCORES 2560
+
+/* Four tables of doubles, declared here and defined in another file.
+   NOTE(review): the names suggest exon/intron ("ex"/"in") crossed with
+   acceptor/donor ("acc"/"don") -- confirm against the definitions. */
+extern double score_ex_acc[NUM_VALUES_SCORES];
+extern double score_in_acc[NUM_VALUES_SCORES];
+extern double score_ex_don[NUM_VALUES_SCORES];
+extern double score_in_don[NUM_VALUES_SCORES];
+
+#endif /* SITES_SCORE_H */
diff --git a/libsim4/sim4core/splice.C b/libsim4/sim4core/splice.C
new file mode 100644
index 0000000..45db0bf
--- /dev/null
+++ b/libsim4/sim4core/splice.C
@@ -0,0 +1,791 @@
+#include <pthread.h>
+#include "sim4.H"
+
+/* Span constants for the splice models.  In the part of this file that
+   follows, GLIMMER_XSPAN is the third argument passed to Score_Window() when
+   scoring donors and GLIMMER_ISPAN is the one passed when scoring acceptors.
+   NOTE(review): the uses of GENESPLICER_SPAN, GLIMMER_SPAN and S4_SPAN are
+   further down the file -- confirm their meaning there before changing them. */
+#define GENESPLICER_SPAN 80
+#define GLIMMER_XSPAN 30
+#define GLIMMER_ISPAN 20
+#define GLIMMER_SPAN 30
+#define S4_SPAN 0
+/* #define MAX_SPAN 80 Now defined in sim4.H */
+
+static int spl_encode[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [...]
+static int rev_compl[256] = { 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 71, 84, 84, 84, 67, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 65, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 71, 84, 84, 84, 67, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, [...]
+static int spliceInit = 0;
+
+/* Constant 5x5 score tables, one per dinucleotide named by the identifier.
+   With the numbering A=0, C=1, G=2, T=3 (the one written into spl_encode by
+   the disabled generator in splice_init), each table holds its single 5 at
+   [first base][second base]: gt[2][3], ct[1][3], ag[0][2], ac[0][1].  The
+   rest of that row and column holds 2, except gt[2][1] and ac[2][1] which
+   hold 3; all remaining cells are 0.
+   NOTE(review): the meaning of index 4 and the exact indexing used by the
+   callers are not visible here -- confirm where the tables are read. */
+int const gt[5][5] = {{0, 0, 0, 2, 0},
+ {0, 0, 0, 2, 0},
+ {2, 3, 2, 5, 2},
+ {0, 0, 0, 2, 0},
+ {0, 0, 0, 2, 0}};
+int const ct[5][5] = {{0, 0, 0, 2, 0},
+ {2, 2, 2, 5, 2},
+ {0, 0, 0, 2, 0},
+ {0, 0, 0, 2, 0},
+ {0, 0, 0, 2, 0}};
+int const ag[5][5] = {{2, 2, 5, 2, 2},
+ {0, 0, 2, 0, 0},
+ {0, 0, 2, 0, 0},
+ {0, 0, 2, 0, 0},
+ {0, 0, 2, 0, 0}};
+int const ac[5][5] = {{2, 5, 2, 2, 2},
+ {0, 2, 0, 0, 0},
+ {0, 3, 0, 0, 0},
+ {0, 2, 0, 0, 0},
+ {0, 2, 0, 0, 0}};
+
+
+/* Disabled 4x4 versions of the same tables (no fifth index, and 2 where the
+   5x5 gt and ac tables hold 3). */
+#if 0
+int const gt[4][4] = {{0, 0, 0, 2},{0, 0, 0, 2},{2, 2, 2, 5},{0, 0, 0, 2}};
+int const ct[4][4] = {{0, 0, 0, 2},{2, 2, 2, 5},{0, 0, 0, 2},{0, 0, 0, 2}};
+int const ag[4][4] = {{2, 2, 5, 2},{0, 0, 2, 0},{0, 0, 2, 0},{0, 0, 2, 0}};
+int const ac[4][4] = {{2, 5, 2, 2},{0, 2, 0, 0},{0, 2, 0, 0},{0, 2, 0, 0}};
+#endif
+
+
+/* GLIMMER functions - move to glimmer.h? */
+
+/* Directory that splice_init() passes to loadGlimmerModel(), and the names of
+   the four model files that loadGlimmerModel() appends to a directory with a
+   '/' separator. */
+static char Glimmer_TRAIN_DIR[] = "./GlimmerModels/";
+static char Glimmer_posDonModelPath[] = "donors.162.pos.icm";
+static char Glimmer_negDonModelPath[] = "donors.162.neg.icm";
+static char Glimmer_posAccModelPath[] = "acceptors.162.pos.icm";
+static char Glimmer_negAccModelPath[] = "acceptors.162.neg.icm";
+
+/* Model objects filled by loadGlimmerModel() via readModel(), and read by
+   ScoreDonor_Glimmer() / ScoreAcceptor_Glimmer(). */
+struct Fixed_Length_ICM_t donor_pos_model, donor_neg_model;
+struct Fixed_Length_ICM_t acceptor_pos_model, acceptor_neg_model;
+/* Lengths reported by getModelLength() for the models above. */
+int donor_pos_model_len, donor_neg_model_len;
+int acceptor_pos_model_len, acceptor_neg_model_len;
+/* Set to 1 by loadGlimmerModel() once all four models are loaded. */
+int initGlimmerModel = 0;
+
+/* Load the four Glimmer ICM splice-site models (positive and negative donor,
+   positive and negative acceptor) into the file-level model objects and
+   record their lengths.
+
+   train_dir : directory holding the model files; each file name is appended
+               to it after a '/' separator.
+
+   Returns immediately if initGlimmerModel is already set.  Calls fatal() when
+   a model path does not fit in the local buffer, or when the positive and
+   negative model of a pair report different lengths.  The caller visible in
+   this file, splice_init(), invokes this while holding the splice mutex. */
+void
+Sim4::loadGlimmerModel (char *train_dir)
+{
+  char filename[1000];
+
+  if (initGlimmerModel)
+    return;
+
+  /* LLL is this still needed? Yes, since it is initialized in the class Sim4*/
+
+  /* Load order is unchanged: donor pos/neg, then acceptor pos/neg. */
+  struct Fixed_Length_ICM_t *models[4] = { &donor_pos_model,
+                                            &donor_neg_model,
+                                            &acceptor_pos_model,
+                                            &acceptor_neg_model };
+  char *names[4] = { Glimmer_posDonModelPath,
+                     Glimmer_negDonModelPath,
+                     Glimmer_posAccModelPath,
+                     Glimmer_negAccModelPath };
+
+  for (int k=0; k<4; k++) {
+    /* Bounded formatting: a long train_dir must not overrun filename[]. */
+    int len = snprintf(filename, sizeof(filename), "%s/%s", train_dir, names[k]);
+
+    if ((len < 0) || ((size_t)len >= sizeof(filename)))
+      fatal ("ERROR: Glimmer model file name is too long\n");
+
+    readModel (models[k], filename);
+  }
+
+  donor_pos_model_len = getModelLength (donor_pos_model);
+  donor_neg_model_len = getModelLength (donor_neg_model);
+  acceptor_pos_model_len = getModelLength (acceptor_pos_model);
+  acceptor_neg_model_len = getModelLength (acceptor_neg_model);
+
+  if (donor_pos_model_len!=donor_neg_model_len)
+    fatal ("ERROR: Positive and negative donor model lengths differ\n");
+  if (acceptor_pos_model_len!=acceptor_neg_model_len)
+    fatal ("ERROR: Positive and negative acceptor model lengths differ\n");
+
+  initGlimmerModel = 1;
+}
+
+/* Glimmer donor score of asegment: Score_Window() under the positive donor
+   model minus Score_Window() under the negative donor model, both called
+   with GLIMMER_XSPAN.  train_dir is accepted but not used. */
+double
+Sim4::ScoreDonor_Glimmer (char *asegment, char *train_dir)
+{
+  double const signal     = Score_Window (donor_pos_model, asegment, GLIMMER_XSPAN);
+  double const background = Score_Window (donor_neg_model, asegment, GLIMMER_XSPAN);
+  double const margin     = signal - background;
+
+  return margin;
+}
+
+/* Glimmer acceptor score of asegment: Score_Window() under the positive
+   acceptor model minus Score_Window() under the negative acceptor model,
+   both called with GLIMMER_ISPAN.  train_dir is accepted but not used. */
+double
+Sim4::ScoreAcceptor_Glimmer (char *asegment, char *train_dir)
+{
+  double const signal     = Score_Window (acceptor_pos_model, asegment, GLIMMER_ISPAN);
+  double const background = Score_Window (acceptor_neg_model, asegment, GLIMMER_ISPAN);
+  double const margin     = signal - background;
+
+  return margin;
+}
+
+
+/* Generic splice scoring functions: new_splice(), splice_donor(), splice_donor_uni(),
+ splice_acceptor(), splice_acceptor_uni(), splice_init() */
+
+
+/* Allocate a splice_t with ckalloc(), fill every field from the arguments
+   (c goes into the type field) and link it in front of next.  Returns the
+   new node. */
+Sim4::splice_t *
+Sim4::new_splice(char c, int xs, int xe, int ys, int ye, double score, splice_t *next)
+{
+  splice_t *node = (splice_t *)ckalloc(sizeof(splice_t));
+
+  node->type  = c;
+  node->xs    = xs;
+  node->xe    = xe;
+  node->ys    = ys;
+  node->ye    = ye;
+  node->score = score;
+  node->next  = next;
+
+  return node;
+}
+
+/* One-time initialisation of the splice model data.
+
+   spl_model : which splice model is in use.  Only SPLICE_GENESPLICER and
+               SPLICE_GLIMMER need data loaded; for any other value the
+               function just sets spliceInit and returns.
+
+   The file-level flag spliceInit records that initialisation is done.  It is
+   tested before the mutex is taken and again after, so the load runs once
+   while later calls return without locking.
+   NOTE(review): spliceInit is a plain static int that is read, and in the
+   no-model branch written, without holding the mutex -- confirm this is
+   acceptable on the targeted platforms. */
+void
+Sim4::splice_init(int spl_model)
+{
+ // Fast path: already initialised.
+ if (spliceInit)
+ return;
+
+#if 0
+ // Enable this to generate the spl_encode and rev_compl data
+ // initialized at the top of this file.
+
+ for (int i=0; i<256; spl_encode[i]=0, rev_compl[i]='T', i++)
+ ;
+
+ spl_encode[(int)'A'] = spl_encode[(int)'a'] = 0;
+ spl_encode[(int)'C'] = spl_encode[(int)'c'] = 1;
+ spl_encode[(int)'G'] = spl_encode[(int)'g'] = 2;
+ spl_encode[(int)'T'] = spl_encode[(int)'t'] = 3;
+
+ rev_compl[(int)'A'] = rev_compl[(int)'a'] = 'T';
+ rev_compl[(int)'C'] = rev_compl[(int)'c'] = 'G';
+ rev_compl[(int)'G'] = rev_compl[(int)'g'] = 'C';
+ rev_compl[(int)'T'] = rev_compl[(int)'t'] = 'A';
+
+ for (int i=0; i<256; i++)
+ fprintf(stdout, "%2d, ", spl_encode[i]);
+ fprintf(stdout, "\n");
+ for (int i=0; i<256; i++)
+ fprintf(stdout, "%2d, ", rev_compl[i]);
+ fprintf(stdout, "\n");
+
+ exit(1);
+#endif
+
+ // Models other than GeneSplicer and Glimmer have nothing to load.
+ if ((spl_model != SPLICE_GENESPLICER) &&
+ (spl_model != SPLICE_GLIMMER)) {
+ spliceInit = 1;
+ return;
+ }
+
+ // This really needs to be moved out of the Sim4 class. Sim4 should take as a parameter
+ // the model to use, which should be initialized by the client -- before it starts doing
+ // any sim4 work.
+
+ // Repeats the test made at the top of the function.
+ if (spliceInit == 1)
+ // Data already loaded, no need to involve a mutex here.
+ return;
+
+ pthread_mutex_lock(&(globalParams->_splice_mutex));
+
+ // If after getting the mutex the data still isn't loaded, load it. Otherwise, someone
+ // already loaded the data for us and we just exit.
+
+ if (spliceInit == 0) {
+ if (spl_model == SPLICE_GENESPLICER)
+ loadGeneSplicerModel();
+
+ if (spl_model == SPLICE_GLIMMER)
+ loadGlimmerModel(Glimmer_TRAIN_DIR);
+
+ // Set only after the load, while still holding the mutex.
+ spliceInit = 1;
+ }
+
+ pthread_mutex_unlock(&(globalParams->_splice_mutex));
+}
+
+/* Forward scan for the donor side, scoring two signals (G and C) at once.
+
+   Row i of a unit-cost edit-distance table between the first i characters of
+   yseq and every prefix of xseq (lengths 0..M) is kept in dist[]; matches[]
+   holds the number of matching characters on the path chosen for each cell.
+   For every i in 0..N the function records
+       max over j in 0..M of  (int)(stepct(j)*matches/(matches+dist)*100)
+                              + 100*score[j]
+   and the j that attains it (first j wins on ties).  Row 0 uses the score
+   term alone.
+
+   gt_score, ct_score : per-position scores, read at indices 0..M.
+   max_Gf, start_Gi   : receive ckalloc'ed arrays of N+1 best values / best j
+                        for gt_score; max_Cf, start_Ci likewise for ct_score.
+                        They are not freed here.  */
+void
+Sim4::splice_donor(char *xseq, char *yseq, int M, int N, double *gt_score,
+                   double *ct_score, double **max_Gf, double **max_Cf,
+                   int **start_Gi, int **start_Ci)
+{
+  int    *dist    = (int *)ckalloc((M+1)*sizeof(int));
+  int    *matches = (int *)ckalloc((M+1)*sizeof(int));
+  double *bestG   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_Gf = bestG;
+  int    *argG    = (int *)ckalloc((N+1)*sizeof(int));
+  *start_Gi = argG;
+  double *bestC   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_Cf = bestC;
+  int    *argC    = (int *)ckalloc((N+1)*sizeof(int));
+  *start_Ci = argC;
+
+  /* Row 0: j deletions, no matches. */
+  for (int j=0; j<=M; j++) {
+    dist[j]    = j;
+    matches[j] = 0;
+  }
+
+  bestG[0] = -999999;
+  bestC[0] = -999999;
+  for (int j=0; j<=M; j++) {
+    double const gOnly = 100*gt_score[j];
+    double const cOnly = 100*ct_score[j];
+
+    if (gOnly > bestG[0]) {
+      bestG[0] = gOnly;
+      argG[0]  = j;
+    }
+    if (cOnly > bestC[0]) {
+      bestC[0] = cOnly;
+      argC[0]  = j;
+    }
+  }
+
+  for (int i=1; i<=N; i++) {
+    /* diag* is cell (i-1, j-1); left* is cell (i, j-1). */
+    int diag   = dist[0];
+    int diagX  = matches[0];
+    int left   = ++dist[0];
+    int leftX  = matches[0];
+
+    for (int j=1; j<=M; j++) {
+      int const same = (yseq[i-1] == xseq[j-1]);
+      int const up   = dist[j] + 1;
+      int const sub  = diag + !same;
+      int const ins  = left + 1;
+      int const best = MIN(MIN(up, sub), ins);
+
+      /* Preference on ties: from the left, then from above, then diagonal. */
+      if (best != ins) {
+        if (best == up)
+          leftX = matches[j];
+        else
+          leftX = diagX + same;
+      }
+
+      diag       = dist[j];
+      diagX      = matches[j];
+      left       = best;
+      dist[j]    = best;
+      matches[j] = leftX;
+    }
+
+    /* compute max_Gf and max_Cf */
+    bestG[i] = -999999;
+    bestC[i] = -999999;
+    for (int j=0; j<=M; j++) {
+      assert(matches[j]+dist[j]!=0);
+
+      double const identity = (int)(stepct(j)*matches[j]/(double)(matches[j]+dist[j])*100);
+      double const viaG     = identity + 100*gt_score[j];
+      double const viaC     = identity + 100*ct_score[j];
+
+      if (viaG > bestG[i]) {
+        bestG[i] = viaG;
+        argG[i]  = j;
+      }
+      if (viaC > bestC[i]) {
+        bestC[i] = viaC;
+        argC[i]  = j;
+      }
+    }
+  }
+
+  ckfree(dist);
+  ckfree(matches);
+}
+
+/* Single-signal variant of the forward donor scan.
+
+   dist[] is the current row of a unit-cost edit-distance table between the
+   first i characters of yseq and every prefix of xseq (lengths 0..M);
+   matches[] counts matching characters on the chosen path.  For every i in
+   0..N the function records
+       max over j in 0..M of  (int)(stepct(j)*matches/(matches+dist)*100)
+                              + 100*It_score[j]
+   and the j that attains it (first j wins on ties).  Row 0 uses the score
+   term alone.
+
+   It_score         : per-position scores, read at indices 0..M.
+   max_If, start_Ii : receive ckalloc'ed arrays of N+1 best values / best j.
+                      They are not freed here.  */
+void
+Sim4::splice_donor_uni(char *xseq, char *yseq, int M, int N,
+                       double *It_score, double **max_If, int **start_Ii)
+{
+  int    *dist    = (int *)ckalloc((M+1)*sizeof(int));
+  int    *matches = (int *)ckalloc((M+1)*sizeof(int));
+  double *bestI   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_If = bestI;
+  int    *argI    = (int *)ckalloc((N+1)*sizeof(int));
+  *start_Ii = argI;
+
+  /* Row 0: j deletions, no matches. */
+  for (int j=0; j<=M; j++) {
+    dist[j]    = j;
+    matches[j] = 0;
+  }
+
+  bestI[0] = -999999;
+  for (int j=0; j<=M; j++) {
+    double const scoreOnly = 100*It_score[j];
+
+    if (scoreOnly > bestI[0]) {
+      bestI[0] = scoreOnly;
+      argI[0]  = j;
+    }
+  }
+
+  for (int i=1; i<=N; i++) {
+    /* diag* is cell (i-1, j-1); left* is cell (i, j-1). */
+    int diag   = dist[0];
+    int diagX  = matches[0];
+    int left   = ++dist[0];
+    int leftX  = matches[0];
+
+    for (int j=1; j<=M; j++) {
+      int const same = (yseq[i-1] == xseq[j-1]);
+      int const up   = dist[j] + 1;
+      int const sub  = diag + !same;
+      int const ins  = left + 1;
+      int const best = MIN(MIN(up, sub), ins);
+
+      /* Preference on ties: from the left, then from above, then diagonal. */
+      if (best != ins) {
+        if (best == up)
+          leftX = matches[j];
+        else
+          leftX = diagX + same;
+      }
+
+      diag       = dist[j];
+      diagX      = matches[j];
+      left       = best;
+      dist[j]    = best;
+      matches[j] = leftX;
+    }
+
+    /* compute max_If */
+    bestI[i] = -999999;
+    for (int j=0; j<=M; j++) {
+      assert(matches[j]+dist[j]!=0);
+
+      double const total = (int)(stepct(j)*matches[j]/(double)(matches[j]+dist[j])*100)+100*It_score[j];
+
+      if (total > bestI[i]) {
+        bestI[i] = total;
+        argI[i]  = j;
+      }
+    }
+  }
+
+  ckfree(dist);
+  ckfree(matches);
+}
+
+
+/* Backward scan for the acceptor side, scoring two signals (G and C) at once.
+
+   Mirror image of the donor scan: dist[j] is the unit-cost edit distance
+   between the suffix of yseq starting at i and the suffix of xseq starting
+   at j, and matches[j] counts matching characters on the chosen path.  For
+   every i in N..0 the function records
+       max over j in M..0 of  (int)(stepct(M-j)*matches/(dist+matches)*100)
+                              + 100*score[j]
+   and stores j+1 for the j that attains it (largest j wins on ties).
+   Row N uses the score term alone.
+
+   ag_score, ac_score : per-position scores, read at indices 0..M.
+   max_Gb, end_Gi     : receive ckalloc'ed arrays of N+1 best values / j+1
+                        for ag_score; max_Cb, end_Ci likewise for ac_score.
+                        They are not freed here.  */
+void
+Sim4::splice_acceptor(char *xseq, char *yseq, int M, int N,
+                      double *ag_score, double *ac_score, double **max_Gb,
+                      double **max_Cb, int **end_Gi, int **end_Ci)
+{
+  int    *dist    = (int *)ckalloc((M+1)*sizeof(int));
+  int    *matches = (int *)ckalloc((M+1)*sizeof(int));
+  double *bestG   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_Gb = bestG;
+  int    *argG    = (int *)ckalloc((N+1)*sizeof(int));
+  *end_Gi = argG;
+  double *bestC   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_Cb = bestC;
+  int    *argC    = (int *)ckalloc((N+1)*sizeof(int));
+  *end_Ci = argC;
+
+  /* Row N: M-j deletions, no matches. */
+  for (int j=M; j>=0; j--) {
+    dist[j]    = M-j;
+    matches[j] = 0;
+  }
+
+  bestG[N] = -999999;
+  bestC[N] = -999999;
+  for (int j=M; j>=0; j--) {
+    double const gOnly = 100*ag_score[j];
+    double const cOnly = 100*ac_score[j];
+
+    if (gOnly > bestG[N]) {
+      bestG[N] = gOnly;
+      argG[N]  = j+1;
+    }
+    if (cOnly > bestC[N]) {
+      bestC[N] = cOnly;
+      argC[N]  = j+1;
+    }
+  }
+
+  for (int i=N-1; i>=0; i--) {
+    /* diag* is cell (i+1, j+1); next* is cell (i, j+1). */
+    int diag   = dist[M];
+    int diagX  = matches[M];
+    int next   = ++dist[M];
+    int nextX  = matches[M];
+
+    for (int j=M-1; j>=0; j--) {
+      int const same = (yseq[i] == xseq[j]);
+      int const down = dist[j] + 1;
+      int const sub  = diag + !same;
+      int const ins  = next + 1;
+      int const best = MIN(MIN(down, sub), ins);
+
+      /* Preference on ties: from j+1, then from row i+1, then diagonal. */
+      if (best != ins) {
+        if (best == down)
+          nextX = matches[j];
+        else
+          nextX = diagX + same;
+      }
+
+      diag       = dist[j];
+      diagX      = matches[j];
+      next       = best;
+      dist[j]    = best;
+      matches[j] = nextX;
+    }
+
+    /* compute max_Gb and max_Cb */
+    bestG[i] = -999999;
+    bestC[i] = -999999;
+    for (int j=M; j>=0; j--) {
+      assert(dist[j]+matches[j]!=0);
+
+      double const identity = (int)(stepct(M-j)*matches[j]/(double)(dist[j]+matches[j])*100);
+      double const viaG     = identity + 100*ag_score[j];
+      double const viaC     = identity + 100*ac_score[j];
+
+      if (viaG > bestG[i]) {
+        bestG[i] = viaG;
+        argG[i]  = j+1;
+      }
+      if (viaC > bestC[i]) {
+        bestC[i] = viaC;
+        argC[i]  = j+1;
+      }
+    }
+  }
+
+  ckfree(dist);
+  ckfree(matches);
+}
+
+
+/* Single-signal variant of the backward acceptor scan.
+
+   dist[j] is the unit-cost edit distance between the suffix of yseq starting
+   at i and the suffix of xseq starting at j; matches[j] counts matching
+   characters on the chosen path.  For every i in N..0 the function records
+       max over j in M..0 of  (int)(stepct(M-j)*matches/(dist+matches)*100)
+                              + 100*aI_score[j]
+   and stores j+1 for the j that attains it (largest j wins on ties).
+   Row N uses the score term alone.
+
+   aI_score       : per-position scores, read at indices 0..M.
+   max_Ib, end_Ii : receive ckalloc'ed arrays of N+1 best values / j+1.
+                    They are not freed here.  */
+void
+Sim4::splice_acceptor_uni(char *xseq, char *yseq, int M, int N,
+                          double *aI_score, double **max_Ib, int **end_Ii)
+{
+  int    *dist    = (int *)ckalloc((M+1)*sizeof(int));
+  int    *matches = (int *)ckalloc((M+1)*sizeof(int));
+  double *bestI   = (double *)ckalloc((N+1)*sizeof(double));
+  *max_Ib = bestI;
+  int    *argI    = (int *)ckalloc((N+1)*sizeof(int));
+  *end_Ii = argI;
+
+  /* Row N: M-j deletions, no matches. */
+  for (int j=M; j>=0; j--) {
+    dist[j]    = M-j;
+    matches[j] = 0;
+  }
+
+  bestI[N] = -999999;
+  for (int j=M; j>=0; j--) {
+    double const scoreOnly = 100*aI_score[j];
+
+    if (scoreOnly > bestI[N]) {
+      bestI[N] = scoreOnly;
+      argI[N]  = j+1;
+    }
+  }
+
+  for (int i=N-1; i>=0; i--) {
+    /* diag* is cell (i+1, j+1); next* is cell (i, j+1). */
+    int diag   = dist[M];
+    int diagX  = matches[M];
+    int next   = ++dist[M];
+    int nextX  = matches[M];
+
+    for (int j=M-1; j>=0; j--) {
+      int const same = (yseq[i] == xseq[j]);
+      int const down = dist[j] + 1;
+      int const sub  = diag + !same;
+      int const ins  = next + 1;
+      int const best = MIN(MIN(down, sub), ins);
+
+      /* Preference on ties: from j+1, then from row i+1, then diagonal. */
+      if (best != ins) {
+        if (best == down)
+          nextX = matches[j];
+        else
+          nextX = diagX + same;
+      }
+
+      diag       = dist[j];
+      diagX      = matches[j];
+      next       = best;
+      dist[j]    = best;
+      matches[j] = nextX;
+    }
+
+    /* compute max_Ib */
+    bestI[i] = -999999;
+    for (int j=M; j>=0; j--) {
+      assert(dist[j]+matches[j]!=0);
+
+      double const total = (int)(stepct(M-j)*matches[j]/(double)(dist[j]+matches[j])*100)+100*aI_score[j];
+
+      if (total > bestI[i]) {
+        bestI[i] = total;
+        argI[i]  = j+1;
+      }
+    }
+  }
+
+  ckfree(dist);
+  ckfree(matches);
+}
+
+
+
+void
+Sim4::splice(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ splice_t **gcell, splice_t **ccell, int ori, int spl_model)
+{
+ double *gtscore=NULL, *ctscore=NULL, *agscore=NULL, *acscore=NULL;
+ int i;
+ double tmpf, maxCscore, maxGscore;
+ int Gxs, Gxe, Gy, Cxs, Cxe, Cy;
+ double *max_Cf=NULL, *max_Gf=NULL, *max_Cb=NULL, *max_Gb=NULL;
+ int *start_Gi=NULL, *start_Ci=NULL, *end_Gi=NULL, *end_Ci=NULL;
+ char *nsegmentL=NULL, *nsegmentR=NULL, *asegmentL=NULL, *asegmentR=NULL;
+ // Picks the best intron placement between the donor window [ls,us] and the acceptor window [le,ue] of in_seqx.
+ // Initialize the encoding. This isn't quite as wonderful as
+ // it should be, as there is a chance that two different threads
+ // could initialize the encoding twice, but then again,
+ // it doesn't matter.
+ //
+ // Outputs: *gcell and *ccell always receive a freshly allocated cell from new_splice(), whatever 'ori' is.
+//splice_init(spl_model); LLL
+ // Scratch buffers for the encoded genomic windows; released at the end of this function.
+ nsegmentL = (char *) ckalloc(2*MAX_SPAN + 2*MAX_SLIDE + 3);
+ nsegmentR = (char *) ckalloc(2*MAX_SPAN + 2*MAX_SLIDE + 3);
+
+ /* Obs: for Glimmer scoring, need only remember the reverse complemented
+ segments; but for now we allocate two arrays */
+ if (spl_model==SPLICE_GLIMMER) {
+ asegmentL = (char *) ckalloc(2*MAX_SPAN + 2*MAX_SLIDE + 3);
+ asegmentR = (char *) ckalloc(2*MAX_SPAN + 2*MAX_SLIDE + 3);
+ }
+ // Donor and acceptor signal scores share one allocation per orientation: ag/ac start us-ls+2 entries in.
+ if (ori==FWD || ori==BOTH) {
+ gtscore = (double *)ckalloc(((us-ls+2)+(ue-le+2))*sizeof(double));
+ agscore = gtscore+(us-ls+2);
+ }
+ if (ori==BWD || ori==BOTH) {
+ ctscore = (double *)ckalloc(((us-ls+2)+(ue-le+2))*sizeof(double));
+ acscore = ctscore+(us-ls+2);
+ }
+ // Fill the signal score arrays with the selected model; unknown models fall back to splice_original().
+ switch (spl_model) {
+ case SPLICE_ORIGINAL:
+ splice_original(in_seqx,ls,us,le,ue,in_seqy,ys,ye,gtscore,agscore,ctscore,acscore,ori,nsegmentL,nsegmentR);
+ break;
+
+ case SPLICE_GENESPLICER:
+ splice_GeneSplicer(in_seqx,ls,us,le,ue,in_seqy,ys,ye,gtscore,agscore,ctscore,acscore,ori,nsegmentL,nsegmentR);
+ break;
+
+ case SPLICE_GLIMMER:
+ splice_Glimmer(in_seqx,ls,us,le,ue,in_seqy,ys,ye,gtscore,agscore,ctscore,acscore,ori,nsegmentL,nsegmentR,asegmentL,asegmentR);
+ break;
+
+ default:
+ fprintf(stderr, "Unrecognized splice model (%d). Using original.\n", spl_model);
+ splice_original(in_seqx,ls,us,le,ue,in_seqy,ys,ye,gtscore,agscore,ctscore,acscore,ori,nsegmentL,nsegmentR);
+ break;
+ }
+
+ // Per-row maxima: the *_uni helpers handle one orientation, splice_donor/splice_acceptor handle both at once.
+ if (ori==FWD) {
+ splice_donor_uni(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1,
+ gtscore, &max_Gf, &start_Gi);
+ splice_acceptor_uni(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1,
+ agscore, &max_Gb, &end_Gi);
+ ckfree(gtscore); /* ckfree(agscore) */
+
+ } else if (ori==BWD) {
+ splice_donor_uni(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1,
+ ctscore, &max_Cf, &start_Ci);
+ splice_acceptor_uni(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1,
+ acscore, &max_Cb, &end_Ci);
+ ckfree(ctscore); /* ckfree(acscore) */
+
+ } else {
+ splice_donor(in_seqx+ls-1, in_seqy+ys-1, us-ls+1, ye-ys+1,
+ gtscore, ctscore, &max_Gf, &max_Cf, &start_Gi, &start_Ci);
+ splice_acceptor(in_seqx+le-1, in_seqy+ys-1, ue-le+1, ye-ys+1,
+ agscore, acscore, &max_Gb, &max_Cb, &end_Gi, &end_Ci);
+ ckfree(gtscore); /* ckfree(agscore); */
+ ckfree(ctscore); /* ckfree(acscore); */
+ }
+
+#if 0
+ for (i=0; i<=ye-ys+1; i++) {
+ fprintf(stderr, "%3d: max_Gf=%1.6f max_Cf=%1.6f max_Gb=%1.6f max_Cb=%1.6f\n",
+ i,
+ max_Gf[i], max_Cf[i], max_Gb[i], max_Cb[i]);
+ }
+#endif
+ // Choose the row i maximizing donor+acceptor score; coordinates stay -1 for an orientation that was not evaluated.
+ maxCscore = -999999; maxGscore = -999999;
+ Gxs = Gxe = Gy = Cxs = Cxe = Cy = -1;
+ if (ori==FWD || ori==BOTH) {
+ for (i=0; i<=ye-ys+1; i++) {
+ if ((tmpf=max_Gf[i]+max_Gb[i])>maxGscore) {
+ maxGscore = tmpf;
+ /* save (i, start_Gi[i], end_Gi[i]); */
+ Gxs = ls+start_Gi[i]-1; Gxe = le+end_Gi[i]-1; Gy = ys+i-1;
+ }
+ }
+ ckfree(max_Gf); ckfree(max_Gb);
+ ckfree(start_Gi); ckfree(end_Gi);
+ }
+ if (ori==BWD || ori==BOTH) {
+ for (i=0; i<=ye-ys+1; i++) {
+ if ((tmpf=max_Cf[i]+max_Cb[i])>maxCscore) {
+ maxCscore = tmpf;
+ /* save (i, start_Ci[i], end_Ci[i]); */
+ Cxs = ls+start_Ci[i]-1; Cxe = le+end_Ci[i]-1; Cy = ys+i-1;
+ }
+ }
+ ckfree(max_Cf); ckfree(max_Cb);
+ ckfree(start_Ci); ckfree(end_Ci);
+ }
+
+#if 0
+ fprintf(stderr, "%8d %8d %8d %8d %8f\n%8d %8d %8d %8d %f\n",
+ Gxs, Gxe, Gy, Gy+1, maxGscore,
+ Cxs, Cxe, Cy, Cy+1, maxCscore);
+#endif
+ // Both cells are always created; callers in this file release them with ckfree().
+ *gcell = new_splice('G', Gxs, Gxe, Gy, Gy+1, maxGscore, NULL);
+ *ccell = new_splice('C', Cxs, Cxe, Cy, Cy+1, maxCscore, NULL);
+
+#ifdef DEBUG
+ printf("Type: %c sx: %d se: %d ys: %d score: %d\n",
+ (*gcell)->type, (*gcell)->xs, (*gcell)->xe, (*gcell)->ys, (int)(*gcell)->score); // gcell is splice_t**; score cast to match %d
+
+ printf("Type: %c sx: %d se: %d ys: %d score: %d\n",
+ (*ccell)->type, (*ccell)->xs, (*ccell)->xe, (*ccell)->ys, (int)(*ccell)->score); // NOTE(review): field name 'type' is not visible here; confirm against splice_t
+#endif
+
+ ckfree(nsegmentL); ckfree(nsegmentR);
+
+ if (spl_model==SPLICE_GLIMMER) {
+ ckfree(asegmentL); ckfree(asegmentR);
+ }
+
+ return;
+}
+
+
+/* Customized splice signal scoring functions:
+ splice_original(), splice_GeneSplicer(), splice_Glimmer() */
+
+void
+Sim4::splice_original(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR)
+{
+ int p, q, i;
+ char *s,*t, ch;
+ // Dinucleotide-table splice scoring: fills gtscore[0..us-ls+1]/agscore[0..ue-le+1] for FWD/BOTH and
+ // ctscore/acscore (same index ranges) for BWD/BOTH. in_seqy, ys and ye are not read in this function.
+ /* changed MAX_SPAN to S4_SPAN; see main fix to out of bounds problems in util.C */
+ for (i=0, s=in_seqx+ls-S4_SPAN-1; i<2*S4_SPAN+us-ls+3; nsegmentL[i++] = spl_encode[(int)(*s++)]); // encode the donor window plus S4_SPAN of flank on each side
+ for (i=0, s=in_seqx+le-2-S4_SPAN-1; i<2*S4_SPAN+ue-le+3; nsegmentR[i++] = spl_encode[(int)(*s++)]); // same for the acceptor window, starting two characters earlier
+
+ if (ori==FWD || ori==BOTH) {
+ // With _dontForceCanonicalSplicing every position scores 0; otherwise look up the encoded pair in gt[][] / ag[][].
+ if (globalParams->_dontForceCanonicalSplicing) {
+ for (p=0, s=nsegmentL+S4_SPAN; p<=us-ls+1; p++, s++)
+ gtscore[p] = 0;
+ for (q=ue-le+1, s=nsegmentR+S4_SPAN+ue-le+2; q>=0; q--, s--)
+ agscore[q] = 0;
+ } else {
+ for (p=0, s=nsegmentL+S4_SPAN; p<=us-ls+1; p++, s++)
+ gtscore[p] = gt[(int)*s][(int)*(s+1)]; // pair starting at s
+ for (q=ue-le+1, s=nsegmentR+S4_SPAN+ue-le+2; q>=0; q--, s--)
+ agscore[q] = ag[(int)*(s-1)][(int)*s]; // pair ending at s
+ }
+ }
+
+
+ if (ori==BWD || ori==BOTH) {
+ // The segments are reverse-complemented in place, so this must run after the forward block has read them.
+ /* reverse complement the nsegments, 0-3 alphabet */
+ for (s=nsegmentL, t=nsegmentL+2*S4_SPAN+us-ls+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+ for (s=nsegmentR, t=nsegmentR+2*S4_SPAN+ue-le+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+ // On the reversed strand the left window is scored with ag[][] and the right window with gt[][].
+ if (globalParams->_dontForceCanonicalSplicing) {
+ for (p=0, s=nsegmentL+S4_SPAN+us-ls+2; p<=us-ls+1; p++, s++) // s is never dereferenced in this branch
+ ctscore[p] = 0;
+ for (q=ue-le+1, s=nsegmentR+S4_SPAN; q>=0; q--, s--) // s is never dereferenced in this branch
+ acscore[q] = 0;
+ } else {
+ for (p=0, s=nsegmentL+S4_SPAN+us-ls+2; p<=us-ls+1; p++, s--)
+ ctscore[p] = ag[(int)*(s-1)][(int)*s];
+ for (q=ue-le+1, s=nsegmentR+S4_SPAN; q>=0; q--, s++)
+ acscore[q] = gt[(int)*s][(int)*(s+1)];
+ }
+ }
+
+ return;
+}
+
+
+void
+Sim4::splice_GeneSplicer(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR)
+{
+ int p, q, i;
+ char *s,*t, ch;
+ // GeneSplicer-based splice scoring. Each output is 0.4 * (model score clamped and rescaled to 0..5)
+ // plus 0.6 * the gt[][] / ag[][] table value. Index ranges: donor arrays 0..us-ls+1, acceptor arrays 0..ue-le+1.
+ // in_seqy, ys and ye are not read in this function.
+ /* changed MAX_SPAN to GENESPLICER_SPAN; see main fix to out of bounds problems in util.C */
+ for (i=0, s=in_seqx+ls-GENESPLICER_SPAN-1; i<2*GENESPLICER_SPAN+us-ls+3; nsegmentL[i++] = spl_encode[(int)(*s++)]);
+ for (i=0, s=in_seqx+le-2-GENESPLICER_SPAN-1; i<2*GENESPLICER_SPAN+ue-le+3; nsegmentR[i++] = spl_encode[(int)(*s++)]);
+
+
+ if (ori==FWD || ori==BOTH) {
+
+ for (p=0, s=nsegmentL+GENESPLICER_SPAN; p<=us-ls+1; p++, s++) {
+ gtscore[p] = ScoreDonor_GeneSplicer(s-GENESPLICER_SPAN);
+ // clamp to [-14, 19], map linearly onto [0, 5], then blend with the table score
+ if (gtscore[p] < -14) gtscore[p] = -14.0;
+ if (gtscore[p] > 19) gtscore[p] = 19;
+ gtscore[p] = 5.0*(gtscore[p]+14.0)/33.0;
+ gtscore[p] = 0.4*gtscore[p] + 0.6*gt[(int)*s][(int)*(s+1)];
+ }
+ for (q=ue-le+1, s=nsegmentR+GENESPLICER_SPAN+ue-le+2; q>=0; q--, s--) {
+ agscore[q] = ScoreAcceptor_GeneSplicer(s-GENESPLICER_SPAN-1);
+ // clamp to [-23, 20], map linearly onto [0, 5], then blend with the table score
+ if (agscore[q] < -23) agscore[q] = -23.0;
+ if (agscore[q] > 20) agscore[q] = 20.0;
+ agscore[q] = 5.0*(agscore[q]+23.0)/43.0;
+ agscore[q] = 0.4*agscore[q] + 0.6*ag[(int)*(s-1)][(int)*s];
+ }
+
+#if 0
+ printf("gtscore:"); for (p=0; p<=us-ls+1; p++) printf(" %f", gtscore[p]); printf("\n");
+ printf("agscore:"); for (q=ue-le+1; q>=0; q--) printf(" %f", agscore[q]); printf("\n");
+#endif
+ }
+
+ if (ori==BWD || ori==BOTH) {
+ // The segments are reversed in place, so this block must follow the forward block.
+ /* reverse complement the nsegments, 0-3 alphabet */
+ for (s=nsegmentL, t=nsegmentL+2*GENESPLICER_SPAN+us-ls+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+ for (s=nsegmentR, t=nsegmentR+2*GENESPLICER_SPAN+ue-le+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+
+ // On the reversed strand the left window is scored as an acceptor and the right window as a donor.
+ for (p=0, s=nsegmentL+GENESPLICER_SPAN+us-ls+2; p<=us-ls+1; p++, s--) {
+ ctscore[p] = ScoreAcceptor_GeneSplicer(s-GENESPLICER_SPAN-1);
+
+
+ if (ctscore[p] < -23) ctscore[p] = -23.0;
+ if (ctscore[p] > 20) ctscore[p] = 20.0;
+ ctscore[p] = 5.0*(ctscore[p]+23.0)/43.0;
+ ctscore[p] = 0.4*ctscore[p] + 0.6*ag[(int)*(s-1)][(int)*s];
+ }
+ for (q=ue-le+1, s=nsegmentR+GENESPLICER_SPAN; q>=0; q--, s++) {
+ acscore[q] = ScoreDonor_GeneSplicer(s-GENESPLICER_SPAN);
+
+
+ if (acscore[q] < -14) acscore[q] = -14.0;
+ if (acscore[q] > 19) acscore[q] = 19.0;
+ acscore[q] = 5.0*(acscore[q]+14.0)/33.0;
+ acscore[q] = 0.4*acscore[q] + 0.6*gt[(int)*s][(int)*(s+1)];
+ }
+#if 0
+ printf("ctscore:"); for (p=0; p<=us-ls+1; p++) printf(" %f", ctscore[p]); printf("\n");
+ printf("acscore:"); for (q=ue-le+1; q>=0; q--) printf(" %f", acscore[q]); printf("\n");
+#endif
+ }
+
+ return;
+}
+
+
+void
+Sim4::splice_Glimmer(char *in_seqx, int ls, int us, int le, int ue,
+ char *in_seqy, int ys, int ye,
+ double *gtscore, double *agscore,
+ double *ctscore, double *acscore, int ori,
+ char *nsegmentL, char *nsegmentR, char *asegmentL, char *asegmentR)
+{
+ int p, q, i;
+ char *s,*t, ch;
+ // Glimmer-based splice scoring. Each output is 0.2 * (model score clamped and rescaled to 0..5)
+ // plus 0.8 * the gt[][] / ag[][] table value. in_seqy, ys and ye are not read in this function.
+ /* changed MAX_SPAN to GLIMMER_SPAN; see also main fix to out of bounds problems in util.C */
+ for (i=0, s=in_seqx+ls-GLIMMER_SPAN-1; i<2*GLIMMER_SPAN+us-ls+3; nsegmentL[i++] = spl_encode[(int)(*s++)]);
+ for (i=0, s=in_seqx+le-2-GLIMMER_SPAN-1; i<2*GLIMMER_SPAN+ue-le+3; nsegmentR[i++] = spl_encode[(int)(*s++)]);
+
+ // asegmentL/R hold the same windows as raw characters (no spl_encode); these feed the Glimmer scorers.
+ /* Glimmer specific matrices */
+ for (i=0, s=in_seqx+ls-GLIMMER_SPAN-1; i<2*GLIMMER_SPAN+us-ls+3; asegmentL[i++] = *s++);
+ for (i=0, s=in_seqx+le-2-GLIMMER_SPAN-1; i<2*GLIMMER_SPAN+ue-le+3; asegmentR[i++] = *s++);
+
+ if (ori==FWD || ori==BOTH) {
+ // s walks the encoded segment (table lookup) while t walks the raw segment (model score) in lockstep.
+ for (p=0, s=nsegmentL+GLIMMER_SPAN, t=asegmentL+GLIMMER_SPAN; p<=us-ls+1; p++, s++, t++) {
+ gtscore[p] = ScoreDonor_Glimmer(t-GLIMMER_XSPAN, Glimmer_TRAIN_DIR);
+ // clamp to [0, 0.31], map linearly onto [0, 5], then blend with the table score
+ if (gtscore[p] < 0) gtscore[p] = 0.0;
+ if (gtscore[p] > 0.31) gtscore[p] = 0.31;
+ gtscore[p] = 5.0*(gtscore[p]+0.0)/0.31;
+ gtscore[p] = 0.2*gtscore[p] + 0.8*gt[(int)*s][(int)*(s+1)];
+ }
+ for (q=ue-le+1, s=nsegmentR+GLIMMER_SPAN+ue-le+2, t=asegmentR+GLIMMER_SPAN+ue-le+2; q>=0; q--, s--, t--) {
+ agscore[q] = ScoreAcceptor_Glimmer(t-GLIMMER_ISPAN-1, Glimmer_TRAIN_DIR);
+
+ // clamp to [-0.16, 0.23], map linearly onto [0, 5], then blend with the table score
+ if (agscore[q] < -0.16) agscore[q] = -0.16;
+ if (agscore[q] > 0.23) agscore[q] = 0.23;
+ agscore[q] = 5.0*(agscore[q]+0.16)/0.39;
+ agscore[q] = 0.2*agscore[q] + 0.8*ag[(int)*(s-1)][(int)*s];
+ }
+ }
+
+
+ if (ori==BWD || ori==BOTH) {
+ // All four segments are reversed in place, so this block must follow the forward block.
+ /* reverse complement the nsegments, 0-3 alphabet */
+ for (s=nsegmentL, t=nsegmentL+2*GLIMMER_SPAN+us-ls+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+ for (s=nsegmentR, t=nsegmentR+2*GLIMMER_SPAN+ue-le+3-1; s<t; s++, t--)
+ { ch = 3-(*s); *s = 3-(*t); *t = ch; }
+
+
+
+
+ /* reverse complement the asegments, ACTG alphabet */
+ for (s=asegmentL, t=asegmentL+2*GLIMMER_SPAN+us-ls+3-1; s<t; s++, t--)
+ { ch = rev_compl[(int)*s]; *s = rev_compl[(int)*t]; *t = ch; }
+ for (s=asegmentR, t=asegmentR+2*GLIMMER_SPAN+ue-le+3-1; s<t; s++, t--)
+ { ch = rev_compl[(int)*s]; *s = rev_compl[(int)*t]; *t = ch; }
+
+ // On the reversed strand the left window is scored as an acceptor and the right window as a donor.
+ for (p=0, s=nsegmentL+GLIMMER_SPAN+us-ls+2, t=asegmentL+GLIMMER_SPAN+us-ls+2; p<=us-ls+1; p++, s--, t--) {
+ ctscore[p] = ScoreAcceptor_Glimmer(t-GLIMMER_ISPAN-1, Glimmer_TRAIN_DIR);
+
+
+ if (ctscore[p] < -0.16) ctscore[p] = -0.16;
+ if (ctscore[p] > 0.23) ctscore[p] = 0.23;
+ ctscore[p] = 5.0*(ctscore[p]+0.16)/0.39;
+ ctscore[p] = 0.2*ctscore[p] + 0.8*ag[(int)*(s-1)][(int)*s];
+ }
+ for (q=ue-le+1, s=nsegmentR+GLIMMER_SPAN, t=asegmentR+GLIMMER_SPAN; q>=0; q--, s++, t++) {
+ acscore[q] = ScoreDonor_Glimmer(t-GLIMMER_XSPAN, Glimmer_TRAIN_DIR);
+
+
+ if (acscore[q] < 0) acscore[q] = 0.0;
+ if (acscore[q] > 0.31) acscore[q] = 0.31;
+ acscore[q] = 5.0*(acscore[q]+0.0)/0.31;
+ acscore[q] = 0.2*acscore[q] + 0.8*gt[(int)*s][(int)*(s+1)];
+ }
+ }
+
+ return;
+}
+
+void
+Sim4::splice_close () // Tear-down for the splice models: unloads the GeneSplicer sites and clears spliceInit.
+{
+ UnLoadSites_GeneSplicer();
+ // presumably lets a later initialization run again; the reader of spliceInit is not visible here
+ spliceInit = 0;
+}
+
diff --git a/libsim4/sim4core/table.C b/libsim4/sim4core/table.C
new file mode 100644
index 0000000..e846f61
--- /dev/null
+++ b/libsim4/sim4core/table.C
@@ -0,0 +1,174 @@
+#include "sim4.H"
+
+// The position of a mer (word) is the position of the last base
+// (base-based). Note that the sequence starts at position 1.
+//
+// 11111111112
+// 12345678901234567890
+// acgggctactcgaggcta
+//
+// First mer is at position 12.
+//
+
+
+void
+Sim4::add_word(int ecode, int pos) {
+ struct hash_node *h;
+ int hval;
+ // Records that the word 'ecode' occurs at 'pos' in the table currently selected by 'hashtable'.
+ hval = ecode & HASH_SIZE; // HASH_SIZE is used as a bit mask; NOTE(review): only reaches every bucket if HASH_SIZE is 2^k-1, confirm
+
+ // Find the word in the hash table
+ //
+ for (h = hashtable->table[hval]; h; h = h->link)
+ if (h->ecode == ecode)
+ break;
+
+ // Didn't find the word? Add a new one!
+ //
+ if (h == NULL) {
+ h = hashtable->nodes + hashtable->nodesused++; // next unused node from the preallocated pool; no bounds check here
+ h->link = hashtable->table[hval]; // push onto the front of the bucket chain
+ hashtable->table[hval] = h;
+
+ h->ecode = ecode;
+ h->pos = -1; // -1 terminates the per-word position chain
+ }
+
+ // Set the position -- this keeps a list of words from high
+ // position to low position.
+ //
+ hashtable->nextPos[pos] = h->pos; // previous head (an earlier call's pos, or -1) becomes the successor of pos
+ h->pos = pos;
+}
+
+
+void
+Sim4::bld_table(char *s, int len, mss_t MSS, int type) {
+ uint64 ecode;
+ int i, j, masked_ecode;
+ char *t;
+ // Builds (INIT, TEMP) or merely re-selects (PERM) a word hash table for the sequence starting at s+1.
+ if (type == PERM) {
+ mask = (1 << (2*MSS.seedLength-2)) - 1; /* LLL 6/16/10 we are setting this for continuous seeds, where 2*seedLength=matchedLength; no effect if seed is spaced */
+ hashtable = &phashtable;
+ return; // PERM: reuse phashtable as it stands, nothing is rebuilt
+ }
+
+ /* perform initializations */
+ if (type == INIT) {
+ mask = (1 << (2*MSS.seedLength-2)) - 1; /* LLL 6/16/10 we are setting this for continuous seeds, where 2*seedLength=matchedLength; no effect if seed is spaced */
+ // NOTE(review): the shift is done in int; a seedLength of 17 or more would overflow it, confirm the allowed range
+ hashtable = &phashtable;
+
+ if (phashtable.nextPos) {
+ delete [] phashtable.nextPos;
+ delete [] phashtable.nodes;
+ }
+ // one nextPos slot and one node per possible position 0..len
+ phashtable.nextPos = new int [len+1];
+ phashtable.nodes = new struct hash_node [len+1];
+ phashtable.nodesused = 0;
+
+ for (i=0; i<HASH_SIZE+1; ++i)
+ phashtable.table[i] = uint64ZERO;
+ } else if (type == TEMP) {
+ mask = (1 << (2*MSS.seedLength-2)) - 1; /* LLL 6/16/10 we are setting this for continuous seeds, where 2*seedLength=matchedLength; no effect if seed is spaced */
+ // same steps as INIT, applied to thashtable
+ hashtable = &thashtable;
+
+ if (thashtable.nextPos) {
+ delete [] thashtable.nextPos;
+ delete [] thashtable.nodes;
+ }
+
+ thashtable.nextPos = new int [len+1];
+ thashtable.nodes = new struct hash_node [len+1];
+ thashtable.nodesused = 0;
+
+ for (i=0; i<HASH_SIZE+1; ++i)
+ thashtable.table[i] = 0L;
+ } else {
+ fprintf(stderr, "unknown type in bld_table: %d\n", type); // NOTE(review): execution continues and fills whatever 'hashtable' already points to
+ }
+
+ // skip any word containing an N/X
+
+ int emer;
+
+ // This is because seq-1 is passed in
+ //
+ t = s+1;
+
+ if (MSS.type == CONTINUOUS_SEED) {
+ for (i=1; (i<=len) && *t; ) {
+ restart_c: // jumped to whenever a character encodes to a negative value; the partial word is discarded
+ ecode = uint64ZERO;
+ // prime ecode with the first seedLength-1 valid characters
+ for (j=1; (j<MSS.seedLength) && (i<=len) && *t; ++j) {
+ emer = encoding[(int)(*t++)];
+ i++;
+
+ if (emer < 0)
+ goto restart_c;
+
+ ecode <<= 2;
+ ecode |= emer;
+ }
+ // slide one character at a time, emitting a word per step
+ for (; (i<=len) && *t; ) {
+ emer = encoding[(int)(*t++)];
+ i++;
+
+ if (emer < 0)
+ goto restart_c;
+
+ ecode &= mask; // drop the oldest character before appending the new one
+ ecode <<= 2;
+ ecode |= emer;
+
+ add_word(ecode, (int)(t-s-1)); // t-s-1 is the 1-based position of the character just consumed; ecode is narrowed to int
+ }
+ }
+ } else {
+ /* SPACED_SEED */
+ for (i=1; (i<=len) && *t; ) {
+ restart_s: // same restart rule as the continuous case
+ ecode = uint64ZERO;
+
+ for (j=1; (j<MSS.seedLength) && (i<=len) && *t; ++j) {
+ emer = encoding[(int)(*t++)];
+ i++;
+
+ if (emer < 0)
+ goto restart_s;
+
+ ecode <<= 2;
+ ecode |= emer;
+ }
+
+ for (; (i<=len) && *t; ) {
+ emer = encoding[(int)(*t++)];
+ i++;
+
+ if (emer < 0)
+ goto restart_s;
+
+ ecode &= MSS.mask; // spaced seeds use the mask carried in MSS rather than the member 'mask'
+ ecode <<= 2;
+ ecode |= emer;
+
+#if 1
+ // much cheaper...
+ // gather the selected bit groups of ecode into a compact key (accumulated in an int)
+ for (j=masked_ecode=0; j<MSS.masknum; j++)
+ masked_ecode += (ecode & MSS.masks[j]) >> MSS.shifts[j];
+
+ add_word(masked_ecode, (int)(t-s-1));
+#else
+ add_word(masked_shift(ecode), (int)(t-s-1));
+#endif
+ }
+ }
+ }
+}
diff --git a/libsim4/sim4core/util.C b/libsim4/sim4core/util.C
new file mode 100644
index 0000000..d45d3ce
--- /dev/null
+++ b/libsim4/sim4core/util.C
@@ -0,0 +1,832 @@
+#include "sim4.H"
+
+
+// Original call was if (!strncmp(S, "GT", 2)) {}
+// i.e. "are the first two characters of S equal to A followed by B?"; nonzero on a match.
+// S is expanded twice and A/B are not parenthesized, so pass side-effect-free, simple expressions.
+#define DAcmp(S, A, B) (((S)[0] == A) && ((S)[1] == B))
+
+
+void
+Sim4::complement_exons(Exon **left, int M, int N) {
+ Exon *tmp_block, *right;
+ char prev, ch;
+ // Mirrors every exon with nonzero toGEN onto the opposite strand (M and N are used as the two sequence lengths), then reverses the list.
+#ifdef SPLSCORE
+ double spl=0, prevspl=0;
+#endif
+
+ prev = 'U'; /* unknown, should trigger error */
+ tmp_block = *left;
+ while (tmp_block) {
+ if (tmp_block->toGEN) {
+ register int aux;
+ // ori (and splScore) move one exon down the list, with 'G' and 'C' swapped on the way
+ if (tmp_block->next_exon && tmp_block->next_exon->toGEN) {
+ ch = tmp_block->ori;
+ tmp_block->ori = prev;
+
+#ifdef SPLSCORE
+ spl = tmp_block->splScore;
+ tmp_block->splScore = prevspl;
+ prevspl = spl;
+#endif
+
+ switch (ch) {
+ case 'C':
+ prev = 'G';
+ break;
+ case 'G':
+ prev = 'C';
+ break;
+ case 'N':
+ prev = 'N';
+ break;
+ case 'E':
+ prev = 'E';
+ break;
+ default:
+ fatal("sim4b1.c: Inconsistency. Check exon orientation at complementation.");
+ }
+ } else {
+ tmp_block->ori = prev; // last real exon: takes the carried value, nothing further to carry
+#ifdef SPLSCORE
+ tmp_block->splScore = prevspl;
+#endif
+ }
+ aux = tmp_block->frGEN; // coordinate c becomes length+1-c, and from/to trade places
+ tmp_block->frGEN = M+1-tmp_block->toGEN;
+ tmp_block->toGEN = M+1-aux;
+ aux = tmp_block->frEST;
+ tmp_block->frEST = N+1-tmp_block->toEST;
+ tmp_block->toEST = N+1-aux;
+ }
+ tmp_block = tmp_block->next_exon;
+ if (tmp_block && tmp_block->toGEN)
+ right = tmp_block; // tracked here, but flip_list() overwrites *right before reading it
+ }
+ flip_list(left,&right);
+}
+
+
+
+
+
+
+void
+Sim4::get_stats(Exon *lblock, sim4_stats_t *st) {
+ Exon *t, *t1;
+ bool singleExon = true;
+ // Fills st->icoverage and st->internal from the list after lblock, and may reset st->orientation to BOTH.
+ st->icoverage = 0;
+ st->internal = 1;
+ // nothing usable after the head: internal is 0
+ if ((lblock->next_exon == NULL) || !lblock->next_exon->toGEN)
+ st->internal = 0;
+ // icoverage sums 'length' over every node after the head
+ for (t=lblock->next_exon; t; t = t->next_exon)
+ st->icoverage += t->length;
+
+ t = lblock;
+ while (t) {
+ t1 = t->next_exon;
+ if (t->toGEN && t1 && t1->toGEN) singleExon = false; // two consecutive nodes with nonzero toGEN
+ // unaligned EST bases between two consecutive such nodes clear 'internal'
+ if ((t->toGEN) &&
+ (t1) &&
+ (t1->frEST - t->toEST - 1 > 0) &&
+ t1->toGEN)
+ st->internal = 0;
+
+ t = t1;
+ }
+ // st->orientation and st->percentID are read here, so the caller must have set them beforehand
+ if (!globalParams->_forceStrandPrediction) {
+ if (((st->orientation != BOTH) && (!globalParams->_interspecies && (st->percentID < 90))) ||
+ (!globalParams->_interspecies && (st->internal == 0)) ||
+ singleExon) {
+ st->orientation = BOTH;
+ }
+ }
+}
+
+
+
+
+
+void
+Sim4::flip_list(Exon **left, Exon **right)
+{
+ Exon *ep, *ahead, *behind;
+ // Reverses the next_exon chain in place: *right becomes the old head, *left the old tail (NULL for an empty list).
+ *right = *left;
+ ahead = *left;
+ ep = NULL;
+ while (ahead!=NULL) {
+ behind = ep; // node already reversed
+ ep = ahead; // node being re-linked
+ ahead = ahead->next_exon; // remember the rest before overwriting the link
+ ep->next_exon = behind;
+ }
+ *left = ep;
+}
+
+
+
+/* operates on a list sorted in increasing order of exon coordinates */
+void
+Sim4::compact_list(Exon **Lblock, Exon **Rblock, int SI)
+{
+ Exon *tmp_block=*Lblock, *tmp_block1;
+ int diff;
+ // Merges a node with its successor while their diagonals differ by at most SI and the EST gap is at most MAX_INTERNAL_GAP.
+ while ((tmp_block!=NULL) &&
+ ((tmp_block1=tmp_block->next_exon)!=NULL) &&
+ tmp_block1->toGEN) {
+ if ((abs((tmp_block1->frEST-tmp_block1->frGEN) -
+ (tmp_block->toEST-tmp_block->toGEN))<=SI) &&
+ ((diff=tmp_block1->frEST-tmp_block->toEST-1)<=MAX_INTERNAL_GAP)) {
+ /* merge blocks */
+ tmp_block->toGEN = tmp_block1->toGEN;
+ tmp_block->toEST = tmp_block1->toEST;
+ tmp_block->length = tmp_block->toEST-tmp_block->frEST+1;
+ tmp_block->edist += tmp_block1->edist;
+ tmp_block->edist -= (int)(globalParams->_percentError * diff);
+ tmp_block->next_exon = tmp_block1->next_exon;
+ // tmp_block is not advanced: the merged node is compared against its new successor next
+ //freeExon(tmp_block1); garbage collected
+ } else
+ tmp_block = tmp_block1;
+ }
+ /* reset right end of the list */
+ *Rblock = tmp_block;
+}
+
+/* ------------------ memory management routines --------------- */
+
+
+int
+Sim4::good_ratio(int length) // Returns an integer threshold that depends on 'length' relative to wordSize.
+{
+ if (length<=wordSize/2) return 2; // at most half a word: fixed value 2
+ else if (length<2*wordSize) return DIST_CUTOFF; // below two words: the DIST_CUTOFF constant
+ else return (int)(.75 * globalParams->_percentError * length + 1); // otherwise grows linearly with length, truncated to int
+}
+
+
+void
+Sim4::merge(Exon **t0, Exon **t1)
+{
+ Exon *tmp0, *tmp1;
+ int diff;
+ // Walks from *t0 up to (not including) *t1, fusing adjacent nodes that lie within wordSize of each other.
+ if ((*t0) && !(*t0)->toGEN)
+ tmp0 = (*t0)->next_exon;
+ else
+ tmp0 = *t0;
+ // a head node with toGEN == 0 is skipped above
+ while (tmp0 && (tmp0!=*t1)) {
+ tmp1 = tmp0->next_exon;
+ assert(tmp1!=NULL);
+ // merge when both nodes have nonzero toGEN, the diagonals differ by <= wordSize and the EST gap is <= wordSize
+ if (tmp1 && tmp1->toGEN && tmp0->toGEN &&
+ (abs((tmp1->frEST-tmp1->frGEN)-(tmp0->toEST-tmp0->toGEN))<=wordSize) &&
+ ((tmp1->frEST - tmp0->toEST - 1 <= wordSize))) {
+
+ diff = tmp1->frEST - tmp0->toEST - 1;
+
+ /* merge blocks tmp0 and tmp1 */
+ tmp0->frGEN = MIN(tmp0->frGEN, tmp1->frGEN);
+ tmp0->frEST = MIN(tmp0->frEST, tmp1->frEST);
+ tmp0->toGEN = MAX(tmp1->toGEN, tmp0->toGEN);
+ tmp0->toEST = MAX(tmp1->toEST, tmp0->toEST);
+ tmp0->length = tmp0->toEST-tmp0->frEST+1;
+ tmp0->flag = tmp1->flag;
+ tmp0->edist += tmp1->edist;
+ tmp0->edist -= (int)(globalParams->_percentError * diff);
+ if (tmp1==*t1) {
+ /* tmp0->flag = (*t1)->flag; */
+ *t1 = tmp0; // the end marker was absorbed, so it now names the merged node and the loop stops
+ }
+ tmp0->next_exon = tmp1->next_exon;
+ // tmp0 is not advanced after a merge; it is re-tested against its new successor
+ //freeExon(tmp1); garbage collected
+ } else {
+ tmp0 = tmp0->next_exon;
+ }
+ }
+}
+
+void
+Sim4::free_align(edit_script_list *aligns) {
+ edit_script_list *head;
+ // Releases every node of the list: the script via Free_script(), the node itself via ckfree().
+ head = aligns;
+ // the loop condition re-assigns head from aligns on every pass
+ while ((head=aligns)!=NULL) {
+ aligns = aligns->next_script; // advance before the node is freed
+ Free_script(head->script);
+ ckfree(head);
+ }
+}
+
+
+
+Exon *
+Sim4::bmatch (char *s1, char *s2, int l1, int l2, int offset1, int offset2)
+{
+ int i, j, i1, score;
+ Exon *newthing=NULL;
+ // Scans s1 from its end towards its start for an exact copy of s2[2..l2-3]; returns a new Exon for the first hit whose flanks also agree, else NULL.
+ for (i1=i=l1-3; i>=l2-3; i--, i1=i) {
+ for (j=l2-3; j>=2; j--, i1--)
+ if (*(s1+i1)!=*(s2+j))
+ break;
+ // j < 2 means the inner loop ran to completion without a mismatch
+ if (j<2) {
+ /* exact match for CDS found; check signals */
+ score = 0;
+ if (*(s1+(i1--))==*(s2+(j--))) score++; // the two characters preceding the matched core ...
+ if (*(s1+(i1--))==*(s2+(j--))) score++;
+ if (*(s1+i1+l2-1)==*(s2+j+l2-1)) score++; // ... and the two following it (j is -1 here)
+ if (*(s1+i1+l2)==*(s2+j+l2)) score++;
+ if (score>=3) { // at least three of the four flanking characters must agree
+ newthing = _exonManager.newExon(i1+3+offset1, offset2, i1+3+offset1+l2-5,
+ offset2+l2-5, l2-4, 0, 0, NULL);
+ newthing->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
+ // G_score and C_score are read from outside this function
+ return newthing;
+ }
+ }
+ }
+ return NULL;
+}
+
+Exon *
+Sim4::fmatch (char *s1, char *s2, int l1, int l2, int offset1, int offset2)
+{
+ int i, j, i1, score;
+ Exon *newthing=NULL;
+ // Scans s1 from its start for an exact copy of s2[2..l2-3]; returns a new Exon for the first hit whose flanks also agree, else NULL.
+ for (i1=i=2; i<l1-l2+3; i++, i1=i) {
+ for (j=2; j<l2-2; j++, i1++)
+ if (*(s1+i1)!=*(s2+j))
+ break;
+ // j >= l2-2 means the inner loop ran to completion without a mismatch
+ if (j>=l2-2) {
+ /* exact match found for internal part, look for signals */
+ score = 0;
+ if (*(s1+(i1++))==*(s2+(j++))) score++; // the two characters following the matched core ...
+ if (*(s1+(i1++))==*(s2+(j++))) score++;
+ if (*(s1+i1-l2)==*s2) score++; // ... and the two preceding it
+ if (*(s1+i1-l2+1)==*(s2+1)) score++;
+ if (score>=3) { // at least three of the four flanking characters must agree
+ newthing = _exonManager.newExon(i+offset1,offset2,i1+offset1-2,offset2+l2-5,
+ l2-4,0,0,NULL);
+ newthing->ori = (G_score >= abs(C_score)) ? 'G' : 'C';
+ // G_score and C_score are read from outside this function
+ return newthing;
+ }
+ }
+ }
+ return NULL;
+}
+
+
+/* -------------------- to be added to psublast ---------------------- */
+
+bool
+Sim4::get_sync_flag(Exon *lblock, Exon *rblock, int w)
+{
+ int numx=0, e2;
+ Exon *t;
+ // True only when at least three exons follow lblock, they are contiguous on the EST, and each one except rblock spans >= 2*w+2 on both sequences.
+ if (((t=lblock->next_exon)==NULL) || !t->toGEN)
+ return 0;
+ numx++;
+ e2 = t->toEST;
+ // the first exon is only counted; the size and contiguity tests start with the second
+ while (((t=t->next_exon)!=NULL) && t->toGEN) {
+ ++numx;
+ if ((t->frEST-e2>1) ||
+ (t!=rblock && ((t->toEST-t->frEST+1<2*w+2) || (t->toGEN-t->frGEN+1<2*w+2))))
+ return 0;
+ e2 = t->toEST;
+ }
+ // fewer than three exons: false
+ return ((numx<3) ? 0:1);
+}
+
+
+
+void
+Sim4::sync_slide_intron(int in_w, Exon *first, Exon *last, int spl_model, sim4_stats_t *st) {
+ Exon *t0=NULL, *t1=NULL, *head = first;
+ splice_t *g=NULL, *c=NULL, *cell=NULL;
+ splice_t **Glist, **Clist;
+ int Gscore=0, Cscore=0;
+ char *oris;
+ int w1, w2, ni, i, numC, numG, model;
+ // Refines every intron between 'first' and 'last', then forces one consistent strand (G or C) on all of them; sets st->orientation.
+ ni = 0;
+ numG = numC = 0;
+
+ // Count the exons to allocate space for Glist, Clist and oris
+ //
+ t0 = head;
+ while (t0 && (t0!=last) && (t1=t0->next_exon) && t1->toGEN) {
+ ni++;
+ t0 = t1;
+ }
+
+ Glist = (splice_t **)ckalloc((ni + 1) * sizeof(splice_t *));
+ Clist = (splice_t **)ckalloc((ni + 1) * sizeof(splice_t *));
+ oris = (char *) ckalloc((ni + 1) * sizeof(char));
+ // Verify the allocations before memset() writes through the pointers.
+ if ((Glist == 0L) || (Clist == 0L) || (oris == 0L)) {
+ fprintf(stderr, "Can't allocate memory for sync_slide_intron() with %d exons.\n", ni);
+ exit(1);
+ }
+
+ memset(Glist, 0, (ni + 1) * sizeof(splice_t *));
+ memset(Clist, 0, (ni + 1) * sizeof(splice_t *));
+ memset(oris, 0, (ni + 1) * sizeof(char));
+
+ ni = 0;
+ // First pass: classify each intron as 'G', 'C' or 'E' and keep the cells that were computed for it.
+ /* assume forward orientation */
+ t0 = head;
+ while (t0 && (t0!=last) && (t1=t0->next_exon) && t1->toGEN) {
+ g = c = NULL;
+ if (t1->frEST-t0->toEST-1==0) {
+ if (!strncmp((char *)(_genSeq+t0->toGEN),"GT",2) &&
+ !strncmp((char *)(_genSeq+t1->frGEN-3),"AG",2)) {
+ g = new_splice('G',t0->toGEN,t1->frGEN,t0->toEST,t1->frEST,-1,NULL);
+ t0->ori = 'G';
+ oris[ni] = 'G';
+ numG++;
+#ifdef SPLSCORE
+ t0->splScore = 999999;
+#endif
+ } else if (!strncmp((char *)(_genSeq+t0->toGEN),"CT",2) &&
+ !strncmp((char *)(_genSeq+t1->frGEN-3),"AC",2)) {
+ c = new_splice('C',t0->toGEN,t1->frGEN,t0->toEST,t1->frEST,-1,NULL);
+ t0->ori = 'C';
+ oris[ni] = 'C';
+ numC++;
+#ifdef SPLSCORE
+ t0->splScore = 888888;
+#endif
+ } else {
+ w1 = MIN(in_w, (int)(0.5*MIN(t0->length-1, t0->toGEN-t0->frGEN)));
+ w2 = MIN(in_w, (int)(0.5*MIN(t1->length-1, t1->toGEN-t1->frGEN)));
+ model = ((t0->toGEN-w1<=MAX_SPAN) || (t1->frGEN+w2+MAX_SPAN+2>_genLen)) ?
+ SPLICE_ORIGINAL : spl_model;
+ splice(_genSeq, t0->toGEN-w1, t0->toGEN+w1, t1->frGEN-w2, t1->frGEN+w2,
+ _estSeq, t0->toEST-w1, t1->frEST+w2, &g, &c, BOTH, model);
+
+ Gscore += g->score; Cscore += c->score; // accumulated but not read again in this function
+ cell = NULL; oris[ni] = '*';
+ if (g->score>c->score) {
+ numG++; cell = g; oris[ni] = 'G';
+ } else if (c->score>g->score) {
+ numC++; cell = c; oris[ni] = 'C';
+ } else if (c->score==g->score) {
+ numG++; numC++; cell = g; oris[ni] = 'G';
+ }
+#ifdef SPLSCORE
+ t0->splScore = (model==spl_model) ? cell->score : 777777;
+#endif
+ t0->ori = oris[ni];
+ t0->toGEN = cell->xs; t0->toEST = cell->ys;
+ t1->frGEN = cell->xe; t1->frEST = cell->ye;
+ t0->length = t0->toEST-t0->frEST+1;
+ t1->length = t1->toEST-t1->frEST+1;
+ }
+ Clist[ni] = c; Glist[ni] = g;
+ } else {
+ t0->ori = 'E'; oris[ni] = 'E';
+ }
+ ni++;
+ t0 = t1;
+ }
+
+ st->orientation = BOTH;
+ // Index 1 is always inside the ni+1 zeroed entries, so the test below is safe even when ni == 1.
+ if ((numG==1) && (numC==1) &&
+ (!Glist[0] || !Clist[0] || !Glist[1] || !Clist[1])) goto free_all;
+
+ if (numG && numG>=numC) {
+ /* revisit all previous assignments that are inconsistent */
+ for (i=0, t0=head; i<ni; i++, t0=t1) {
+ t1 = t0->next_exon;
+ switch (oris[i]) {
+ case 'G': break;
+ case 'C': if (Glist[i]==NULL) { /* no G cell was kept for this intron: compute one (forward only) */
+ w1 = MIN(in_w, (int)(0.5*MIN(t0->length-1, t0->toGEN-t0->frGEN)));
+ w2 = MIN(in_w, (int)(0.5*MIN(t1->length-1, t1->toGEN-t1->frGEN)));
+ model = ((t0->toGEN-w1<=MAX_SPAN) || (t1->frGEN+w2+MAX_SPAN+2>_genLen)) ?
+ SPLICE_ORIGINAL : spl_model;
+ splice(_genSeq, t0->toGEN-w1, t0->toGEN+w1,
+ t1->frGEN-w2, t1->frGEN+w2, _estSeq,
+ t0->toEST-w1, t1->frEST+w2, &g, &c, FWD, model);
+ Glist[i] = g; ckfree(c); c = NULL; /* park g for free_all; drop the placeholder C cell (both leaked before) */
+ } else g = Glist[i];
+ /* NOTE(review): when Glist[i] was reused, 'model' below still holds the value from an earlier intron */
+#ifdef SPLSCORE
+ t0->splScore = (model==spl_model) ? g->score : 777777;
+#endif
+
+ t0->ori = 'G';
+ t0->toGEN = g->xs; t0->toEST = g->ys;
+ t1->frGEN = g->xe; t1->frEST = g->ye;
+ t0->length = t0->toEST-t0->frEST+1;
+ t1->length = t1->toEST-t1->frEST+1;
+
+ break;
+ case 'E': break;
+ default : fatal("sim4b1.c: intron orientation not initialized.");
+ }
+ if (oris[i]!='E') wobble(t0,t1,"GT","AG",_genSeq);
+ }
+
+ st->orientation = FWD;
+ } else if (numC) {
+ /* analyze all assignments for consistency */
+ for (i=0, t0=head; i<ni; i++, t0=t1) {
+ t1 = t0->next_exon;
+ switch (oris[i]) {
+ case 'C': break;
+ case 'G': if (Clist[i]==NULL) { /* no C cell was kept for this intron: compute one (backward only) */
+ w1 = MIN(in_w, (int)(0.5*MIN(t0->length-1, t0->toGEN-t0->frGEN)));
+ w2 = MIN(in_w, (int)(0.5*MIN(t1->length-1, t1->toGEN-t1->frGEN)));
+ model = ((t0->toGEN-w1<=MAX_SPAN) || (t1->frGEN+w2+MAX_SPAN+2>_genLen)) ?
+ SPLICE_ORIGINAL : spl_model;
+ splice(_genSeq, t0->toGEN-w1, t0->toGEN+w1,
+ t1->frGEN-w2, t1->frGEN+w2,
+ _estSeq, t0->toEST-w1, t1->frEST+w2, &g, &c, BWD, model);
+ Clist[i] = c; ckfree(g); g = NULL; /* park c for free_all; drop the placeholder G cell (both leaked before) */
+ } else c = Clist[i];
+ /* NOTE(review): when Clist[i] was reused, 'model' below still holds the value from an earlier intron */
+#ifdef SPLSCORE
+ t0->splScore = (spl_model==model) ? c->score : 777777;
+#endif
+ t0->ori = 'C';
+ t0->toGEN = c->xs; t0->toEST = c->ys;
+ t1->frGEN = c->xe; t1->frEST = c->ye;
+ t0->length = t0->toEST-t0->frEST+1;
+ t1->length = t1->toEST-t1->frEST+1;
+ break;
+ case 'E': break;
+ default : fatal("sim4b1.c: intron orientation not initialized.");
+ }
+ if (oris[i]!='E') wobble(t0,t1,"CT","AC",_genSeq);
+ }
+
+ st->orientation = BWD;
+ }
+
+ /* now free all memory allocated */
+ free_all:
+ for (i=0; i<ni; i++) {
+ ckfree(Glist[i]);
+ ckfree(Clist[i]);
+ }
+
+ ckfree(Glist);
+ ckfree(Clist);
+ ckfree(oris);
+
+ return;
+}
+
+
+void
+Sim4::wobble(Exon *t0, // exon before the intron; toGEN (and sometimes toEST/length) may be adjusted by one
+ Exon *t1, // exon after the intron; frGEN (and sometimes frEST/length) may be adjusted by one
+ const char *donor, // two-character donor signal, e.g. "GT"
+ const char *acceptor, // two-character acceptor signal
+ char *seq) { // sequence indexed by the GEN coordinates
+ char *s = seq + t0->toGEN; // first nt of donor
+ char *q = seq + t1->frGEN - 3; // first nt of acceptor
+ // Looks for each signal in place or one position to either side and nudges the boundaries to match; no change if nothing fits.
+ if (DAcmp(s, donor[0], donor[1])) {
+ /* match in place */
+ if (DAcmp(q, acceptor[0], acceptor[1])) {
+ return;
+ } else if (DAcmp(q-1, acceptor[0], acceptor[1])) {
+ t1->frGEN--;
+ return;
+ } else if (DAcmp(q+1, acceptor[0], acceptor[1])) {
+ t1->frGEN++;
+ return;
+ }
+ } else if (DAcmp(s-1, donor[0], donor[1])) {
+ /* match is 1 off to the left */
+ if (DAcmp(q, acceptor[0], acceptor[1])) {
+ t0->toGEN--;
+ return;
+ } else if (DAcmp(q-1, acceptor[0], acceptor[1])) {
+ t0->toGEN--; // both signals shifted the same way: the EST coordinates move too
+ t1->frGEN--;
+ t0->toEST--;
+ t1->frEST--;
+ t0->length++; // NOTE(review): toEST decreased yet length grows; looks inverted relative to length = toEST-frEST+1, confirm
+ t1->length--;
+ return;
+ } else if (DAcmp(q+1, acceptor[0], acceptor[1])) {
+ t0->toGEN--;
+ t1->frGEN++;
+ return;
+ }
+ } else if (DAcmp(s+1, donor[0], donor[1])) {
+ /* match is 1 off to the right */
+ if (DAcmp(q, acceptor[0], acceptor[1])) {
+ t0->toGEN++;
+ return;
+ } else if (DAcmp(q-1, acceptor[0], acceptor[1])) {
+ t0->toGEN++;
+ t1->frGEN--;
+ return;
+ } else if (DAcmp(q+1, acceptor[0], acceptor[1])) {
+ t0->toGEN++; // both signals shifted the same way: the EST coordinates move too
+ t1->frGEN++;
+ t0->toEST++;
+ t1->frEST++;
+ t0->length--; // NOTE(review): toEST increased yet length shrinks; same concern as the left-shift case
+ t1->length++;
+ return;
+ }
+ } else if (DAcmp(q-1, acceptor[0], acceptor[1])) { // no donor found anywhere: only the acceptor side is adjusted
+ /* match is 1 off to the left */
+ t1->frGEN--;
+ return;
+ } else if (DAcmp(q+1, acceptor[0], acceptor[1])) {
+ /* match is 1 off to the right */
+ t1->frGEN++;
+ return;
+ }
+}
+
+
+void
+Sim4::slide_intron(int in_w, Exon *first, Exon *last, int spl_model, sim4_stats_t *st)
+{
+ Exon *t0, *t1, *head = first;
+ splice_t *g, *c, *cell;
+ char type;
+ int w1, w2;
+ int numG=0, numC=0, numE=0, numN=0, model;
+ // Refines each intron between 'first' and 'last' independently, labels it G/C/N/E in t0->ori, and derives st->orientation from the counts.
+ t0 = head;
+ while (t0 && (t0!=last) && (t1=t0->next_exon) && t1->toGEN) {
+ g = c = NULL;
+ if (t1->frEST-t0->toEST-1==0) { // the two exons are adjacent on the EST
+ if (!strncmp((char *)(_genSeq+t0->toGEN),"GT",2) &&
+ !strncmp((char *)(_genSeq+t1->frGEN-3),"AG",2)) {
+ t0->ori = 'G';
+ numG++;
+#ifdef SPLSCORE
+ t0->splScore = 999999;
+#endif
+ }
+ else if (!strncmp((char *)(_genSeq+t0->toGEN),"CT",2) &&
+ !strncmp((char *)(_genSeq+t1->frGEN-3),"AC",2)) {
+ t0->ori = 'C';
+ numC++;
+#ifdef SPLSCORE
+ t0->splScore = 888888;
+#endif
+ }
+ else {
+ int gtag=0, ctac=0;
+ char *s;
+ // window half-widths: at most in_w, and at most half of each exon's usable extent
+ w1 = MIN(in_w, (int)(0.5*MIN(t0->length-2, t0->toGEN-t0->frGEN-1)));
+ w2 = MIN(in_w, (int)(0.5*MIN(t1->length-2, t1->toGEN-t1->frGEN-1)));
+ model = ((t0->toGEN-w1<=MAX_SPAN) || (t1->frGEN+w2+MAX_SPAN+2>_genLen)) ?
+ SPLICE_ORIGINAL : spl_model; // too close to either end of the genomic sequence: use the original model
+ splice(_genSeq, t0->toGEN-w1, t0->toGEN+w1, t1->frGEN-w2, t1->frGEN+w2,
+ _estSeq, t0->toEST-w1, t1->frEST+w2, &g, &c, BOTH, model);
+ if (g->score>c->score) { cell = g; type = 'G'; }
+ else if (c->score>g->score) { cell = c; type = 'C'; }
+ else { cell = g; type = 'G'; } // ties go to G
+
+#ifdef SPLSCORE
+ t0->splScore = (model==spl_model) ? cell->score : 777777;
+#endif
+ // adopt the winning cell's boundaries and recompute both lengths
+ t0->toGEN = cell->xs; t0->toEST = cell->ys;
+ t1->frGEN = cell->xe; t1->frEST = cell->ye;
+ t0->length = t0->toEST-t0->frEST+1;
+ t1->length = t1->toEST-t1->frEST+1;
+
+ wobble(t0,t1,(type=='G')? "GT":"CT",(type=='G')? "AG":"AC",_genSeq);
+ // both cells returned by splice() are released here
+ ckfree(g);
+ ckfree(c);
+
+ /* determine the type, based on the # matches w/ GT-AG (CT-AC) */
+ s = _genSeq+t0->toGEN;
+ if (*s=='G') gtag++; else if (*s=='C') ctac++;
+ ++s;
+ if (*s=='T') { gtag++; ctac++;}
+ s = _genSeq+t1->frGEN-3;
+ if (*s=='A') { gtag++; ctac++; }
+ ++s;
+ if (*s=='G') gtag++; else if (*s=='C') ctac++;
+ if (gtag>ctac) {
+ type = 'G';
+ numG++;
+ }
+ else if (ctac>gtag) {
+ type = 'C';
+ numC++;
+ }
+ else {
+ type = 'N';
+ numN++;
+ }
+ // the label recorded is the recount above, not the splice() winner
+ t0->ori = type;
+ }
+ } else {
+ t0->ori = 'E';
+ numE++;
+ }
+ t0 = t1;
+ }
+ // Default is BOTH; a single strand is reported only when every intron agrees.
+ st->orientation = BOTH;
+
+ if ((numG > 0) && ((numC + numE + numN) == 0)) {
+ st->orientation = FWD;
+ } else if ((numC > 0) && ((numG + numE + numN) == 0)) {
+ st->orientation = BWD;
+ }
+
+ /* code not actually used - sim4cc (-interspecies) currently uses only
+ sync_slide_intron(), but provided here in case that changes */
+ if ((globalParams->_interspecies) && (st->orientation == BOTH)) {
+ if (numG > numC)
+ st->orientation = FWD;
+ if (numG < numC)
+ st->orientation = BWD;
+ }
+ // with _forceStrandPrediction a simple majority of G versus C decides
+ if ((globalParams->_forceStrandPrediction) && (st->orientation == BOTH)) {
+ if (numG > numC)
+ st->orientation = FWD;
+ if (numG < numC)
+ st->orientation = BWD;
+
+ // otherwise, st->orientation = match orientation, but we
+ // don't know that here. It's set in sim4string.C:run()
+ }
+
+}
+
+
+/* Removes short (potentially spurious) marginal exons past a long intron
+ or cDNA gap; remove short (potentially spurious) internal exons bordering
+ a cDNA gap; rblock now points to the second to last element in the list */
+
+/* Three passes over the exon list that hangs off *lblock:
+     1. front: unlink leading short exons that are followed by a long
+        intron or a cDNA gap (repeated until nothing more is removed);
+     2. back:  unlink trailing short exons that follow a long intron or a
+        cDNA gap, and move *rblock to the last exon that is kept;
+     3. inner: unlink short exons on either side of a cDNA gap.
+   "Short" means genomic or cDNA extent <= SHORT_EXON; "long intron" means
+   genomic distance > LONG_INTRON; "cDNA gap" means frEST-toEST > 1.
+   Unlinked exons are not freed here (see the LLL notes below). */
+void
+Sim4::filter(Exon **lblock, Exon **rblock)
+{
+ Exon *t0, *t1, *last;
+ int frontTrim = 0;
+
+ /* NOTE: There must be >=1 non-cap exon on either side of the intron */
+
+ go_front:
+
+ /* At the start... keep t1*/
+ last = NULL;
+ t0 = (*lblock)->next_exon;
+ t1 = t0->next_exon;
+
+ /* Scan the leading run of short exons; remember the last one in that run
+    that is followed by a long intron or a cDNA gap. */
+ while (t1 && t1->next_exon && (t0->toGEN-t0->frGEN+1<=SHORT_EXON || t0->toEST-t0->frEST+1<=SHORT_EXON)) {
+ if ((t1->frGEN-t0->toGEN>LONG_INTRON) || (t1->frEST-t0->toEST>1))
+ last = t0;
+ t0 = t1; t1 = t1->next_exon;
+ }
+
+ if (last) {
+ /* reset the start of exon list */
+ last = last->next_exon;
+ (*lblock)->next_exon = last;
+
+ /* remove all start exons up to and including 'last' */
+ /* With freeExon() disabled and the list head already reset above, this
+    loop body is never entered (t0 starts equal to 'last'). */
+ t0 = (*lblock)->next_exon;
+ while (t0!=last) {
+ t1 = t0;
+ t0 = t0->next_exon;
+// freeExon(t1); LLL 6-17-10 We are doing garbage collection
+ }
+ /* (*rblock) remains unchanged - see NOTE */
+ }
+
+ /* repeat if necessary */
+ if (last) {
+ frontTrim = 1; goto go_front;
+ }
+
+ /* At the end... keep t0 */
+ last = NULL;
+ /* After a front trim the scan starts at the head node itself rather than
+    at the first exon. */
+ t0 = frontTrim ? (*lblock) : (*lblock)->next_exon;
+ t1 = t0->next_exon;
+
+ /* NOTE: if all exons are short on both sides of the long intron, then
+ the condition in the loop below will test TRUE and the rest
+ of the exons to the right of the intron will be removed */
+
+ while (t1 && t1->next_exon) {
+ if ((t1->frGEN-t0->toGEN>LONG_INTRON) || (t1->frEST-t0->toEST>1)) {
+ /* Candidate cut point: everything after t0 goes if only short exons
+    follow up to the end of the list. */
+ last = t0;
+ while (t1 && t1->next_exon && (t1->toGEN-t1->frGEN+1<=SHORT_EXON || t1->toEST-t1->frEST+1<=SHORT_EXON)) {
+ t0 = t1; t1 = t1->next_exon;
+ }
+ if (t1->next_exon!=NULL) {
+ /* long exon found */
+ t0 = last; t1 = t0->next_exon;
+ last = NULL;
+ }
+ }
+ t0 = t1; t1 = t1->next_exon;
+ }
+
+ if (last) {
+ /* reset the end of exon list */
+ t0 = last->next_exon;
+ last->next_exon = (*rblock)->next_exon;
+ *rblock = last;
+
+ /* remove all end exons, starting with last->next_exon */
+ /* With freeExon() disabled this loop only walks the unlinked exons. */
+ while (t0!=(*rblock)->next_exon) {
+ t1 = t0;
+ t0 = t0->next_exon;
+// freeExon(t1); LLL 6-17-10 We are doing garbage collection
+ }
+ /* (*lblock) stays unchanged */
+ }
+
+ /* now filter short internal exons nearby cDNA gaps */
+ last = *lblock; t0 = last->next_exon; t1 = t0->next_exon;
+ while (t1 && t1->next_exon) {
+ if (t1->frEST-t0->toEST>1) {
+ if (t0->toGEN-t0->frGEN+1<=SHORT_EXON || t0->toEST-t0->frEST+1<=SHORT_EXON) {
+ /* remove t0 */
+ last->next_exon = t1;
+// freeExon(t0); LLL 6-17-10 We are doing garbage collection
+ t0 = last; /* note: for simplicity, limit removing exons back to one exon */
+ }
+ /* NOTE(review): unlike the outer loop, this loop does not test t1 or
+    t1->next_exon before dereferencing; it relies on reaching an exon that
+    is not short before running off the list - confirm this always holds. */
+ while (t1->toGEN-t1->frGEN+1<=SHORT_EXON || t1->toEST-t1->frEST+1<=SHORT_EXON) {
+ /* remove t1 */
+ t0->next_exon = t1->next_exon;
+// freeExon(t1); LLL We are doing garbage collection
+ t1 = t0->next_exon;
+ }
+ t0 = t1; t1 = t1->next_exon;
+ } else {
+ last = t0; t0 = t1;
+ t1 = t1->next_exon;
+ }
+ }
+}
+
+
+/* Judge the exon chain that starts after 'lblock' and ends at 'rblock'.
+   Sets st->icoverage to the summed cDNA extent of the exons visited and
+   returns false when any visited exon has edist at or above 5% of its cDNA
+   extent, or when the coverage tests below fail.  N is compared against
+   cDNA coordinates in those tests. */
+bool
+Sim4::get_match_quality(Exon *lblock, Exon *rblock, sim4_stats_t *st, int N)
+{
+  Exon *head       = lblock->next_exon;
+  bool  acceptable = true;
+
+  /* Accumulate coverage exon by exon; the walk ends at the first exon with
+     toGEN == 0, or early at the first exon that is too noisy. */
+  st->icoverage = 0;
+  for (Exon *e = head; e->toGEN; e = e->next_exon) {
+    st->icoverage += e->toEST-e->frEST+1;
+    if (100*e->edist>=5*(e->toEST-e->frEST+1)) {
+      acceptable = false;
+      break;
+    }
+  }
+
+  int tcov = rblock->toEST-head->frEST+1;
+
+  /* Two ways a partial match is still accepted: it lies entirely in the
+     second half of the cDNA, or entirely in the first half, and in either
+     case is dense enough over the region it spans. */
+  bool denseEnough = (st->icoverage>=MAX(.95*tcov,100));
+  bool inBackHalf  = (head->frEST>=.5*N) && (tcov>=.75*(N-head->frEST)) && denseEnough;
+  bool inFrontHalf = (rblock->toEST<=.5*N) && (tcov>=.75*rblock->toEST) && denseEnough;
+
+  /* Otherwise require 75% of the cDNA spanned and 90% of the span covered. */
+  if (!inBackHalf && !inFrontHalf &&
+      ((tcov<.75*N) || (st->icoverage<.9*tcov)))
+    acceptable = false;
+
+  return acceptable;
+}
+
+
diff --git a/libsim4/sim4polish/Make.include b/libsim4/sim4polish/Make.include
new file mode 100644
index 0000000..2a2b81e
--- /dev/null
+++ b/libsim4/sim4polish/Make.include
@@ -0,0 +1,38 @@
+# -*- makefile -*-
+
+# Absolute paths (with trailing slash) of the libraries whose headers this
+# directory needs.  Each $(realpath ...) call is closed by exactly one ')';
+# an extra ')' would become part of the path and end up in the -I flags.
+LIBUTL/ :=$(realpath $/../../libutil/)/
+LIBBIO/ :=$(realpath $/../../libbio/)/
+
+# Every source and header that belongs to libsim4polish.
+src :=$/sim4polish.C \
+      $/sim4polish-copy.C \
+      $/sim4polish-compare.C \
+      $/sim4polish-deleteexon.C \
+      $/sim4polish-exons.C \
+      $/sim4polish-polishtostring.C \
+      $/sim4polish-read.C \
+      $/sim4polish-stringtopolish.C \
+      $/sim4polish-updatescores.C \
+      $/sim4polish.H \
+      $/sim4polishList.H \
+      $/sim4polishList.C \
+      $/sim4polishBuilder.H \
+      $/sim4polishBuilder.C \
+      $/sim4polishFile.H \
+      $/sim4polishFile.C \
+      $/sim4polishReader.C \
+      $/sim4polishReader.H \
+      $/sim4polishWriter.C \
+      $/sim4polishWriter.H
+
+# Split the list by file extension.
+$/.C_SRCS :=$(filter %.c,${src})
+$/.C_INCS :=$(filter %.h,${src})
+$/.CXX_SRCS :=$(filter %.C,${src})
+$/.CXX_INCS :=$(filter %.H,${src})
+$/.CXX_LIBS :=$/libsim4polish.a
+
+$/.CLEAN :=$/*.o
+
+$/libsim4polish.a : ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o}
+
+# Objects and dependency files in this directory see the libutil and libbio
+# headers.
+$(eval $/%.d $/%.o: CFLAGS +=-I${LIBUTL/} -I${LIBBIO/})
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBUTL/} -I${LIBBIO/})
diff --git a/libsim4/sim4polish/sim4polish-compare.C b/libsim4/sim4polish/sim4polish-compare.C
new file mode 100644
index 0000000..31481b8
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-compare.C
@@ -0,0 +1,406 @@
+#include "sim4polish.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+//
+// Routines for comparing sim4polish structures.
+//
+// Many of these routines assume that the iid's are consistent for
+// the pair of polishes. In particular, that they are mapped to the
+// same set of genomic sequences.
+//
+
+
+//  qsort()-style comparator for arrays of (sim4polish *).
+//
+//  Orders by EST id, then genomic id, then genomic start of the first
+//  exon.  NULL entries sort after all non-NULL entries; two NULL entries
+//  (or the same object twice) compare equal, so the ordering is
+//  consistent in both directions.
+//
+//  Both polishes are expected to have at least one exon.
+//
+int
+s4p_estIDcompare(const void *a, const void *b) {
+  sim4polish *A = (*(sim4polish **)a);
+  sim4polish *B = (*(sim4polish **)b);
+
+  if (A == B)   return(0);
+  if (A == 0L)  return(1);
+  if (B == 0L)  return(-1);
+
+  if (A->_estID < B->_estID)  return(-1);
+  if (A->_estID > B->_estID)  return(1);
+  if (A->_genID < B->_genID)  return(-1);
+  if (A->_genID > B->_genID)  return(1);
+  if (A->_exons[0]._genFrom < B->_exons[0]._genFrom)  return(-1);
+  if (A->_exons[0]._genFrom > B->_exons[0]._genFrom)  return(1);
+
+  return(0);
+}
+
+
+
+//  qsort()-style comparator for arrays of (sim4polish *).
+//
+//  Orders by genomic id, then genomic start of the first exon, then EST
+//  id.  NULL entries sort after all non-NULL entries; two NULL entries
+//  (or the same object twice) compare equal, so the ordering is
+//  consistent in both directions.
+//
+//  Both polishes are expected to have at least one exon.
+//
+int
+s4p_genIDcompare(const void *a, const void *b) {
+  sim4polish *A = (*(sim4polish **)a);
+  sim4polish *B = (*(sim4polish **)b);
+
+  if (A == B)   return(0);
+  if (A == 0L)  return(1);
+  if (B == 0L)  return(-1);
+
+  if (A->_genID < B->_genID)  return(-1);
+  if (A->_genID > B->_genID)  return(1);
+  if (A->_exons[0]._genFrom < B->_exons[0]._genFrom)  return(-1);
+  if (A->_exons[0]._genFrom > B->_exons[0]._genFrom)  return(1);
+  if (A->_estID < B->_estID)  return(-1);
+  if (A->_estID > B->_estID)  return(1);
+
+  return(0);
+}
+
+
+
+//  qsort()-style comparator for arrays of (sim4polish *).
+//
+//  Orders by EST defline, then genomic defline (both via strcmp), then
+//  genomic start of the first exon.  A NULL polish sorts after every
+//  non-NULL polish, and a NULL defline sorts after every non-NULL defline.
+//  When both sides are NULL for a key, that key is a tie and the next key
+//  decides, so the ordering is consistent in both directions.
+//
+//  Both polishes are expected to have at least one exon.
+//
+int
+s4p_estDEFcompare(const void *a, const void *b) {
+  sim4polish *A = (*(sim4polish **)a);
+  sim4polish *B = (*(sim4polish **)b);
+  int         e = 0;
+
+  if (A == B)   return(0);
+  if (A == 0L)  return(1);
+  if (B == 0L)  return(-1);
+
+  //  Primary key: EST defline.
+  if ((A->_estDefLine == 0L) != (B->_estDefLine == 0L))
+    return((A->_estDefLine == 0L) ? 1 : -1);
+  if (A->_estDefLine != 0L) {
+    e = strcmp(A->_estDefLine, B->_estDefLine);
+    if (e < 0)  return(-1);
+    if (e > 0)  return(1);
+  }
+
+  //  Secondary key: genomic defline.
+  if ((A->_genDefLine == 0L) != (B->_genDefLine == 0L))
+    return((A->_genDefLine == 0L) ? 1 : -1);
+  if (A->_genDefLine != 0L) {
+    e = strcmp(A->_genDefLine, B->_genDefLine);
+    if (e < 0)  return(-1);
+    if (e > 0)  return(1);
+  }
+
+  if (A->_exons[0]._genFrom < B->_exons[0]._genFrom)  return(-1);
+  if (A->_exons[0]._genFrom > B->_exons[0]._genFrom)  return(1);
+
+  return(0);
+}
+
+
+
+//  qsort()-style comparator for arrays of (sim4polish *).
+//
+//  Orders by genomic defline, then EST defline (both via strcmp), then
+//  genomic start of the first exon.  A NULL polish sorts after every
+//  non-NULL polish, and a NULL defline sorts after every non-NULL defline.
+//  When both sides are NULL for a key, that key is a tie and the next key
+//  decides, so the ordering is consistent in both directions.
+//
+//  Both polishes are expected to have at least one exon.
+//
+int
+s4p_genDEFcompare(const void *a, const void *b) {
+  sim4polish *A = (*(sim4polish **)a);
+  sim4polish *B = (*(sim4polish **)b);
+  int         e = 0;
+
+  if (A == B)   return(0);
+  if (A == 0L)  return(1);
+  if (B == 0L)  return(-1);
+
+  //  Primary key: genomic defline.
+  if ((A->_genDefLine == 0L) != (B->_genDefLine == 0L))
+    return((A->_genDefLine == 0L) ? 1 : -1);
+  if (A->_genDefLine != 0L) {
+    e = strcmp(A->_genDefLine, B->_genDefLine);
+    if (e < 0)  return(-1);
+    if (e > 0)  return(1);
+  }
+
+  //  Secondary key: EST defline.
+  if ((A->_estDefLine == 0L) != (B->_estDefLine == 0L))
+    return((A->_estDefLine == 0L) ? 1 : -1);
+  if (A->_estDefLine != 0L) {
+    e = strcmp(A->_estDefLine, B->_estDefLine);
+    if (e < 0)  return(-1);
+    if (e > 0)  return(1);
+  }
+
+  if (A->_exons[0]._genFrom < B->_exons[0]._genFrom)  return(-1);
+  if (A->_exons[0]._genFrom > B->_exons[0]._genFrom)  return(1);
+
+  return(0);
+}
+
+
+
+
+
+
+
+// Return false if not from the same EST/GEN pair, or mapped to
+// different strands, true otherwise.
+//
+bool
+s4p_compatable(sim4polish *A, sim4polish *B) {
+  //  All three must agree: the EST, the genomic sequence, and the match
+  //  orientation.
+  bool  sameEST = (A->_estID == B->_estID);
+  bool  sameGEN = (A->_genID == B->_genID);
+  bool  sameOri = (A->_matchOrientation == B->_matchOrientation);
+
+  return(sameEST && sameGEN && sameOri);
+}
+
+
+
+
+// Returns true if the two polishes are on about the same genomic
+// region
+//
+bool
+s4p_IsSameRegion(sim4polish *A, sim4polish *B, int tolerance) {
+  //  Genomic extent of each polish: start of the first exon to end of the
+  //  last exon.  A polish with no exons is treated as the extent [0,0].
+  int32  beginA = 0, endA = 0;
+  int32  beginB = 0, endB = 0;
+
+  if (A->_numExons > 0) {
+    beginA = (int32)A->_exons[0]._genFrom;
+    endA   = (int32)A->_exons[A->_numExons-1]._genTo;
+  }
+
+  if (B->_numExons > 0) {
+    beginB = (int32)B->_exons[0]._genFrom;
+    endB   = (int32)B->_exons[B->_numExons-1]._genTo;
+  }
+
+  //  Same region when both ends differ by no more than 'tolerance'
+  //  (inclusive) in either direction.
+  int32  shiftBegin = beginB - beginA;
+  int32  shiftEnd   = endB   - endA;
+
+  bool   beginOK = (shiftBegin >= -tolerance) && (shiftBegin <= tolerance);
+  bool   endOK   = (shiftEnd   >= -tolerance) && (shiftEnd   <= tolerance);
+
+  return(beginOK && endOK);
+}
+
+
+
+// Returns true if the two polishes overlap genomic regions
+//
+bool
+s4p_IsRegionOverlap(sim4polish *A, sim4polish *B) {
+
+  //  Different genomic sequences can never overlap.
+  if (A->_genID != B->_genID)
+    return(false);
+
+  //  Genomic extent of each polish; a polish with no exons is [0,0].
+  int32  beginA = 0, endA = 0;
+  int32  beginB = 0, endB = 0;
+
+  if (A->_numExons > 0) {
+    beginA = (int32)A->_exons[0]._genFrom;
+    endA   = (int32)A->_exons[A->_numExons-1]._genTo;
+  }
+
+  if (B->_numExons > 0) {
+    beginB = (int32)B->_exons[0]._genFrom;
+    endB   = (int32)B->_exons[B->_numExons-1]._genTo;
+  }
+
+  //  Overlap when either extent begins inside the other (ends inclusive).
+  bool  bStartsInA = (beginA <= beginB) && (beginB <= endA);
+  bool  aStartsInB = (beginB <= beginA) && (beginA <= endB);
+
+  return(bStartsInA || aStartsInB);
+}
+
+
+
+// Returns true if the two polishes have the same number of exons,
+// and each exon is mapped to about the same genomic region.
+//
+bool
+s4p_IsSameExonModel(sim4polish *A, sim4polish *B, int tolerance) {
+
+  //  Different exon counts can never be the same model.
+  if (A->_numExons != B->_numExons)
+    return(false);
+
+  //  Compare exons pairwise; the first pair whose genomic begin or end
+  //  differs by more than 'tolerance' decides the answer.
+  for (uint32 x=0; x<A->_numExons; x++) {
+    int32  shiftBegin = (int32)B->_exons[x]._genFrom - (int32)A->_exons[x]._genFrom;
+    int32  shiftEnd   = (int32)B->_exons[x]._genTo   - (int32)A->_exons[x]._genTo;
+
+    bool   beginOK = (shiftBegin >= -tolerance) && (shiftBegin <= tolerance);
+    bool   endOK   = (shiftEnd   >= -tolerance) && (shiftEnd   <= tolerance);
+
+    if (!(beginOK && endOK))
+      return(false);
+  }
+
+  return(true);
+}
+
+
+
+//  Compare the exons of A against the exons of B by genomic overlap.
+//
+//  Every (A exon, B exon) pair whose overlap fraction is at least
+//  'overlapThreshold' counts as a match.  The three outputs are optional
+//  (pass NULL to skip one):
+//    numSame     number of matching pairs
+//    numAMissed  number of A exons that matched nothing in B
+//    numBMissed  number of B exons that matched nothing in A
+//
+//  The overlap fraction of two exons is
+//
+//          ----------
+//                 ----------
+//                 ^^^               = 3   (shared)
+//          ^^^^^^^^^^^^^^^^^        = 17  (spanned)
+//
+//          overlap = 3/17
+//
+//  using coordinate differences (end - begin) for both lengths.  Exons
+//  that do not touch have overlap 0.
+//
+void
+s4p_compareExons_Overlap(sim4polish *A,
+                         sim4polish *B,
+                         double overlapThreshold,
+                         uint32 *numSame,
+                         uint32 *numAMissed,
+                         uint32 *numBMissed) {
+
+  if (numSame)     *numSame    = 0;
+  if (numAMissed)  *numAMissed = 0;
+  if (numBMissed)  *numBMissed = 0;
+
+  //  One allocation holds both hit counters.  operator new[] reports
+  //  failure by throwing, so there is no error code to test afterwards,
+  //  and errno is left untouched for the caller.
+  //
+  uint32  *foundA = new uint32 [A->_numExons + B->_numExons];
+  uint32  *foundB = foundA + A->_numExons;
+
+  for (uint32 i=0; i<A->_numExons + B->_numExons; i++)
+    foundA[i] = 0;
+
+  for (uint32 i=0; i<A->_numExons; i++) {
+    for (uint32 j=0; j<B->_numExons; j++) {
+      uint32  al = A->_exons[i]._genFrom;
+      uint32  ah = A->_exons[i]._genTo;
+      uint32  bl = B->_exons[j]._genFrom;
+      uint32  bh = B->_exons[j]._genTo;
+
+      //  Shared interval [lo,hi] and spanned interval [slo,shi].
+      uint32  lo  = (al < bl) ? bl : al;
+      uint32  hi  = (ah < bh) ? ah : bh;
+      uint32  slo = (al < bl) ? al : bl;
+      uint32  shi = (ah < bh) ? bh : ah;
+
+      double  overlap = 0;
+
+      if (lo <= hi) {
+        //  Two identical single-base exons span zero bases; dividing would
+        //  give 0/0, which never compares >= the threshold.  They overlap
+        //  completely.
+        if (shi == slo)
+          overlap = 1;
+        else
+          overlap = (double)(hi - lo) / (double)(shi - slo);
+      }
+
+      if (overlap >= overlapThreshold) {
+        foundA[i]++;
+        foundB[j]++;
+
+        if (numSame)
+          (*numSame)++;
+      }
+    }
+  }
+
+  if (numAMissed)
+    for (uint32 i=0; i<A->_numExons; i++)
+      if (foundA[i] == 0)
+        (*numAMissed)++;
+
+  if (numBMissed)
+    for (uint32 j=0; j<B->_numExons; j++)
+      if (foundB[j] == 0)
+        (*numBMissed)++;
+
+  delete [] foundA;
+}
+
+
+
+
+
+//  Compare the exons of A against the exons of B by their genomic end
+//  points.
+//
+//  An (A exon, B exon) pair matches when both the begin and the end differ
+//  by strictly less than 'tolerance'; a tolerance of zero therefore
+//  matches nothing.  The three outputs are optional (pass NULL to skip
+//  one):
+//    numSame     number of matching pairs
+//    numAMissed  number of A exons that matched nothing in B
+//    numBMissed  number of B exons that matched nothing in A
+//
+void
+s4p_compareExons_Ends(sim4polish *A,
+                      sim4polish *B,
+                      int32 tolerance,
+                      uint32 *numSame,
+                      uint32 *numAMissed,
+                      uint32 *numBMissed) {
+
+  if (numSame)     *numSame    = 0;
+  if (numAMissed)  *numAMissed = 0;
+  if (numBMissed)  *numBMissed = 0;
+
+  //  One allocation holds both hit counters.  operator new[] reports
+  //  failure by throwing, so errno is not consulted here: it was never
+  //  reset before the allocation, and a stale value left by any earlier
+  //  library call would have aborted the program.
+  //
+  uint32  *foundA = new uint32 [A->_numExons + B->_numExons];
+  uint32  *foundB = foundA + A->_numExons;
+
+  for (uint32 i=0; i<A->_numExons + B->_numExons; i++)
+    foundA[i] = 0;
+
+  //  If they have similar end points, declare a match
+  //
+  for (uint32 i=0; i<A->_numExons; i++) {
+    for (uint32 j=0; j<B->_numExons; j++) {
+      int32  Dlo = (int32)(B->_exons[j]._genFrom) - (int32)(A->_exons[i]._genFrom);
+      int32  Dhi = (int32)(B->_exons[j]._genTo)   - (int32)(A->_exons[i]._genTo);
+
+      if ((Dlo > -tolerance) && (Dlo < tolerance) &&
+          (Dhi > -tolerance) && (Dhi < tolerance)) {
+        foundA[i]++;
+        foundB[j]++;
+
+        if (numSame)
+          (*numSame)++;
+      }
+    }
+  }
+
+  if (numAMissed)
+    for (uint32 i=0; i<A->_numExons; i++)
+      if (foundA[i] == 0)
+        (*numAMissed)++;
+
+  if (numBMissed)
+    for (uint32 j=0; j<B->_numExons; j++)
+      if (foundB[j] == 0)
+        (*numBMissed)++;
+
+  delete [] foundA;
+}
diff --git a/libsim4/sim4polish/sim4polish-copy.C b/libsim4/sim4polish/sim4polish-copy.C
new file mode 100644
index 0000000..b4c5bf6
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-copy.C
@@ -0,0 +1,129 @@
+#include "sim4polish.H"
+#include "memory.h"
+
+#include <errno.h>
+#include <string.h>
+
+
+//  Make this exon a deep copy of 'orig': coordinates and scores are copied
+//  by value, and the two alignment strings (when present) are duplicated
+//  into freshly allocated arrays owned by this exon.  Any alignment
+//  strings this exon held before are released.
+//
+//  A NULL 'orig' or copying an exon onto itself is a no-op.
+//
+void
+sim4polishExon::s4p_copyExon(sim4polishExon *orig) {
+
+  //  Self-copy must be caught first: the code below releases our
+  //  alignment strings before reading them back from 'orig'.
+  if ((orig == 0L) || (orig == this))
+    return;
+
+  _estFrom           = orig->_estFrom;
+  _estTo             = orig->_estTo;
+  _genFrom           = orig->_genFrom;
+  _genTo             = orig->_genTo;
+  _numMatches        = orig->_numMatches;
+  _numMatchesN       = orig->_numMatchesN;
+  _percentIdentity   = orig->_percentIdentity;
+  _intronOrientation = orig->_intronOrientation;
+
+  delete [] _estAlignment;
+  delete [] _genAlignment;
+
+  _estAlignment = NULL;
+  _genAlignment = NULL;
+
+  if (orig->_estAlignment) {
+    size_t len = strlen(orig->_estAlignment) + 1;  //  +1 for the terminating NUL
+    _estAlignment = new char [len];
+    memcpy(_estAlignment, orig->_estAlignment, sizeof(char) * len);
+  }
+
+  if (orig->_genAlignment) {
+    size_t len = strlen(orig->_genAlignment) + 1;
+    _genAlignment = new char [len];
+    memcpy(_genAlignment, orig->_genAlignment, sizeof(char) * len);
+  }
+}
+
+
+//  Return a new[]-allocated copy of the C string 's', or NULL when 's' is
+//  NULL.  The caller owns the result.
+//
+static
+char *
+s4p_copyPolish_dupString(const char *s) {
+  if (s == 0L)
+    return(0L);
+
+  size_t  len = strlen(s) + 1;  //  +1 for the terminating NUL
+  char   *d   = new char [len];
+
+  memcpy(d, s, sizeof(char) * len);
+
+  return(d);
+}
+
+
+//  Make this polish a deep copy of 'orig'.
+//
+//  If 'exonNum' names an exon of 'orig' (exonNum < orig->_numExons), only
+//  that exon is copied and the match statistics are rebuilt from it;
+//  otherwise every exon is copied.  A NULL 'orig' is a no-op.
+//
+//  All copies are built before any of our old data is released, so
+//  'orig' may be this polish itself (for example, to reduce a polish to
+//  one of its own exons).
+//
+void
+sim4polish::s4p_copyPolish(sim4polish *orig, uint32 exonNum) {
+
+  if (orig == 0L)
+    return;
+
+  //  Build the deep copies first.
+
+  char  *newComment    = s4p_copyPolish_dupString(orig->_comment);
+  char  *newEstDefLine = s4p_copyPolish_dupString(orig->_estDefLine);
+  char  *newGenDefLine = s4p_copyPolish_dupString(orig->_genDefLine);
+
+  //  exonNum is unsigned, so this also implies orig has at least one exon.
+  bool             oneExon     = (exonNum < orig->_numExons);
+  uint32           newNumExons = 0;
+  sim4polishExon  *newExons    = NULL;
+
+  if (oneExon) {
+    newNumExons = 1;
+    newExons    = new sim4polishExon [newNumExons];
+
+    newExons[0].s4p_copyExon(orig->_exons + exonNum);
+  } else if (orig->_numExons > 0) {
+    newNumExons = orig->_numExons;
+    newExons    = new sim4polishExon [newNumExons];
+
+    for (uint32 i=0; i<newNumExons; i++)
+      newExons[i].s4p_copyExon(orig->_exons + i);
+  }
+
+  //  Plain values.
+
+  _estID             = orig->_estID;
+  _estLen            = orig->_estLen;
+  _estPolyA          = orig->_estPolyA;
+  _estPolyT          = orig->_estPolyT;
+
+  _genID             = orig->_genID;
+  _genRegionOffset   = orig->_genRegionOffset;
+  _genRegionLength   = orig->_genRegionLength;
+
+  _numMatches        = orig->_numMatches;
+  _numMatchesN       = orig->_numMatchesN;
+  _numCovered        = orig->_numCovered;
+  _percentIdentity   = orig->_percentIdentity;
+  _querySeqIdentity  = orig->_querySeqIdentity;
+  _matchOrientation  = orig->_matchOrientation;
+  _strandOrientation = orig->_strandOrientation;
+
+  //  Release what we held and install the copies.
+
+  delete [] _comment;
+  delete [] _estDefLine;
+  delete [] _genDefLine;
+  delete [] _exons;
+
+  _comment    = newComment;
+  _estDefLine = newEstDefLine;
+  _genDefLine = newGenDefLine;
+
+  _numExons   = newNumExons;
+  _exons      = newExons;
+
+  //  A single-exon copy describes only that exon, so rebuild the
+  //  statistics from it.
+
+  if (oneExon) {
+    _numMatches       = _exons[0]._numMatches;
+    _numMatchesN      = _exons[0]._numMatchesN;
+    _numCovered       = _exons[0]._estTo - _exons[0]._estFrom + 1;
+    _percentIdentity  = _exons[0]._percentIdentity;
+    _querySeqIdentity = s4p_percentCoverageApprox();
+  }
+}
diff --git a/libsim4/sim4polish/sim4polish-deleteexon.C b/libsim4/sim4polish/sim4polish-deleteexon.C
new file mode 100644
index 0000000..4e32a46
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-deleteexon.C
@@ -0,0 +1,112 @@
+#include "sim4polish.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+//  Remove exon 'a' from this polish and recompute the match statistics
+//  (_numMatches, _numMatchesN, _numCovered, _percentIdentity,
+//  _querySeqIdentity) from the exons that remain.
+//
+//  The intron orientation of the preceding exon and the strand orientation
+//  of the match are adjusted as described below.  An out-of-range 'a' is
+//  reported on stderr and leaves the polish untouched.
+//
+void
+sim4polish::s4p_deleteExon(uint32 a) {
+  char   *ed, *gd;
+  int     editDistance    = 0;
+  int     alignmentLength = 0;
+
+  if (a >= _numExons) {
+    fprintf(stderr, "s4p_deleteExon()-- exon " uint32FMT " does not exist (polish has " uint32FMT " exons); nothing deleted.\n",
+            a, _numExons);
+    return;
+  }
+
+  // Warn if we don't have alignments -- this is now done by the
+  // driver (e.g., cleanPolishes.C)
+  //
+#if 0
+  if ((p->exons[0]._estAlignment == 0L) || (p->exons[0]._genAlignment == 0L))
+    fprintf(stderr, "s4p_deleteExon()-- Need alignments to recompute scores correctly!\n");
+#endif
+
+  // Set the intron orientation for the exon before the one we are
+  // deleting:
+  //   If we are deleting the first exon, there is no previous exon
+  //   If we are deleting the last exon, set the previous to SIM4_INTRON_NONE
+  //   Otherwise, set the previous to SIM4_INTRON_GAP
+  //
+  if (_numExons > 1) {
+    if (a == _numExons - 1)
+      _exons[a-1]._intronOrientation = SIM4_INTRON_NONE;
+    else if (a > 0)
+      _exons[a-1]._intronOrientation = SIM4_INTRON_GAP;
+  }
+
+  // Update the match scores
+  //
+  _numMatches  -= _exons[a]._numMatches;
+  _numMatchesN -= _exons[a]._numMatchesN;
+
+  // Release the alignment strings of the exon we're removing.
+  // s4p_clearExon() is used elsewhere in this library to detach the
+  // strings from an exon so they are NOT deleted with it; calling it alone
+  // here would leave these strings with no owner.
+  //
+  delete [] _exons[a]._estAlignment;
+  delete [] _exons[a]._genAlignment;
+
+  _exons[a]._estAlignment = 0L;
+  _exons[a]._genAlignment = 0L;
+
+  // Erase the exon we're removing, but save a copy so we can stash it in the
+  // soon-to-be-emptied last location.
+  //
+  _exons[a].s4p_clearExon();
+
+  sim4polishExon d = _exons[a];
+
+  // Shift all the exons down by one, and decrement the number of
+  // exons present in the list.
+  //
+  for (uint32 i=a+1; i<_numExons; i++)
+    _exons[i-1] = _exons[i];
+
+  _numExons--;
+
+  // Stash the now deleted exon in the last spot, just to clear out the old contents.
+  //
+  _exons[_numExons] = d;
+
+  // The strand orientation becomes unknown if we delete internal
+  // exons, or we end up with only one exon.  (_numExons is already the
+  // new count here, so a < _numExons means 'a' was not the last exon.)
+  //
+  if (((0 < a) && (a < _numExons)) ||
+      (_numExons == 1))
+    _strandOrientation = SIM4_STRAND_UNKNOWN;
+
+
+  // Compute the alignment length and the number of edits.
+  //
+  alignmentLength = 0;
+  editDistance    = 0;
+
+  _numCovered     = 0;
+
+  for (uint32 i=0; i<_numExons; i++) {
+    ed = _exons[i]._estAlignment;
+    gd = _exons[i]._genAlignment;
+
+    if (ed && gd) {
+      alignmentLength += 2 * strlen(ed);
+      for (; *ed && *gd; ed++, gd++) {
+        if (*ed != *gd)
+          editDistance++;
+      }
+    } else {
+      // No alignment strings: approximate with twice the EST extent.
+      int len = _exons[i]._estTo - _exons[i]._estFrom + 1 + _exons[i]._estTo - _exons[i]._estFrom + 1;
+
+      alignmentLength += len;
+      editDistance    += len / 2 - _exons[i]._numMatches - _exons[i]._numMatchesN;
+    }
+
+    // NOTE(review): this sums genomic extents, while s4p_copyPolish()
+    // derives _numCovered from EST coordinates - confirm which is intended.
+    _numCovered += _exons[i]._genTo - _exons[i]._genFrom + 1;
+  }
+
+#if 0
+  fprintf(stdout, "Found (new)alignLen = %d\n", alignmentLength);
+  fprintf(stdout, "Found (new)editDist = %d\n", editDistance);
+#endif
+
+  // Fix the scores for the match. Special case; if there is only
+  // one exon left, the score for the exon is the score for the
+  // match.
+  //
+  if (_numExons == 1)
+    _percentIdentity = _exons[0]._percentIdentity;
+  else
+    _percentIdentity = s4p_percentIdentityApprox(editDistance, alignmentLength);
+
+  // Update the query sequence identity
+  //
+  _querySeqIdentity = s4p_percentCoverageApprox();
+}
+
diff --git a/libsim4/sim4polish/sim4polish-exons.C b/libsim4/sim4polish/sim4polish-exons.C
new file mode 100644
index 0000000..53013c3
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-exons.C
@@ -0,0 +1,67 @@
+#include "sim4polish.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include "memory.h"
+
+//  Exchange exons 'a' and 'b' of this polish.  Both indices must be valid.
+//
+void
+sim4polish::s4p_swapExons(uint32 a, uint32 b) {
+
+  if (a == b)
+    return;
+
+  sim4polishExon copyofa = _exons[a];
+
+  _exons[a] = _exons[b];
+  _exons[b] = copyofa;
+
+  //  The temporary still points at the alignment strings that _exons[b]
+  //  now owns.  Detach them so they are not deleted together with the
+  //  temporary when it goes out of scope (the same reason
+  //  s4p_insertExons() clears the exons it has moved).
+  //
+  copyofa.s4p_clearExon();
+}
+
+
+// Insert a single exon into the list at position a
+//  Insert a copy of the single exon 'e' at position 'a'; 'intronori'
+//  becomes the intron orientation of the exon before it.  The caller keeps
+//  ownership of 'e'.  A NULL 'e' is a no-op.
+//
+void
+sim4polish::s4p_insertExon(uint32 a, uint32 intronori, sim4polishExon *e) {
+
+  if (e == 0L)
+    return;
+
+  //  Wrap the exon in a temporary polish so the multi-exon routine can do
+  //  the work.
+  sim4polish p;
+
+  p._numExons = 1;
+  p._exons    = e;
+
+  s4p_insertExons(a, intronori, &p);
+
+  //  The temporary only borrowed 'e'.  Detach it before the temporary goes
+  //  out of scope; elsewhere in this library a polish releases _exons with
+  //  delete [], which must not happen to the caller's exon.
+  //
+  p._numExons = 0;
+  p._exons    = 0L;
+}
+
+
+
+// Inserts all the exons in e into the list at position a.
+//  Insert copies of all the exons of 'e' into this polish, starting at
+//  position 'a'.  Existing exons at 'a' and beyond move up.  A position
+//  past the end appends.  'intronori' becomes the intron orientation of
+//  the exon just before the insertion point, if there is one.  The caller
+//  keeps ownership of 'e'.  A NULL 'e' is a no-op.
+//
+void
+sim4polish::s4p_insertExons(uint32 a, uint32 intronori, sim4polish *e) {
+
+  if (e == 0L)
+    return;
+
+  //  Anything past the end would index outside both arrays below.
+  if (a > _numExons)
+    a = _numExons;
+
+  uint32           numNew = e->_numExons;
+  sim4polishExon  *ne     = new sim4polishExon [_numExons + numNew];
+
+  // Move exons up to the insert point.  The alignment strings now belong
+  // to ne[i]; clearing the old exon detaches them from it.
+
+  for (uint32 i=0; i<a; i++) {
+    ne[i] = _exons[i];
+    _exons[i].s4p_clearExon();
+  }
+
+  // Insert the new ones. We don't own them, so can't assume anything about the alignment strings.
+
+  for (uint32 i=0; i<numNew; i++)
+    ne[a+i].s4p_copyExon(e->_exons+i);
+
+  // Move the rest.
+
+  for (uint32 i=a; i<_numExons; i++) {
+    ne[i+numNew] = _exons[i];
+    _exons[i].s4p_clearExon();
+  }
+
+  // All done with the copy, get rid of the old stuff. s4p_clearExon() above is critical here;
+  // without it we would delete the alignment strings.
+
+  delete [] _exons;
+  _exons = ne;
+
+  _numExons += numNew;
+
+  // We trust that the user has set the intron orientation in the new exon, and that 'intronori' is
+  // the correct orientation for the previous intron.
+  //
+  if (a > 0)
+    _exons[a-1]._intronOrientation = intronori;
+}
diff --git a/libsim4/sim4polish/sim4polish-polishtostring.C b/libsim4/sim4polish/sim4polish-polishtostring.C
new file mode 100644
index 0000000..1ebec63
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-polishtostring.C
@@ -0,0 +1,403 @@
+#include "sim4polish.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <assert.h>
+
+//#define DEBUG_CIGAR
+
+// Text used for the match orientation in the sim4db-format header line;
+// mOriDEF is the fallback for an unrecognized value.
+const char *mOriFWD = "forward";
+const char *mOriCMP = "complement";
+const char *mOriERR = "error";
+const char *mOriDEF = "UNKNOWN";
+
+// Text used for the strand orientation in the sim4db-format header line;
+// sOriDEF is the fallback for an unrecognized value.
+const char *sOriFWD = "forward";
+const char *sOriREV = "reverse";
+const char *sOriUNK = "unknown";
+const char *sOriINT = "intractable";
+const char *sOriABT = "aborted";
+const char *sOriERR = "error";
+const char *sOriDEF = "UNKNOWN";
+
+// Intron orientation markers appended to each exon line.  All but iOriNOO
+// start with a blank, which the GFF3 writer skips by printing from the
+// second character.
+const char *iOriPOS = " ->";
+const char *iOriNEG = " <-";
+const char *iOriAMB = " --";
+const char *iOriGAP = " ==";
+const char *iOriERR = " ??";
+const char *iOriNOO = "";
+
+
+// Output-style state: a default style initialized to the sim4db format, a
+// flag that starts out false, and a counter the GFF3 writer uses to number
+// the polishes it emits.
+bool sim4polishStyleSet = false;
+sim4polishStyle sim4polishStyleDefault = sim4polishS4DB;
+uint32 sim4polishPolishID = 0;
+
+
+//  Encode a pairwise alignment as a run-length string such as
+//  "M74 D1 M2 I1 M170".
+//
+//  'ref' and 'tgt' are the two rows of the alignment, '-' marking a gap,
+//  and are expected to have the same length.  For each column:
+//    I  gap in ref, base in tgt
+//    D  base in ref, gap in tgt
+//    M  base in both (match or mismatch)
+//  A column with a gap in both rows is invalid: it trips an assert, and is
+//  skipped when asserts are compiled out.
+//
+//  Returns a new[]-allocated, NUL-terminated string the caller must
+//  delete [], or NULL if either input is NULL.  An empty alignment gives
+//  an empty string.
+//
+char *
+encodeGap(char *ref, char *tgt) {
+
+  if ((ref == 0L) || (tgt == 0L))
+    return(0L);
+
+  uint32 lenref = strlen(ref);
+  uint32 lentgt = strlen(tgt);
+
+  assert(lenref == lentgt);
+
+  //  Never read past the shorter row, even when asserts are compiled out.
+  uint32 len = (lenref < lentgt) ? lenref : lentgt;
+
+  //  A run of k columns prints as letter + count + blank, at most 3k
+  //  characters; the extra byte holds the NUL, including for len == 0.
+  char *gap = new char [3 * len + 1];
+  char *gpp = gap;
+
+  *gpp = 0;
+
+  char   gaptyp = 0;   //  type of the run being counted, 0 before the first column
+  uint32 gapcnt = 0;   //  columns in that run so far
+
+  for (uint32 i=0; i<len; i++) {
+    bool  refGap = (ref[i] == '-');
+    bool  tgtGap = (tgt[i] == '-');
+    char  typ    = 'M';
+
+    if (refGap && tgtGap) {
+      assert(0);
+      continue;
+    }
+
+    if (refGap)  typ = 'I';
+    if (tgtGap)  typ = 'D';
+
+    //  A change of type closes the current run and starts a new one.
+    if (typ != gaptyp) {
+      if (gaptyp != 0)
+        gpp += sprintf(gpp, "%c" uint32FMT " ", gaptyp, gapcnt);
+
+      gaptyp = typ;
+      gapcnt = 0;
+    }
+
+    gapcnt++;
+  }
+
+  //  The last run is written without a trailing blank.
+  if (gaptyp != 0)
+    gpp += sprintf(gpp, "%c" uint32FMT, gaptyp, gapcnt);
+
+#ifdef DEBUG_CIGAR
+  fprintf(stderr, "REF=%s\n", ref);
+  fprintf(stderr, "TGT=%s\n", tgt);
+  fprintf(stderr, "GAP=%s\n", gap);
+  fprintf(stderr, "---\n");
+#endif
+
+  return(gap);
+}
+
+
+
+
+
+//  Render this polish as text in the requested style by handing off to the
+//  style-specific writer.  A polish with no exons yields NULL; an
+//  unrecognized style is reported on stderr and ends the program.
+//
+char *
+sim4polish::s4p_polishToString(sim4polishStyle style) {
+
+  if (_numExons == 0)
+    return(NULL);
+
+  if (style == sim4polishS4DB)
+    return(s4p_polishToStringS4DB());
+
+  if (style == sim4polishGFF3)
+    return(s4p_polishToStringGFF3());
+
+  if (style == sim4polishATAC)
+    return(s4p_polishToStringATAC());
+
+  fprintf(stderr, "s4p_polishToString()-- unknown style='%d'\n",
+          style);
+  exit(1);
+
+  return(NULL);
+}
+
+
+
+
+//  Render this polish in the sim4db text format:
+//
+//    sim4begin
+//    <header line with ids, lengths, scores and orientations>
+//    comment=...   (if present)
+//    edef=...      (if present)
+//    ddef=...      (if present)
+//    <one line per exon>
+//    <est and genomic alignment line per exon, if present>
+//    sim4end
+//
+//  Returns a new[]-allocated string that the caller must delete [].
+//
+char *
+sim4polish::s4p_polishToStringS4DB(void) {
+  const char *mOri = mOriDEF;
+  const char *sOri = sOriDEF;
+  const char *iOri = iOriERR;
+
+  // Make a decent estimate of how much space we'll need to store the string.
+  // The fixed part covers the header and the keywords; 128 per exon covers
+  // the exon line and the newlines after its alignment lines.
+  //
+  uint32 spaceNeeded = (1024 + 128 * _numExons +
+                        ((_comment)    ? strlen(_comment)    : 0) +
+                        ((_estDefLine) ? strlen(_estDefLine) : 0) +
+                        ((_genDefLine) ? strlen(_genDefLine) : 0));
+
+  // Count each alignment string that will actually be written, rather than
+  // assuming the genomic one is as long as the EST one.
+  //
+  for (uint32 i=0; i<_numExons; i++) {
+    if (_exons[i]._estAlignment)
+      spaceNeeded += strlen(_exons[i]._estAlignment);
+    if (_exons[i]._genAlignment)
+      spaceNeeded += strlen(_exons[i]._genAlignment);
+  }
+
+  char *outs = new char [spaceNeeded];
+  char *outc = outs;
+
+  switch (_matchOrientation) {
+    case SIM4_MATCH_FORWARD:     mOri = mOriFWD;  break;
+    case SIM4_MATCH_COMPLEMENT:  mOri = mOriCMP;  break;
+    case SIM4_MATCH_ERROR:       mOri = mOriERR;  break;
+    default:
+      fprintf(stderr, "sim4reader: Unknown matchOrientation '" uint32FMT "' in printPolish()\n", _matchOrientation);
+      mOri = mOriDEF;
+      break;
+  }
+
+  switch (_strandOrientation) {
+    case SIM4_STRAND_POSITIVE:     sOri = sOriFWD;  break;
+    case SIM4_STRAND_NEGATIVE:     sOri = sOriREV;  break;
+    case SIM4_STRAND_UNKNOWN:      sOri = sOriUNK;  break;
+    case SIM4_STRAND_INTRACTABLE:  sOri = sOriINT;  break;
+    case SIM4_STRAND_FAILED:       sOri = sOriABT;  break;
+    case SIM4_STRAND_ERROR:        sOri = sOriERR;  break;
+    default:
+      // Report the value that was actually unrecognized.
+      fprintf(stderr, "sim4reader: Unknown strandOrientation '" uint32FMT "' in printPolish()\n", _strandOrientation);
+      sOri = sOriDEF;
+      break;
+  }
+
+  sprintf(outc, "sim4begin\n" uint32FMT "[" uint32FMT "-" uint32FMT "-" uint32FMT "] " uint32FMT "[" uint32FMT "-" uint32FMT "] <" uint32FMT "-" uint32FMT "-" uint32FMT "-%s-%s>\n",
+          _estID, _estLen, _estPolyA, _estPolyT,
+          _genID, _genRegionOffset, _genRegionLength,
+          _numMatches, _numMatchesN, _percentIdentity, mOri, sOri);
+  while (*outc) outc++;
+
+  if (_comment) {
+    sprintf(outc, "comment=%s\n", _comment);
+    while (*outc) outc++;
+  }
+
+  if (_estDefLine) {
+    sprintf(outc, "edef=%s\n", _estDefLine);
+    while (*outc) outc++;
+  }
+
+  if (_genDefLine) {
+    sprintf(outc, "ddef=%s\n", _genDefLine);
+    while (*outc) outc++;
+  }
+
+  for (uint32 i=0; i<_numExons; i++) {
+    switch (_exons[i]._intronOrientation) {
+      case SIM4_INTRON_POSITIVE:   iOri = iOriPOS;  break;
+      case SIM4_INTRON_NEGATIVE:   iOri = iOriNEG;  break;
+      case SIM4_INTRON_AMBIGUOUS:  iOri = iOriAMB;  break;
+      case SIM4_INTRON_GAP:        iOri = iOriGAP;  break;
+      case SIM4_INTRON_ERROR:      iOri = iOriERR;  break;
+      default:                     iOri = iOriNOO;  break;
+    }
+
+    sprintf(outc, "" uint32FMT "-" uint32FMT " (" uint32FMT "-" uint32FMT ") <" uint32FMT "-" uint32FMT "-" uint32FMT ">%s\n",
+            _exons[i]._estFrom, _exons[i]._estTo,
+            _exons[i]._genFrom, _exons[i]._genTo,
+            _exons[i]._numMatches, _exons[i]._numMatchesN, _exons[i]._percentIdentity, iOri);
+
+    while (*outc) outc++;
+  }
+
+  for (uint32 i=0; i<_numExons; i++) {
+    if (_exons[i]._estAlignment) {
+      strcpy(outc, _exons[i]._estAlignment);
+      while (*outc) outc++;
+      *outc++ = '\n';
+    }
+    if (_exons[i]._genAlignment) {
+      strcpy(outc, _exons[i]._genAlignment);
+      while (*outc) outc++;
+      *outc++ = '\n';
+    }
+  }
+
+  strcpy(outc, "sim4end\n");
+
+  return(outs);
+}
+
+//  Render this polish as GFF3: one "mRNA" line for the whole match
+//  followed by one "exon" line per exon.  Sequence names are the deflines
+//  up to the first whitespace (empty if a defline is missing).  Each call
+//  uses, then increments, the global sim4polishPolishID for the ID= and
+//  Parent= attributes.
+//
+//  Returns a new[]-allocated string that the caller must delete [].
+//
+char *
+sim4polish::s4p_polishToStringGFF3(void) {
+
+  // 9 columns, tab separated
+  // tab, newline, cr and control MUST be escaped
+  // reserved letters: ; = % & ,
+  // spaces ARE ALLOWED in fields
+  // undefined values should use '.'
+  //
+  // 1 seqid, genome name (a-zA-Z0-9.:^*$@!+_?-|), no whitespace (??) and not begin with >
+  // 2 source ("sim4db")
+  // 3 type ("mRNA" or "exon")
+  // 4 begin, 1-based
+  // 5 end, zero-length start=end, to the right of this base
+  // 6 score (percent identity)
+  // 7 strand
+  // 8 phase
+  // 9 attributes
+  //     ID (unique within scope of file)
+  //     Name (display name)
+  //     Parent ()
+  //     Target
+  //     Gap
+  //     Derives_from
+  //     Note
+  //     Dbxref
+  //     Ontology_term
+  //     Is_circular
+  //     others, user-defined (lowercase first letter; see below)
+  //
+  // Example:
+  // 0:arm_2L sim4db mRNA 2372455 2373234 98 - . ID=sim4db0;Name=61728:gb|CA807305;Target=61728:gb|CA807305 22 685 +;targetLen=685;pA=0;pT=21;genRegion=2370482-2375223
+  // 0:arm_2L sim4db exon 2372455 2372770 99 - . Parent=sim4db0;Target=61728:gb|CA807305 22 337 +;Gap=M316;nMatches=313;intron=<-
+  // 0:arm_2L sim4db exon 2372830 2373076 96 - . Parent=sim4db0;Target=61728:gb|CA807305 338 584 +;Gap=M74 D1 M2 I1 M170;nMatches=238;intron=<-
+  // 0:arm_2L sim4db exon 2373134 2373234 99 - . Parent=sim4db0;Target=61728:gb|CA807305 585 685 +;Gap=M101;nMatches=100
+  //
+
+  // Sequence names: the deflines up to the first whitespace.  They are
+  // printed with a length limit ("%.*s") so the deflines themselves are
+  // never modified, and a missing defline prints as an empty name.
+  //
+  const char *estName = (_estDefLine) ? _estDefLine : "";
+  const char *genName = (_genDefLine) ? _genDefLine : "";
+
+  uint32 estNameLen = 0;
+  uint32 genNameLen = 0;
+
+  while ((estName[estNameLen]) && (isspace((unsigned char)estName[estNameLen]) == 0))
+    estNameLen++;
+  while ((genName[genNameLen]) && (isspace((unsigned char)genName[genNameLen]) == 0))
+    genNameLen++;
+
+  // Space for the output.  The mRNA line carries the genomic name once and
+  // the EST name twice.  Every exon line carries each name once more, plus
+  // a Gap attribute that can be up to three characters per alignment
+  // column, plus about 160 characters of numbers and keywords.
+  //
+  size_t spaceNeeded = 1024 + genNameLen + 2 * estNameLen;
+
+  for (uint32 i=0; i<_numExons; i++) {
+    spaceNeeded += 256 + genNameLen + estNameLen;
+
+    if ((_exons[i]._genAlignment) && (_exons[i]._estAlignment))
+      spaceNeeded += 3 * strlen(_exons[i]._genAlignment) + 1;
+  }
+
+  char *outs = new char [spaceNeeded];
+  char *outc = outs;
+
+  // Find extents of this match.
+  uint32 estbgn = _exons[0]._estFrom;
+  uint32 estend = _exons[_numExons-1]._estTo;
+  uint32 genbgn = _exons[0]._genFrom;
+  uint32 genend = _exons[_numExons-1]._genTo;
+
+  for (uint32 i=0; i<_numExons; i++) {
+    if (_exons[i]._genFrom < genbgn)  genbgn = _exons[i]._genFrom;
+    if (_exons[i]._genTo   < genbgn)  genbgn = _exons[i]._genTo;
+    if (genend < _exons[i]._genFrom)  genend = _exons[i]._genFrom;
+    if (genend < _exons[i]._genTo)    genend = _exons[i]._genTo;
+
+    if (_exons[i]._estFrom < estbgn)  estbgn = _exons[i]._estFrom;
+    if (_exons[i]._estTo   < estbgn)  estbgn = _exons[i]._estTo;
+    if (estend < _exons[i]._estFrom)  estend = _exons[i]._estFrom;
+    if (estend < _exons[i]._estTo)    estend = _exons[i]._estTo;
+  }
+
+  // Find the orientation
+  char mOri = '?';
+
+  if (_matchOrientation == SIM4_MATCH_FORWARD)     mOri = '+';
+  if (_matchOrientation == SIM4_MATCH_COMPLEMENT)  mOri = '-';
+
+  // Find the strand
+  char sOri = '?';
+
+  switch (_strandOrientation) {
+    case SIM4_STRAND_POSITIVE:     sOri = '+';  break;
+    case SIM4_STRAND_NEGATIVE:     sOri = '-';  break;
+    case SIM4_STRAND_UNKNOWN:
+    case SIM4_STRAND_INTRACTABLE:
+    case SIM4_STRAND_FAILED:
+    case SIM4_STRAND_ERROR:        sOri = '.';  break;
+    default:
+      // Report the value that was actually unrecognized.
+      fprintf(stderr, "sim4reader: Unknown strandOrientation '" uint32FMT "' in printPolishGFF3()\n", _strandOrientation);
+      sOri = '.';
+      break;
+  }
+
+  // The main mRNA match line.
+
+  sprintf(outc, uint32FMT ":%.*s\tsim4db\tmRNA\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t%c\t.\t",
+          _genID, (int)genNameLen, genName, genbgn, genend, _percentIdentity, sOri);
+  while (*outc) outc++;
+
+  sprintf(outc, "ID=sim4db" uint32FMT ";Name=" uint32FMT ":%.*s;Target=" uint32FMT ":%.*s " uint32FMT " " uint32FMT " %c;",
+          sim4polishPolishID, _estID, (int)estNameLen, estName, _estID, (int)estNameLen, estName, estbgn, estend, mOri);
+  while (*outc) outc++;
+
+  sprintf(outc, "targetLen=" uint32FMT ";pA=" uint32FMT ";pT=" uint32FMT ";genRegion=" uint32FMT "-" uint32FMT "\n",
+          _estLen, _estPolyA, _estPolyT, _genRegionOffset, _genRegionOffset + _genRegionLength -1);
+  while (*outc) outc++;
+
+  // Exons.
+
+  for (uint32 i=0; i<_numExons; i++) {
+    // NULL when either alignment string is missing.
+    char *gap = encodeGap(_exons[i]._genAlignment, _exons[i]._estAlignment);
+
+    sprintf(outc, uint32FMT ":%.*s\tsim4db\texon\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t%c\t.\t",
+            _genID, (int)genNameLen, genName, _exons[i]._genFrom, _exons[i]._genTo, _exons[i]._percentIdentity, sOri);
+    while (*outc) outc++;
+
+    if (gap)
+      sprintf(outc, "Parent=sim4db" uint32FMT ";Target=" uint32FMT ":%.*s " uint32FMT " " uint32FMT " %c;Gap=%s;nMatches=" uint32FMT "",
+              sim4polishPolishID, _estID, (int)estNameLen, estName, _exons[i]._estFrom, _exons[i]._estTo, mOri, gap, _exons[i]._numMatches);
+    else
+      sprintf(outc, "Parent=sim4db" uint32FMT ";Target=" uint32FMT ":%.*s " uint32FMT " " uint32FMT " %c;nMatches=" uint32FMT "",
+              sim4polishPolishID, _estID, (int)estNameLen, estName, _exons[i]._estFrom, _exons[i]._estTo, mOri, _exons[i]._numMatches);
+    while (*outc) outc++;
+
+    delete [] gap;
+
+    switch (_exons[i]._intronOrientation) {
+      // +1 to exclude the front blank space
+      case SIM4_INTRON_POSITIVE:   sprintf(outc, ";intron=%s\n", iOriPOS +1);  break;
+      case SIM4_INTRON_NEGATIVE:   sprintf(outc, ";intron=%s\n", iOriNEG +1);  break;
+      case SIM4_INTRON_AMBIGUOUS:  sprintf(outc, ";intron=%s\n", iOriAMB +1);  break;
+      case SIM4_INTRON_GAP:        sprintf(outc, ";intron=%s\n", iOriGAP +1);  break;
+      case SIM4_INTRON_ERROR:      sprintf(outc, ";intron=%s\n", iOriERR +1);  break;
+      default:                     sprintf(outc, "\n");                        break;
+    }
+
+    while (*outc) outc++;
+  }
+
+  sim4polishPolishID++;
+
+  return(outs);
+}
+
+//  No ATAC text is produced; the result is always a NULL pointer.
+//
+char *
+sim4polish::s4p_polishToStringATAC(void) {
+  char *nothing = 0L;
+
+  return(nothing);
+}
+
+
diff --git a/libsim4/sim4polish/sim4polish-read.C b/libsim4/sim4polish/sim4polish-read.C
new file mode 100644
index 0000000..0afb850
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-read.C
@@ -0,0 +1,181 @@
+#include "sim4polish.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include <assert.h>
+
+
+// Read one sim4db-format record from rb and load it into this polish.
+//
+// Whatever the polish held before is released first.  Lines that precede
+// 'sim4begin' are reported on stderr and skipped; every line from 'sim4begin'
+// up to and including 'sim4end' (or up to end of input) is collected and handed
+// to s4p_linesToPolishS4DB().  A record with more lines than the fixed-size
+// line table can hold is a fatal error.
+//
+void
+sim4polish::s4p_readPolishS4DB(readBuffer *rb) {
+
+  // Clear this polish.
+
+  _numExons = 0;
+
+  delete [] _comment;     _comment    = 0L;
+  delete [] _estDefLine;  _estDefLine = 0L;
+  delete [] _genDefLine;  _genDefLine = 0L;
+  delete [] _exons;       _exons      = 0L;
+
+  // Read it.
+
+  uint64   startPosition = rb->tell();
+
+  uint64   thisLineMax = 1048576;
+  uint64   thisLineLen = 0;
+  char    *thisLine    = new char [thisLineMax];
+
+  uint32   numLines = 10240;
+  uint32   curLine  = 0;
+
+  char   **lines   = new char * [numLines + 1];
+  uint32  *lengths = new uint32 [numLines + 1];
+
+  // Clear the whole table, including the extra slot past numLines, so that the
+  // entry following the last stored line is always a null pointer.
+  memset(lines,   0, sizeof(char *) * (numLines + 1));
+  memset(lengths, 0, sizeof(uint32) * (numLines + 1));
+
+  thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+  chompL(thisLine, thisLineLen);
+
+  // Skip (noisily) anything that is not the start of a record.
+  while (!rb->eof() && strcmp(thisLine, "sim4begin")) {
+    fprintf(stderr, "sim4reader: Got '%s', expecting 'sim4begin' at byte " uint64FMT "\n",
+            thisLine, startPosition);
+    thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+    chompL(thisLine, thisLineLen);
+  }
+
+  // Stash the 'sim4begin' line into the lines array.
+  lines[curLine]   = new char [thisLineLen + 1];
+  lengths[curLine] = thisLineLen;
+  memcpy(lines[curLine++], thisLine, sizeof(char) * (thisLineLen + 1));
+
+  // Until we hit 'sim4end' stash lines into lines.  Yes, we test the previous line, then read
+  // the next.  At the end of the loop, we'll read 'sim4end', stash it in lines[], then test.
+
+  while (!rb->eof() && strcmp(thisLine, "sim4end")) {
+    thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+    chompL(thisLine, thisLineLen);
+
+    if (curLine >= numLines) {
+#warning LAZY PROGRAMMER did not extend an array
+      fprintf(stderr, "ERROR: too many lines, lazy programmer.\n");
+      exit(1);
+    }
+
+    // Stash the line in the lines array.
+    lines[curLine]   = new char [thisLineLen + 1];
+    lengths[curLine] = thisLineLen;
+    memcpy(lines[curLine++], thisLine, sizeof(char) * (thisLineLen + 1));
+  }
+
+  delete [] thisLine;
+
+  // Parse only if something was actually collected (same guard as the GFF3 reader).
+  if (curLine > 0)
+    s4p_linesToPolishS4DB(startPosition, numLines, lines, lengths);
+
+  for (uint32 i=0; i<curLine; i++)
+    delete [] lines[i];
+
+  delete [] lines;
+  delete [] lengths;
+}
+
+
+
+// Read one GFF3 record (an mRNA line plus its exon lines) from rb and load it
+// into this polish.
+//
+// Whatever the polish held before is released first.  Comment lines ('#') are
+// dropped; other lines that precede the mRNA line are reported on stderr and
+// skipped.  Exon lines are collected until one without an 'intron=' attribute
+// has been stored, then everything is handed to s4p_linesToPolishGFF3().  A
+// record with more lines than the fixed-size line table can hold is a fatal
+// error.
+//
+void
+sim4polish::s4p_readPolishGFF3(readBuffer *rb) {
+
+  // Clear this polish.
+
+  _numExons = 0;
+
+  delete [] _comment;     _comment    = 0L;
+  delete [] _estDefLine;  _estDefLine = 0L;
+  delete [] _genDefLine;  _genDefLine = 0L;
+  delete [] _exons;       _exons      = 0L;
+
+  // Read it.
+
+  uint64   startPosition = rb->tell();
+
+  uint64   thisLineMax = 1048576;
+  uint64   thisLineLen = 0;
+  char    *thisLine    = new char [thisLineMax];
+
+  uint32   numLines = 10240;
+  uint32   curLine  = 0;
+
+  bool     firstLine = true;
+
+  char   **lines   = new char * [numLines + 1];
+  uint32  *lengths = new uint32 [numLines + 1];
+
+  // Clear the whole table, including the extra slot past numLines.  The GFF3 parser walks
+  // lines[] until it meets a null pointer, so the slot after the last stored line must be
+  // null even when the table is completely full.
+  memset(lines,   0, sizeof(char *) * (numLines + 1));
+  memset(lengths, 0, sizeof(uint32) * (numLines + 1));
+
+  thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+  chompL(thisLine, thisLineLen);
+
+  // Advance to the mRNA line; comments are skipped silently, anything else is reported.
+  while (!rb->eof() && (!strstr(thisLine, "\tsim4db\tmRNA") || (thisLine[0]=='#'))) {
+    if (thisLine[0]!='#')
+      fprintf(stderr, "sim4reader: Got '%s', expecting GFF3 mRNA line at byte " uint64FMT "\n",
+              thisLine, startPosition);
+    thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+    chompL(thisLine, thisLineLen);
+  }
+
+  // Stash the mRNA line into the lines array.
+  lines[curLine]   = new char [thisLineLen + 1];
+  lengths[curLine] = thisLineLen;
+  memcpy(lines[curLine++], thisLine, sizeof(char) * (thisLineLen + 1));
+
+  // Read the GFF3 record, till the next mRNA line.
+  // We expect 'intron' on each exon line but the last; until we hit an intron-less line,
+  // stash lines into lines.  Yes, we test the previous line, then read the next.
+  // At the end of the loop, we'll read the intron-less line, stash it in lines[], then test.
+
+  while (!rb->eof() && (firstLine || strstr(thisLine, "\tsim4db\texon\t"))) {
+
+    if ((firstLine == false) && !strstr(thisLine, "intron=")) break;
+
+    thisLineLen = rb->read(thisLine, thisLineMax, '\n');
+    chompL(thisLine, thisLineLen);
+
+    if (curLine >= numLines) {
+#warning LAZY PROGRAMMER did not extend an array
+      fprintf(stderr, "ERROR: too many lines, lazy programmer.\n");
+      exit(1);
+    }
+
+    // If not a comment, stash the line in the lines array
+    if (thisLine[0] == '#') continue;
+
+    lines[curLine]   = new char [thisLineLen + 1];
+    lengths[curLine] = thisLineLen;
+    memcpy(lines[curLine++], thisLine, sizeof(char) * (thisLineLen + 1));
+
+    firstLine = false;
+  }
+
+  delete [] thisLine;
+
+  if (curLine > 0)
+    s4p_linesToPolishGFF3(startPosition, numLines, lines, lengths);
+
+  for (uint32 i=0; i<curLine; i++)
+    delete [] lines[i];
+
+  delete [] lines;
+  delete [] lengths;
+}
+
+
+
+// Reader for ATAC-format records.  Not implemented: the body is empty, so rb is
+// not consumed and the polish is left exactly as it was.
+void
+sim4polish::s4p_readPolishATAC(readBuffer *rb) {
+}
diff --git a/libsim4/sim4polish/sim4polish-stringtopolish.C b/libsim4/sim4polish/sim4polish-stringtopolish.C
new file mode 100644
index 0000000..157011a
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-stringtopolish.C
@@ -0,0 +1,444 @@
+#include "sim4polish.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+
+#include <assert.h>
+
+// Parse one sim4db record, already split into lines, into this polish.
+//
+//   startPosition -- byte offset of the record, used only in diagnostics
+//   maxLines      -- capacity of lines[]; parsing never looks past it
+//   lines         -- the record: 'sim4begin', the description line, optional
+//                    'comment=', 'edef=' and 'ddef=' lines, one line per exon,
+//                    optionally two alignment lines per exon, then 'sim4end'.
+//                    A null entry marks the end of the stored lines.
+//   lengths       -- length of each line
+//
+// The polish must be empty on entry (asserted).  Malformed input is reported on
+// stderr; a record that is cut short is parsed as far as it goes instead of
+// being read past its end.  The description line in lines[1] is modified in
+// place ('-' becomes ' ').
+//
+void
+sim4polish::s4p_linesToPolishS4DB(uint32   startPosition,
+                                  uint32   maxLines,
+                                  char   **lines,
+                                  uint32  *lengths) {
+
+  // Zero-filled so that a failed or partial scan leaves empty strings behind, and bounded
+  // to 64 characters by the scanf format below.
+  char  mOri[65] = {0};
+  char  sOri[65] = {0};
+
+  assert(_comment    == 0L);
+  assert(_estDefLine == 0L);
+  assert(_genDefLine == 0L);
+  assert(_exons      == 0L);
+  assert(_numExons   == 0);
+
+  // Count the lines that are really there; every access below is checked against this.
+  uint32  numLines = 0;
+  while ((numLines < maxLines) && (lines[numLines] != 0L))
+    numLines++;
+
+  if ((numLines == 0) || strcmp(lines[0], "sim4begin")) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- Invalid sim4db format, got '%s' instead of sim4begin. Cannot convert.\n",
+            (numLines > 0) ? lines[0] : "");
+    return;
+  }
+
+  uint32  cl = 1;
+
+  if (cl >= numLines) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- byte " uint32FMT ": record ends right after 'sim4begin'.\n", startPosition);
+    return;
+  }
+
+  // Convert '-' into ' ', on the assumption that this is the description line.  This allows us
+  // to use scanf properly.
+  //
+  for (uint32 i=0; i<lengths[cl]; i++)
+    if (lines[cl][i] == '-')
+      lines[cl][i] = ' ';
+
+  int r = sscanf(lines[cl],
+                 uint32FMT "[" uint32FMT " " uint32FMT " " uint32FMT "] "
+                 uint32FMT "[" uint32FMT " " uint32FMT "] "
+                 "<" uint32FMT " " uint32FMT " " uint32FMT " %64s %64s>",
+                 &_estID,
+                 &_estLen,
+                 &_estPolyA,
+                 &_estPolyT,
+                 &_genID,
+                 &_genRegionOffset,
+                 &_genRegionLength,
+                 &_numMatches,
+                 &_numMatchesN,
+                 &_percentIdentity,
+                 mOri, sOri);
+  if (r != 12) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- Expecting description line, found %d tokens instead of 12.\n", r);
+  }
+
+  // Match orientation is decided by the first letter of the word.
+  switch (mOri[0]) {
+    case 'f':
+      _matchOrientation = SIM4_MATCH_FORWARD;
+      break;
+    case 'c':
+      _matchOrientation = SIM4_MATCH_COMPLEMENT;
+      break;
+    case 'r':
+      // BUG FIX -- old version of sim4 used "reverse-intractable"
+      // instead of "complement-intractable"
+      _matchOrientation = SIM4_MATCH_COMPLEMENT;
+      break;
+    default:
+      fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+      fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- unknown match orientation\n");
+      break;
+  }
+
+  // Strand orientation is decided by the third letter of the word.
+  switch (sOri[2]) {
+    case 'r':  _strandOrientation = SIM4_STRAND_POSITIVE;     break;
+    case 'v':  _strandOrientation = SIM4_STRAND_NEGATIVE;     break;
+    case 'k':  _strandOrientation = SIM4_STRAND_UNKNOWN;      break;
+    case 't':  _strandOrientation = SIM4_STRAND_INTRACTABLE;  break;
+    case 'i':  _strandOrientation = SIM4_STRAND_FAILED;       break;
+    default:
+      fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+      fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- unknown strand orientation\n");
+      break;
+  }
+
+  cl++;
+
+  // Optional 'comment', 'edef' and 'ddef' lines.  The value starts after the one-character
+  // separator that follows the keyword; a bare keyword yields an empty string.
+
+  _comment = 0L;
+  if ((cl < numLines) && (strncmp(lines[cl], "comment", 7) == 0)) {
+    const char *val = lines[cl] + 7;
+    if (*val)
+      val++;
+
+    _comment = new char [strlen(val) + 1];
+    strcpy(_comment, val);
+
+    cl++;
+  }
+
+  _estDefLine = 0L;
+  if ((cl < numLines) && (strncmp(lines[cl], "edef", 4) == 0)) {
+    const char *val = lines[cl] + 4;
+    if (*val)
+      val++;
+
+    _estDefLine = new char [strlen(val) + 1];
+    strcpy(_estDefLine, val);
+
+    cl++;
+  }
+
+  _genDefLine = 0L;
+  if ((cl < numLines) && (strncmp(lines[cl], "ddef", 4) == 0)) {
+    const char *val = lines[cl] + 4;
+    if (*val)
+      val++;
+
+    _genDefLine = new char [strlen(val) + 1];
+    strcpy(_genDefLine, val);
+
+    cl++;
+  }
+
+  //
+  //  While we get exons, make exons.
+  //
+
+  sim4polishExon   exon;
+  uint32           maxExons = 1024;
+
+  _numExons = 0;
+  _exons    = new sim4polishExon [maxExons];
+
+  _numCovered = 0;
+
+  while ((cl < numLines) &&
+         (sscanf(lines[cl],
+                 uint32FMT "-" uint32FMT " (" uint32FMT "-" uint32FMT ") <" uint32FMT "-" uint32FMT "-" uint32FMT ">",
+                 &exon._estFrom, &exon._estTo,
+                 &exon._genFrom, &exon._genTo,
+                 &exon._numMatches,
+                 &exon._numMatchesN,
+                 &exon._percentIdentity) == 7)) {
+
+    // Dang, out of space!  This would be a chore, except we don't have alignments yet, and so
+    // can get by with a shallow copy.
+    //
+    if (_numExons >= maxExons) {
+      maxExons *= 2;
+      sim4polishExon *nnn = new sim4polishExon [maxExons];
+      memcpy(nnn, _exons, sizeof(sim4polishExon) * _numExons);
+      delete [] _exons;
+      _exons = nnn;
+    }
+
+    _exons[_numExons] = exon;
+
+    // The intron orientation, if any, is the last two characters of the exon line.
+    uint32  len   = lengths[cl];
+    char    last2 = (len >= 2) ? lines[cl][len-2] : 0;
+    char    last1 = (len >= 1) ? lines[cl][len-1] : 0;
+
+    _exons[_numExons]._intronOrientation = SIM4_INTRON_NONE;
+
+    if      ((last2 == '-') && (last1 == '>'))
+      _exons[_numExons]._intronOrientation = SIM4_INTRON_POSITIVE;
+    else if ((last2 == '<') && (last1 == '-'))
+      _exons[_numExons]._intronOrientation = SIM4_INTRON_NEGATIVE;
+    else if ((last2 == '-') && (last1 == '-'))
+      _exons[_numExons]._intronOrientation = SIM4_INTRON_AMBIGUOUS;
+    else if ((last2 == '=') && (last1 == '='))
+      _exons[_numExons]._intronOrientation = SIM4_INTRON_GAP;
+
+    _exons[_numExons]._estAlignment = 0L;
+    _exons[_numExons]._genAlignment = 0L;
+
+    _numCovered += _exons[_numExons]._estTo - _exons[_numExons]._estFrom + 1;
+
+    _numExons++;
+
+    cl++;
+  }
+
+  if (_numExons == 0) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- byte " uint32FMT ": '%s'\n", startPosition, (cl < numLines) ? lines[cl] : "");
+    fprintf(stderr, "sim4polish::s4p_linesToPolishS4DB()-- WARNING: found ZERO exons?\n");
+  }
+
+  _querySeqIdentity = s4p_percentCoverageApprox();
+
+  // Now, if we are not at 'sim4end', assume that there are alignment lines for each exon.
+  //
+  // We used to check that we didn't hit 'sim4end' before reading all the alignment lines, and
+  // if we did, we'd complain about it and remove the alignment lines.  Too much work.  We do,
+  // however, stop as soon as fewer than two lines remain, so a truncated record is never read
+  // past its end.
+  //
+  if ((cl < numLines) && (strcmp(lines[cl], "sim4end") != 0)) {
+    for (uint32 el=0; (el < _numExons) && (cl + 1 < numLines); el++) {
+      _exons[el]._estAlignment = new char [strlen(lines[cl]) + 1];
+      strcpy(_exons[el]._estAlignment, lines[cl]);
+      cl++;
+
+      _exons[el]._genAlignment = new char [strlen(lines[cl]) + 1];
+      strcpy(_exons[el]._genAlignment, lines[cl]);
+      cl++;
+    }
+  }
+}
+
+
+// Parse one GFF3 record, already split into lines, into this polish.
+//
+//   startPosition -- byte offset of the record, used only in diagnostics
+//   maxLines      -- capacity of lines[]; parsing never looks past it
+//   lines         -- the record: one '\tsim4db\tmRNA' line followed by its
+//                    '\tsim4db\texon\t' lines; lines starting with '#' are
+//                    ignored.  A null entry marks the end of the stored lines.
+//   lengths       -- length of each line (not needed here; sizes are taken
+//                    from the strings themselves)
+//
+// The polish must be empty on entry (asserted).  Problems are reported on
+// stderr.  A record whose first non-comment line is not an mRNA line is
+// rejected and leaves the polish empty.  Exon alignments ('Gap=') are not
+// reconstructed.
+//
+// NOTE: This alters the lines array, with strtok()
+void
+sim4polish::s4p_linesToPolishGFF3(uint32   startPosition,
+                                  uint32   maxLines,
+                                  char   **lines,
+                                  uint32  *lengths) {
+  char     mOri    = 0;
+  char     sOri    = 0;
+  char    *clptr   = 0L;
+  uint32   matchID = 0;
+  char    *tok     = 0L;
+  char    *crttok  = 0L;
+  uint32   dummy1  = 0;
+  uint32   dummy2  = 0;
+  char     dummybuf[1000] = {0};    //  Every %s into this buffer is bounded with %999s.
+
+  int      r  = 0;
+  bool     ok = true;
+
+  assert(_comment    == 0L);
+  assert(_estDefLine == 0L);
+  assert(_genDefLine == 0L);
+  assert(_exons      == 0L);
+  assert(_numExons   == 0);
+
+  // Count the lines that are really there; every access below is checked against this.
+  uint32  numLines = 0;
+  while ((numLines < maxLines) && (lines[numLines] != 0L))
+    numLines++;
+
+  // Don't need to store matchID; re-assigned when file changes
+
+  uint32  cl = 0;
+  while ((cl < numLines) && (lines[cl][0] == '#'))
+    cl++;
+
+  if (cl >= numLines) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Empty record. Cannot convert (%s).\n",
+            (numLines > 0) ? lines[0] : "");
+    return;
+  }
+
+  // The first non-comment line must be the mRNA line.
+  if (strstr(lines[cl], "\tsim4db\tmRNA") == 0L) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Invalid GFF3 format, got '%s' instead of GFF3 mRNA line. Cannot convert.\n",
+            lines[cl]);
+    return;
+  }
+
+  // Scan mRNA line.  Any token of the line fits in a buffer as long as the line itself; the
+  // length is taken now, before strtok() starts chopping the line up.
+
+  size_t  mrnaLen = strlen(lines[cl]);
+
+  _genDefLine    = new char [mrnaLen + 1];
+  _genDefLine[0] = 0;
+
+  r = sscanf(lines[cl], uint32FMT ":%s\tsim4db\tmRNA\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t%c\t.\t",
+             &_genID, _genDefLine, &dummy1, &dummy2, &_percentIdentity, &sOri);
+  if (r != 6) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Expecting description line, found %d tokens instead of 6.\n", r);
+  }
+
+  switch (sOri) {
+    case '+' :  _strandOrientation = SIM4_STRAND_POSITIVE;  break;
+    case '-' :  _strandOrientation = SIM4_STRAND_NEGATIVE;  break;
+    case '.' :  _strandOrientation = SIM4_STRAND_UNKNOWN;   break;
+    default  :  ok = false;                                 break;
+  }
+
+  if (ok == true) {
+    // Skip over the first eight columns in the GFF3 format, giving up if the line has fewer
+    // than eight tabs.
+    clptr = lines[cl];
+    for (uint32 col=0; (col < 8) && (clptr != 0L); col++) {
+      clptr = strchr(clptr, '\t');
+      if (clptr != 0L)
+        clptr++;
+    }
+
+    if (clptr == 0L)
+      ok = false;
+
+    tok    = (clptr != 0L) ? strtok(clptr, "\n") : 0L;
+    crttok = (tok   != 0L) ? strtok(tok,   ";")  : 0L;
+
+    while (crttok) {
+      if        (!strncmp(crttok, "ID=sim4db", 9)) {
+        r = sscanf(crttok, "ID=sim4db" uint32FMT, &matchID);
+        if (r != 1)  ok = false;
+
+      } else if (!strncmp(crttok, "Name", 4)) {
+        if (_estDefLine == 0L) {
+          _estDefLine    = new char [mrnaLen + 1];
+          _estDefLine[0] = 0;
+        }
+        r = sscanf(crttok, "Name=" uint32FMT ":%s", &_estID, _estDefLine);
+        if (r != 2)  ok = false;
+
+      } else if (!strncmp(crttok, "Target", 6)) {
+        if (_estDefLine == 0L) {
+          _estDefLine    = new char [mrnaLen + 1];
+          _estDefLine[0] = 0;
+        }
+        r = sscanf(crttok, "Target=" uint32FMT ":%s " uint32FMT " " uint32FMT " %c", &_estID, _estDefLine, &dummy1, &dummy2, &mOri);
+        if (r != 5)  ok = false;
+
+        if      (mOri == '+')  _matchOrientation = SIM4_MATCH_FORWARD;
+        else if (mOri == '-')  _matchOrientation = SIM4_MATCH_COMPLEMENT;
+        else                   ok = false;
+
+      } else if (!strncmp(crttok, "targetLen", 9)) {
+        r = sscanf(crttok, "targetLen=" uint32FMT, &_estLen);
+        if (r != 1)  ok = false;
+
+      } else if (!strncmp(crttok, "pA", 2)) {
+        r = sscanf(crttok, "pA=" uint32FMT, &_estPolyA);
+        if (r != 1)  ok = false;
+
+      } else if (!strncmp(crttok, "pT", 2)) {
+        r = sscanf(crttok, "pT=" uint32FMT, &_estPolyT);
+        if (r != 1)  ok = false;
+
+      } else if (!strncmp(crttok, "genRegion", 9)) {
+        r = sscanf(crttok, "genRegion=" uint32FMT "-" uint32FMT, &_genRegionOffset, &dummy1);
+        if (r != 2)
+          ok = false;
+        else
+          _genRegionLength = dummy1 - _genRegionOffset + 1;
+      }
+
+      crttok = strtok(NULL, ";");
+    }
+
+    // Check that we read what we should have read so far
+    if ((ok == false) || !_estDefLine || !_genDefLine || !_estLen || !_matchOrientation || !_strandOrientation) {
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Expecting mRNA description line, %s.\n", (ok==false) ? "failed":"incomplete");
+    }
+  }
+
+  //
+  //  While we get exons, make exons.
+  //
+
+  sim4polishExon   exon;
+  uint32           maxExons = 1024;
+
+  _numExons = 0;
+  _exons    = new sim4polishExon [maxExons];
+
+  // These three are totals over the exons of this record, so all of them start from zero.
+  _numCovered  = 0;
+  _numMatches  = 0;
+  _numMatchesN = 0;
+
+  cl++;
+  while ((cl < numLines) && (lines[cl][0] == '#'))
+    cl++;
+
+  while ((cl < numLines) && strstr(lines[cl], "\tsim4db\texon\t")) {
+
+    ok = true;
+
+    // Start each exon from a blank slate so nothing carries over from the previous line.  The
+    // scratch exon never owns alignment strings, so clearing it cannot leak.
+    exon.s4p_clearExon();
+    exon._intronOrientation = SIM4_INTRON_NONE;
+
+    dummy1      = 0;
+    dummybuf[0] = 0;
+    sOri        = 0;
+
+    r = sscanf(lines[cl], uint32FMT ":%999s\tsim4db\texon\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t%c\t.\t",
+               &dummy1, dummybuf, &exon._genFrom, &exon._genTo, &exon._percentIdentity, &sOri);
+    if (r != 6) {
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Expecting exon description line, found %d tokens instead of 6.\n", r);
+    }
+
+    // The exon must belong to the same genomic sequence as the mRNA line and carry a valid
+    // strand character.
+    if ((dummy1 != _genID) ||
+        (strcmp(dummybuf, _genDefLine) != 0) ||
+        ((sOri != '+') && (sOri != '-') && (sOri != '.')))
+      ok = false;
+
+    if (ok) {
+      // Skip the first eight columns, as for the mRNA line.
+      clptr = lines[cl];
+      for (uint32 col=0; (col < 8) && (clptr != 0L); col++) {
+        clptr = strchr(clptr, '\t');
+        if (clptr != 0L)
+          clptr++;
+      }
+
+      if (clptr == 0L)
+        ok = false;
+
+      tok    = (clptr != 0L) ? strtok(clptr, "\n") : 0L;
+      crttok = (tok   != 0L) ? strtok(tok,   ";")  : 0L;
+
+      while (crttok) {
+        if        (!strncmp(crttok, "Parent=sim4db", 13)) {
+          r = sscanf(crttok, "Parent=sim4db" uint32FMT, &dummy1);
+          if ((r != 1) || (dummy1 != matchID))  ok = false;
+
+        } else if (!strncmp(crttok, "Target=", 7)) {
+          r = sscanf(crttok, "Target=%999s " uint32FMT " " uint32FMT " %c", dummybuf, &exon._estFrom, &exon._estTo, &mOri);
+          if ((r != 4) ||
+              ((mOri == '+') && (_matchOrientation == SIM4_MATCH_COMPLEMENT)) ||
+              ((mOri == '-') && (_matchOrientation == SIM4_MATCH_FORWARD)))
+            ok = false;
+
+        } else if (!strncmp(crttok, "nMatches=", 9)) {
+          r = sscanf(crttok, "nMatches=" uint32FMT, &exon._numMatches);
+          if (r != 1)  ok = false;
+
+        } else if (!strncmp(crttok, "Gap=", 4)) {
+          ;  // Handle this later or, better yet, just skip alignment
+
+        } else if (!strncmp(crttok, "intron=", 7)) {
+          r = sscanf(crttok, "intron=%999s", dummybuf);
+          if (r != 1)
+            ok = false;
+          else if (!strcmp(dummybuf, "->"))
+            exon._intronOrientation = SIM4_INTRON_POSITIVE;
+          else if (!strcmp(dummybuf, "<-"))
+            exon._intronOrientation = SIM4_INTRON_NEGATIVE;
+          else if (!strcmp(dummybuf, "--"))
+            exon._intronOrientation = SIM4_INTRON_AMBIGUOUS;
+          else if (!strcmp(dummybuf, "=="))
+            exon._intronOrientation = SIM4_INTRON_GAP;
+          else
+            ok = false;
+        }
+
+        crttok = strtok(NULL, ";");
+      }
+    }
+
+    // Check that we read what we should have read so far
+    if ((ok == false) || !exon._estFrom || !exon._estTo || !exon._numMatches) {
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- byte " uint32FMT ": '%s'\n", startPosition, lines[cl]);
+      fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- Expecting exon description line, %s.\n", (ok==false) ? "failed":"incomplete");
+    }
+
+    // Now load everything into the real exons array:
+    // Dang, out of space!  This would be a chore, except we don't have alignments yet, and so
+    // can get by with a shallow copy.
+    //
+    if (_numExons >= maxExons) {
+      maxExons *= 2;
+      sim4polishExon *nnn = new sim4polishExon [maxExons];
+      memcpy(nnn, _exons, sizeof(sim4polishExon) * _numExons);
+      delete [] _exons;
+      _exons = nnn;
+    }
+
+    _exons[_numExons] = exon;
+
+    _exons[_numExons]._numMatchesN = 0;  // Most likely!
+
+    _exons[_numExons]._estAlignment = 0L;
+    _exons[_numExons]._genAlignment = 0L;
+
+    _numCovered  += _exons[_numExons]._estTo - _exons[_numExons]._estFrom + 1;
+    _numMatches  += _exons[_numExons]._numMatches;
+    _numMatchesN += _exons[_numExons]._numMatchesN;
+
+    _numExons++;
+
+    cl++;
+    while ((cl < numLines) && (lines[cl][0] == '#'))
+      cl++;
+  }
+
+  if (_numExons == 0) {
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- byte " uint32FMT ": '%s'\n", startPosition, (cl < numLines) ? lines[cl] : "");
+    fprintf(stderr, "sim4polish::s4p_linesToPolishGFF3()-- WARNING: found ZERO exons?\n");
+  }
+
+  _querySeqIdentity = s4p_percentCoverageApprox();
+}
+
+
diff --git a/libsim4/sim4polish/sim4polish-updatescores.C b/libsim4/sim4polish/sim4polish-updatescores.C
new file mode 100644
index 0000000..5ce0dc0
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish-updatescores.C
@@ -0,0 +1,233 @@
+#include <math.h>
+#include "sim4polish.H"
+
+
+// Recompute match counts, coverage and percent identity from the alignment
+// strings stored in the exons, and rewrite those strings so that matching
+// columns are lower case and every other column is upper case.
+//
+// Exons without alignment strings contribute zero matches and zero edits, but
+// their coordinates still count toward alignment length and coverage.
+//
+void
+sim4polish::s4p_updateAlignmentScores(void) {
+  uint32  numInDel        = 0;
+  uint32  numEdits        = 0;
+  uint32  numMatchesN     = 0;
+  uint32  numMatches      = 0;
+  uint32  alignmentLength = 0;
+  uint32  numCovered      = 0;
+
+  for (uint32 exon=0; exon<_numExons; exon++) {
+    sim4polishExon  *ex = _exons + exon;
+
+    // Per-exon tallies.
+    uint32  indels  = 0;
+    uint32  edits   = 0;
+    uint32  bothN   = 0;
+    uint32  matches = 0;
+
+    char  *est = ex->_estAlignment;
+    char  *gen = ex->_genAlignment;
+
+    if (est && gen) {
+      for (; *est && *gen; est++, gen++) {
+        bool  estIsN  = (*est == 'N') || (*est == 'n');
+        bool  genIsN  = (*gen == 'N') || (*gen == 'n');
+        bool  matched = false;
+
+        if ((*est == '-') || (*gen == '-')) {
+          // A gap on either side is both an indel and an edit.
+          indels++;
+          edits++;
+        } else if (estIsN && genIsN) {
+          // Both are N.  It isn't a match and it isn't an edit.
+          bothN++;
+        } else if (estIsN || genIsN) {
+          // One is an N.  Someone has low quality sequence, and we should penalize.  This is
+          // special-cased because IUPACidentity[][] claims N matches all.
+          edits++;
+        } else if (IUPACidentity[(int)*est][(int)*gen]) {
+          matches++;
+          matched = true;
+        } else {
+          // A substitution.
+          edits++;
+        }
+
+        // Case marks the column: lower for a match, upper for everything else.
+        if (matched) {
+          *est = tolower(*est);
+          *gen = tolower(*gen);
+        } else {
+          *est = toupper(*est);
+          *gen = toupper(*gen);
+        }
+      }
+    }
+
+    ex->_numMatches  = matches;
+    ex->_numMatchesN = bothN;
+
+    uint32  estSpan      = ex->_estTo - ex->_estFrom + 1;
+    uint32  exonAlignLen = ex->_genTo - ex->_genFrom + 1 + estSpan + edits;
+
+    ex->_percentIdentity = s4p_percentIdentityApprox(edits, exonAlignLen);
+
+    numInDel        += indels;
+    numEdits        += edits;
+    numMatchesN     += bothN;
+    numMatches      += matches;
+    alignmentLength += exonAlignLen;
+    numCovered      += estSpan;
+  }
+
+  _numMatches  = numMatches;
+  _numMatchesN = numMatchesN;
+  _numCovered  = numCovered;
+
+#if 0
+  fprintf(stderr, "numInDel = %d\n", numInDel);
+  fprintf(stderr, "numEdits = %d\n", numEdits);
+  fprintf(stderr, "numMatchesN = %d\n", numMatchesN);
+  fprintf(stderr, "numMatches = %d\n", numMatches);
+  fprintf(stderr, "alignLen = %d\n", alignmentLength);
+  fprintf(stderr, "numCovered = %d\n", numCovered);
+#endif
+
+  _percentIdentity  = s4p_percentIdentityApprox(numEdits, alignmentLength);
+  _querySeqIdentity = s4p_percentCoverageApprox();
+}
+
+
+// Integer percent of the query (its length minus the poly-A and poly-T tails)
+// that is covered by exons.  Exactly 100 is returned only when the covered
+// count equals that length; any other value that rounds to 100 or more is
+// reported as 99.
+int
+sim4polish::s4p_percentCoverageApprox(void) {
+  uint32  usable = _estLen - _estPolyA - _estPolyT;
+
+  if (_numCovered == usable)
+    return(100);
+
+  int  pct = (int)round(100.0 * _numCovered / (double)usable);
+
+  if (pct < 100)
+    return(pct);
+
+  return(99);
+}
+
+
+// Integer percent identity, 100 * (1 - 2 * numEdits / alignmentLength), rounded.
+//
+// An empty alignment scores 0.  Exactly 100 is returned only when there are no
+// edits at all; any other value that rounds to 100 or more is reported as 99.
+int
+sim4polish::s4p_percentIdentityApprox(int numEdits, int alignmentLength) {
+
+  if (alignmentLength == 0)
+    return(0);
+
+  if (numEdits == 0)
+    return(100);
+
+  int  pct = (int)round(100.0 * (1 - 2.0 * numEdits / alignmentLength));
+
+  if (pct < 100)
+    return(pct);
+
+  return(99);
+}
+
+
+// Percent of the query (its length minus the poly-A and poly-T tails) that is
+// covered by exons, as an unrounded double.
+double
+sim4polish::s4p_percentCoverageExact(void) {
+  double  covered = (double)(_numCovered);
+  double  usable  = (double)(_estLen - _estPolyA - _estPolyT);
+
+  return(100 * covered / usable);
+}
+
+
+// Percent identity over all exons, as an unrounded double, computed from the
+// alignment strings without modifying the polish.
+//
+// A column is an edit when either side is a gap, when exactly one side is N,
+// or when the two bases differ according to IUPACidentity[][].  Columns where
+// both sides are N are neither matches nor edits.  Each exon contributes its
+// genomic span plus its query span plus its edits to the alignment length, and
+// the result is 100 * (1 - 2 * edits / alignmentLength).  Returns 0.0 when the
+// alignment length is zero.
+//
+double
+sim4polish::s4p_percentIdentityExact(void) {
+  uint32  numEdits        = 0;
+  uint32  alignmentLength = 0;
+
+  for (uint32 exon=0; exon<_numExons; exon++) {
+    char   *est = _exons[exon]._estAlignment;
+    char   *gen = _exons[exon]._genAlignment;
+    uint32  ne  = 0;
+
+    if (est && gen) {
+      for (; *est && *gen; est++, gen++) {
+        bool  estn = (*est == 'N') || (*est == 'n');
+        bool  genn = (*gen == 'N') || (*gen == 'n');
+
+        if ((*est == '-') || (*gen == '-')) {
+          ne++;
+        } else if (estn && genn) {
+          // Both are N.  It isn't a match and it isn't an edit.
+        } else if (estn || genn) {
+          // One is an N.  Someone has low quality sequence, and we should penalize.  This is
+          // special-cased because IUPACidentity[][] claims N matches all.
+          ne++;
+        } else if (!IUPACidentity[(int)*est][(int)*gen]) {
+          // A substitution.
+          ne++;
+        }
+      }
+    }
+
+    alignmentLength += (_exons[exon]._genTo - _exons[exon]._genFrom + 1 +
+                        _exons[exon]._estTo - _exons[exon]._estFrom + 1 +
+                        ne);
+    numEdits        += ne;
+  }
+
+  if (alignmentLength == 0)
+    return(0.0);
+
+  return(100.0 * (1 - 2.0 * numEdits / (double)(alignmentLength)));
+}
diff --git a/libsim4/sim4polish/sim4polish.C b/libsim4/sim4polish/sim4polish.C
new file mode 100644
index 0000000..7160ad4
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish.C
@@ -0,0 +1,34 @@
+#include "sim4polish.H"
+
+// Put the polish in forward match orientation.
+//
+// Returns false, changing nothing, if it is already forward.  Otherwise the
+// query coordinates of every exon are mirrored about the query length, the
+// orientation is set to forward, and true is returned.  Only the query
+// coordinates and the orientation flag are touched.
+bool
+sim4polish::s4p_makeForward(void) {
+  if (_matchOrientation == SIM4_MATCH_FORWARD)
+    return(false);
+
+  for (uint32 e=0; e < _numExons; e++) {
+    sim4polishExon  *ex = _exons + e;
+
+    uint32  mirroredFrom = _estLen - ex->_estTo   + 1;
+    uint32  mirroredTo   = _estLen - ex->_estFrom + 1;
+
+    ex->_estFrom = mirroredFrom;
+    ex->_estTo   = mirroredTo;
+  }
+
+  _matchOrientation = SIM4_MATCH_FORWARD;
+
+  return(true);
+}
+
+
+// Put the polish in complement match orientation.
+//
+// Returns false, changing nothing, if it is already complement.  Otherwise the
+// query coordinates of every exon are mirrored about the query length, the
+// orientation is set to complement, and true is returned.  Only the query
+// coordinates and the orientation flag are touched.
+bool
+sim4polish::s4p_makeReverse(void) {
+  if (_matchOrientation == SIM4_MATCH_COMPLEMENT)
+    return(false);
+
+  for (uint32 e=0; e < _numExons; e++) {
+    sim4polishExon  *ex = _exons + e;
+
+    uint32  mirroredFrom = _estLen - ex->_estTo   + 1;
+    uint32  mirroredTo   = _estLen - ex->_estFrom + 1;
+
+    ex->_estFrom = mirroredFrom;
+    ex->_estTo   = mirroredTo;
+  }
+
+  _matchOrientation = SIM4_MATCH_COMPLEMENT;
+
+  return(true);
+}
diff --git a/libsim4/sim4polish/sim4polish.H b/libsim4/sim4polish/sim4polish.H
new file mode 100644
index 0000000..9fcc87f
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish.H
@@ -0,0 +1,287 @@
+#ifndef SIM4_POLISH_H
+#define SIM4_POLISH_H
+
+//
+// Datastructures for writing, processing and reading the output of sim4
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util++.H"
+#include "bio++.H"
+//#include "bio.h"
+
+// Codes stored in sim4polishExon::_intronOrientation.  The trailing comment on
+// each line shows the two-character form used in the text formats.
+#define SIM4_INTRON_ERROR '?' // '??'
+#define SIM4_INTRON_POSITIVE '>' // '->'
+#define SIM4_INTRON_NEGATIVE '<' // '<-'
+#define SIM4_INTRON_AMBIGUOUS '-' // '--'
+#define SIM4_INTRON_GAP '=' // '=='
+#define SIM4_INTRON_NONE '.' // ' '
+
+// Codes stored in sim4polish::_matchOrientation.
+#define SIM4_MATCH_ERROR '?'
+#define SIM4_MATCH_FORWARD 'f'
+#define SIM4_MATCH_COMPLEMENT 'c'
+
+// Codes stored in sim4polish::_strandOrientation.
+#define SIM4_STRAND_ERROR '?'
+#define SIM4_STRAND_POSITIVE 'p'
+#define SIM4_STRAND_NEGATIVE 'n'
+#define SIM4_STRAND_UNKNOWN 'u'
+#define SIM4_STRAND_INTRACTABLE 'I'
+#define SIM4_STRAND_FAILED 'F'
+
+// Numeric identifiers for the three record formats.
+// NOTE(review): nothing in this header uses them; the sim4polishStyle enum
+// below is what the constructor switches on -- confirm whether these are still
+// needed.
+#define S4P_POLISH_S4DB 100
+#define S4P_POLISH_GFF3 101
+#define S4P_POLISH_ATAC 102
+
+// Record format selector, passed to the reading constructor and to
+// s4p_polishToString().
+enum sim4polishStyle {
+ sim4polishS4DB,
+ sim4polishGFF3,
+ sim4polishATAC
+};
+
+// Default format; declared here, defined in some other translation unit.
+extern sim4polishStyle sim4polishStyleDefault;
+
+// sim4polishExon and sim4polish constructors should be private (with whatever builds them a
+// friend) but snapper2 needs to create empty objects and fill them manually. We could provide a
+// constructor for that, but time is finite.
+
+// One exon of a polish: its coordinates on the query (est) and on the genomic
+// sequence, its match statistics, the orientation of the intron that follows
+// it, and optionally the two alignment strings.
+//
+// The destructor frees both alignment strings with delete[].  No copy
+// constructor or assignment operator is defined, so copies are shallow and
+// share the alignment pointers.
+class sim4polishExon {
+public:
+  // Everything zero, no alignments.
+  sim4polishExon()
+    : _estFrom(0),
+      _estTo(0),
+      _genFrom(0),
+      _genTo(0),
+      _numMatches(0),
+      _numMatchesN(0),
+      _percentIdentity(0),
+      _intronOrientation(0),
+      _estAlignment(NULL),
+      _genAlignment(NULL) {
+  }
+
+  ~sim4polishExon() {
+    delete [] _estAlignment;
+    delete [] _genAlignment;
+  }
+
+  // Reset every field to zero.  The alignment pointers are simply overwritten with null; the
+  // strings they pointed to are NOT freed here.
+  void  s4p_clearExon(void) {
+    _estFrom           = 0;
+    _estTo             = 0;
+    _genFrom           = 0;
+    _genTo             = 0;
+    _numMatches        = 0;
+    _numMatchesN       = 0;
+    _percentIdentity   = 0;
+    _intronOrientation = 0;
+    _estAlignment      = 0L;
+    _genAlignment      = 0L;
+  }
+
+  void  s4p_copyExon(sim4polishExon *orig);
+
+public:
+  uint32   _estFrom;
+  uint32   _estTo;
+  uint32   _genFrom;
+  uint32   _genTo;
+  uint32   _numMatches;
+  uint32   _numMatchesN;
+  uint32   _percentIdentity;
+  uint32   _intronOrientation;    //  One of the SIM4_INTRON_* codes.
+  char    *_estAlignment;
+  char    *_genAlignment;
+};
+
+
+
+// One sim4 alignment ("polish") of a query sequence (est) against a region of a
+// genomic sequence: identifiers, summary statistics, optional comment and
+// definition lines, and an array of exons.
+//
+// The destructor frees the comment, both definition lines and the exon array
+// with delete[].
+class sim4polish {
+public:
+  // Reset every field to zero / null.  Pointers are overwritten, not freed.
+  void  clear(void) {
+    _estID             = 0;
+    _estLen            = 0;
+    _estPolyA          = 0;
+    _estPolyT          = 0;
+
+    _genID             = 0;
+    _genRegionOffset   = 0;
+    _genRegionLength   = 0;
+
+    _numMatches        = 0;
+    _numMatchesN       = 0;
+    _numCovered        = 0;
+    _percentIdentity   = 0;
+    _querySeqIdentity  = 0;
+    _matchOrientation  = 0;
+    _strandOrientation = 0;
+
+    _comment           = NULL;
+    _estDefLine        = NULL;
+    _genDefLine        = NULL;
+
+    _numExons          = 0;
+    _exons             = NULL;
+  }
+
+  // An empty polish.
+  sim4polish() {
+    clear();
+  }
+
+  friend class sim4polishBuilder;
+  friend class sim4polishReader;
+  friend class sim4polishWriter;
+
+public:
+  // The old constructor taking a FILE* is obsolete and has been removed.
+
+  // Read one record in the given format from rb.  An unrecognized style is a fatal error.
+  sim4polish(readBuffer *rb, sim4polishStyle style) {
+    clear();
+
+    switch (style) {
+      case sim4polishS4DB:
+        s4p_readPolishS4DB(rb);
+        break;
+      case sim4polishGFF3:
+        s4p_readPolishGFF3(rb);
+        break;
+      case sim4polishATAC:
+        s4p_readPolishATAC(rb);
+        break;
+      default:
+        fprintf(stderr, "sim4polish()-- ERROR: unknown style '%d'\n", style);
+        exit(1);
+    }
+  }
+
+  // Copy of an existing polish, made by s4p_copyPolish().
+  sim4polish(sim4polish *orig) {
+    clear();
+    s4p_copyPolish(orig);
+  }
+
+  // Copy made by s4p_copyPolish() with an exon number -- presumably only that exon is kept;
+  // confirm against s4p_copyPolish().
+  sim4polish(sim4polish *orig, uint32 exon) {
+    clear();
+    s4p_copyPolish(orig, exon);
+  }
+
+  ~sim4polish() {
+    delete [] _comment;
+    delete [] _estDefLine;
+    delete [] _genDefLine;
+    delete [] _exons;
+  }
+
+private:
+  // Readers: pull one record from a readBuffer ...
+  void   s4p_readPolishS4DB(readBuffer *rb);
+  void   s4p_readPolishGFF3(readBuffer *rb);
+  void   s4p_readPolishATAC(readBuffer *rb);
+
+  // ... and parsers: turn the lines of one record into this polish.
+  void   s4p_linesToPolishS4DB(uint32 lineNumber, uint32 maxLines, char **lines, uint32 *lengths);
+  void   s4p_linesToPolishGFF3(uint32 lineNumber, uint32 maxLines, char **lines, uint32 *lengths);
+  void   s4p_linesToPolishATAC(uint32 lineNumber, uint32 maxLines, char **lines, uint32 *lengths);
+
+  void   s4p_copyPolish(sim4polish *orig, uint32 exonNum=2147483648);
+
+public:
+
+  // Note that there is no (public) mechanism to convert these strings back to a sim4polish.
+  // The only mechanism is through a readBuffer (aka, a file).
+  //
+  char  *s4p_polishToString(sim4polishStyle style);  // STYLE - add =sim4polishS4DB
+
+private:
+  char  *s4p_polishToStringS4DB(void);
+  char  *s4p_polishToStringGFF3(void);
+  char  *s4p_polishToStringATAC(void);
+
+public:
+  // Free the alignment strings of every exon and null the pointers.
+  void   s4p_removeAlignments(void) {
+    for (sim4polishExon *ex = _exons, *end = _exons + _numExons; ex != end; ex++) {
+      delete [] ex->_estAlignment;
+      ex->_estAlignment = 0L;
+
+      delete [] ex->_genAlignment;
+      ex->_genAlignment = 0L;
+    }
+  }
+
+  // Free both definition lines and null the pointers.
+  void   s4p_removeDefLines(void) {
+    delete [] _estDefLine;
+    _estDefLine = 0L;
+
+    delete [] _genDefLine;
+    _genDefLine = 0L;
+  }
+
+  // Reverse complement an input polish, returns true if it was reversed.
+  //
+  bool   s4p_makeForward(void);
+  bool   s4p_makeReverse(void);
+
+  // Update the alignment scores based on the alignments that are present.
+  //
+  void   s4p_updateAlignmentScores(void);
+
+  // Approximate (integer) percent identity and coverage.
+  //
+  int    s4p_percentIdentityApprox(int numEdits, int alignmentLength);
+  int    s4p_percentCoverageApprox(void);
+
+  // A very expensive and accurate calculation of the percent identity.
+  //
+  double s4p_percentIdentityExact(void);
+  double s4p_percentCoverageExact(void);
+
+  // Exon list editing.
+  void   s4p_swapExons(uint32 a, uint32 b);
+  void   s4p_deleteExon(uint32 a);
+  void   s4p_insertExon(uint32 a, uint32 intronori, sim4polishExon *e);
+  void   s4p_insertExons(uint32 a, uint32 intronori, sim4polish *e);
+
+public:
+  uint32           _estID;
+  uint32           _estLen;
+  uint32           _estPolyA;
+  uint32           _estPolyT;
+
+  uint32           _genID;
+  uint32           _genRegionOffset;
+  uint32           _genRegionLength;
+
+  uint32           _numMatches;
+  uint32           _numMatchesN;
+  uint32           _numCovered;          //  Number of bp covered in alignments
+  uint32           _percentIdentity;
+  uint32           _querySeqIdentity;    //  numCovered / (estLen - pA -pT)
+  uint32           _matchOrientation;    //  One of the SIM4_MATCH_* codes.
+  uint32           _strandOrientation;   //  One of the SIM4_STRAND_* codes.
+
+  char            *_comment;
+  char            *_estDefLine;
+  char            *_genDefLine;
+
+  uint32           _numExons;
+  sim4polishExon  *_exons;
+};
+
+// Comparison functions with the (const void *, const void *) -> int signature
+// that qsort() expects.  Their definitions are not in this header; judging by
+// the names they order polishes by genomic ID and by est ID respectively --
+// TODO confirm what the pointers actually point at and the tie-breaking rules.
+int s4p_genIDcompare(const void *a, const void *b);
+int s4p_estIDcompare(const void *a, const void *b);
+
+// As above, but presumably ordering by the genomic / est definition lines
+// rather than by numeric ID -- TODO confirm.
+int s4p_genDEFcompare(const void *a, const void *b);
+int s4p_estDEFcompare(const void *a, const void *b);
+
+// Predicates over a pair of polishes.  The exact criteria (and the units of
+// 'tolerance') are defined with the implementations, not here.
+bool s4p_compatable(sim4polish *A, sim4polish *B);
+bool s4p_IsSameRegion(sim4polish *A, sim4polish *B, int tolerance);
+bool s4p_IsRegionOverlap(sim4polish *A, sim4polish *B);
+bool s4p_IsSameExonModel(sim4polish *A, sim4polish *B, int tolerance);
+
+// Compare the exons of A and B.  The three counts are returned through the
+// pointer arguments -- presumably exons judged the same, exons only in A and
+// exons only in B; here sameness is governed by overlapThreshold.
+void s4p_compareExons_Overlap(sim4polish *A,
+ sim4polish *B,
+ double overlapThreshold,
+ uint32 *numSame,
+ uint32 *numAOnly,
+ uint32 *numBOnly);
+
+// As s4p_compareExons_Overlap(), but sameness is governed by an integer
+// tolerance -- presumably on the exon end coordinates; confirm against the
+// implementation.
+void s4p_compareExons_Ends(sim4polish *A,
+ sim4polish *B,
+ int32 tolerance,
+ uint32 *numSame,
+ uint32 *numAOnly,
+ uint32 *numBOnly);
+
+#endif // SIM4_POLISH_H
diff --git a/libsim4/sim4polish/sim4polish.pm b/libsim4/sim4polish/sim4polish.pm
new file mode 100644
index 0000000..87ffc22
--- /dev/null
+++ b/libsim4/sim4polish/sim4polish.pm
@@ -0,0 +1,254 @@
+#!/usr/local/bin/perl
+
+# Confidential -- Do Not Distribute
+# Copyright (c) 2002 PE Corporation (NY) through the Celera Genomics Group
+# All Rights Reserved.
+
+package sim4polish;
+
+use strict;
+use POSIX "sys_wait_h";
+
+$| = 1;
+
+#  Deliberately empty import(): 'use sim4polish' therefore brings no
+#  names into the caller's namespace.
+#
+sub import () {
+}
+
+
+######################################################################
+#
+#  updatePolishInfoLine(%polish)
+#
+#  Rebuilds the first two lines of a polish ("sim4begin" and the info
+#  line) from the current field values and appends everything else
+#  from $polish{'raw'} unchanged.  Exon and alignment lines are NOT
+#  regenerated.  Returns the resulting text.
+#
+sub updatePolishInfoLine {
+  my %polish = @_;
+
+  #  Discard the old "sim4begin" and info lines; keep what follows.
+  my @tail = split '\n', $polish{'raw'};
+  splice(@tail, 0, 2);
+
+  my $text = join('',
+                  "sim4begin\n",
+                  "$polish{'estID'}\[$polish{'estLen'}-$polish{'pA'}-$polish{'pT'}\] ",
+                  "$polish{'dbID'}\[$polish{'dbLo'}-$polish{'dbHi'}\] ",
+                  "<$polish{'numMatches'}-$polish{'numMatchesN'}-$polish{'percentID'}-$polish{'matchOrientation'}-$polish{'strandPrediction'}>\n",
+                  map { "$_\n" } @tail);
+
+  return($text);
+}
+
+
+#  updatePolish(%polish)
+#
+#  Regenerates the complete text of a polish from the hash fields:
+#  the info line, the optional comment/edef/ddef lines, one line per
+#  exon, then the alignment lines of each exon that has them.
+#  Returns the text; 'raw' is not consulted.
+#
+sub updatePolish {
+  my %p = @_;
+  my $l;
+
+  $l  = "sim4begin\n";
+  $l .= "$p{'estID'}\[$p{'estLen'}-$p{'pA'}-$p{'pT'}\] ";
+  $l .= "$p{'dbID'}\[$p{'dbLo'}-$p{'dbHi'}\] ";
+  $l .= "<$p{'numMatches'}-$p{'numMatchesN'}-$p{'percentID'}-$p{'matchOrientation'}-$p{'strandPrediction'}>\n";
+
+  #  Each optional line is emitted only if its own field is set.
+  $l .= "comment=$p{'comment'}\n"  if defined($p{'comment'});
+  $l .= "edef=$p{'estDefLine'}\n"  if defined($p{'estDefLine'});
+  $l .= "ddef=$p{'dbDefLine'}\n"   if defined($p{'dbDefLine'});
+
+  #  readPolish() leaves 'exons' undefined when no exon was read.
+  my @exons = defined($p{'exons'}) ? @{$p{'exons'}} : ();
+
+  foreach my $exon (@exons) {
+    my $e;
+    $e  = "$exon->{'cDNAstart'}-$exon->{'cDNAend'} ";
+    $e .= "($exon->{'GENOMICstart'}-$exon->{'GENOMICend'}) ";
+    $e .= "<$exon->{'numMatches'}-$exon->{'numMatchesN'}-$exon->{'percentID'}> ";
+    $e .= "$exon->{'intronOrientation'}";
+
+    $e =~ s/^\s+//;
+    $e =~ s/\s+$//;
+
+    $l .= "$e\n";
+  }
+
+  #  Alignments are optional; writing them for an exon that has none
+  #  would put two blank lines into the polish.
+  foreach my $exon (@exons) {
+    next  unless (defined($exon->{'cDNAalign'}) && defined($exon->{'GENOMICalign'}));
+
+    $l .= "$exon->{'cDNAalign'}\n";
+    $l .= "$exon->{'GENOMICalign'}\n";
+  }
+
+  $l .= "sim4end\n";
+
+  return($l);
+}
+
+
+
+
+######################################################################
+#
+#  readPolish(FILEHANDLE)
+#
+#  Reads a single sim4 polish from FILEHANDLE and returns it as a
+#  hash (a flat key/value list, not a reference).
+#
+#  Every key listed below is always present.  If no polish could be
+#  read (end of file, or a malformed description line) the values --
+#  including 'raw' -- are left undefined, so callers can test
+#  defined($p{'raw'}) to detect success.  'dbLen' is never filled in
+#  by this routine.
+#
+sub readPolish (*) {
+  local *READPOLISHFH = shift;
+  my %p;
+  my $line;
+  my $save;
+
+  #  These are the fields returned.
+  #
+  $p{'raw'} = undef;
+
+  $p{'estID'} = undef;
+  $p{'estDefLine'} = undef;
+  $p{'estLen'} = undef;
+  $p{'pA'} = undef;
+  $p{'pT'} = undef;
+
+  $p{'dbID'} = undef;
+  $p{'dbDefLine'} = undef;
+  $p{'dbLen'} = undef;
+  $p{'dbLo'} = undef;
+  $p{'dbHi'} = undef;
+
+  $p{'comment'} = undef;
+
+  $p{'numMatches'} = undef;
+  $p{'numMatchesN'} = undef;
+  $p{'percentID'} = undef;
+  $p{'coveragebp'} = undef;
+  $p{'coverage'} = undef;
+  $p{'matchOrientation'} = undef;
+  $p{'strandPrediction'} = undef;
+
+  #  An array of references to hashes, one hash for each exon.  This
+  #  stays undefined until the first exon line is pushed below.
+  $p{'exons'} = ();
+
+
+  #  Skip lines until the next match.  If used properly, on a proper
+  #  file, this should be silent.  After the loop, we are at the
+  #  start of a polish; the line should be "sim4begin".
+  #
+  $line = <READPOLISHFH>;
+  while (defined($line) && ($line !~ m/^sim4begin$/)) {
+    chomp $line;
+    print STDERR "Skipped: '$line'\n";
+    $line = <READPOLISHFH>;
+  }
+  $save = $line;
+
+  #  Return now if we are out of file.
+  #
+  return(%p) if (eof(READPOLISHFH));
+
+
+  #  Read the description line
+  #
+  $line = <READPOLISHFH>;
+  $save .= $line;
+
+  if ($line =~ m/^(\d+)\[(\d+)-+(\d+)-+(\d+)\]\s+(\d+)\[(\d+)-(\d+)\]\s+\<(\d+)-(\d+)-(\d+)-(\w+)-(\w+)\>$/) {
+    $p{'estID'} = $1;
+    $p{'estLen'} = $2;
+    $p{'pA'} = $3;
+    $p{'pT'} = $4;
+    $p{'dbID'} = $5;
+    $p{'dbLo'} = $6;
+    $p{'dbHi'} = $7;
+    $p{'numMatches'} = $8;
+    $p{'numMatchesN'} = $9;
+    $p{'percentID'} = $10;
+    $p{'matchOrientation'} = $11;
+    $p{'strandPrediction'} = $12;
+  } else {
+    print STDERR "expecting description line, got: '$line'\n";
+    return(%p);
+  }
+
+
+  #  Read the optional comment and the two deflines, if they exist.
+  #  Each one that matches is consumed and the next line is fetched.
+  #
+  $line = <READPOLISHFH>;
+
+  if ($line =~ m/^comment=\s*(.*)\s*$/) {
+    $p{'comment'} = $1;
+    $save .= $line;
+    $line = <READPOLISHFH>;
+  }
+
+  if ($line =~ m/^edef=(.*)$/) {
+    $p{'estDefLine'} = $1;
+    $save .= $line;
+    $line = <READPOLISHFH>;
+  }
+
+  if ($line =~ m/^ddef=(.*)$/) {
+    $p{'dbDefLine'} = $1;
+    $save .= $line;
+    $line = <READPOLISHFH>;
+  }
+
+
+  #  Read the exons
+  #
+  my $exonAlign = 0;       #  index of the exon the next alignment pair belongs to
+  my $exonAlignFirst = 1;  #  true if the next alignment line is the cDNA line
+  my $exonCoverage = 0;    #  bases of cDNA covered, summed over all exons
+
+  while (defined($line) && ($line !~ m/^sim4end$/)) {
+
+    #  If this match succeeds, we have an exon description.
+    #  Otherwise, it's an alignment line.
+    #
+    if ($line =~ /^(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+\<(\d+)-(\d+)-(\d+)\>\s+(.*)$/) {
+      my $e = {};
+
+      $exonCoverage += $2 - $1 + 1;
+
+      $e->{'cDNAstart'} = $1;
+      $e->{'cDNAend'} = $2;
+      $e->{'GENOMICstart'} = $3;
+      $e->{'GENOMICend'} = $4;
+      $e->{'numMatches'} = $5;
+      $e->{'numMatchesN'} = $6;
+      $e->{'percentID'} = $7;
+      $e->{'intronOrientation'} = $8;
+
+      push @{$p{'exons'}}, $e;
+    } else {
+      #  Alignment lines come in cDNA/genomic pairs, one pair per
+      #  exon, in exon order.
+      if ($exonAlignFirst) {
+        $p{'exons'}[$exonAlign]->{'cDNAalign'} = $line;
+        chomp $p{'exons'}[$exonAlign]->{'cDNAalign'};
+        $exonAlignFirst = 0;
+      } else {
+        $p{'exons'}[$exonAlign]->{'GENOMICalign'} = $line;
+        chomp $p{'exons'}[$exonAlign]->{'GENOMICalign'};
+        $exonAlignFirst = 1;
+        $exonAlign++;
+      }
+    }
+
+    $save .= $line;
+    $line = <READPOLISHFH>;
+  }
+
+  #  Keep the "sim4end" line as part of the raw text.
+  $save .= $line;
+
+  #  The covered base count is known whatever the poly-A/T tails are;
+  #  only the percentage is meaningless when the tails span the
+  #  whole sequence (and would divide by zero or go negative).
+  #
+  $p{'coveragebp'} = $exonCoverage;
+
+  if (($p{'pA'} + $p{'pT'}) >= $p{'estLen'}) {
+    $p{'coverage'} = 0;
+  } else {
+    $p{'coverage'} = 100.0 * $exonCoverage / ($p{'estLen'} - $p{'pA'} - $p{'pT'});
+  }
+
+  $p{'raw'} = $save;
+
+  return(%p);
+}
+
+
+
+1;
diff --git a/libsim4/sim4polish/sim4polishBuilder.C b/libsim4/sim4polish/sim4polishBuilder.C
new file mode 100644
index 0000000..de22922
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishBuilder.C
@@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include "bio++.H"
+#include "sim4polishBuilder.H"
+
+
+//  Start with no polish under construction and room for 32 exon
+//  pointers, all of them empty.
+//
+sim4polishBuilder::sim4polishBuilder() {
+  it    = 0L;
+
+  exMax = 32;
+  exPos = 0;
+  exAli = 0;
+  ex    = new sim4polishExon * [exMax];
+
+  uint32 slot = exMax;
+  while (slot-- > 0)
+    ex[slot] = 0L;
+}
+
+//  Destroy any polish still under construction, every cached exon
+//  object, and the pointer table itself.
+//
+sim4polishBuilder::~sim4polishBuilder() {
+  delete it;
+
+  sim4polishExon **end = ex + exMax;
+
+  for (sim4polishExon **e = ex; e != end; e++)
+    delete *e;
+
+  delete [] ex;
+}
+
+
+//  Begin building a new polish for EST 'estid' (length 'estlen')
+//  against genomic sequence 'genid', region genlo..genhi.  All other
+//  fields start out zero / null / "error" and are filled in by the
+//  set*() and addExon*() methods.
+//
+//  If a previous polish was never release()d it is destroyed, with a
+//  warning, together with any exons that were queued for it.
+//
+void
+sim4polishBuilder::create(uint32 estid, uint32 estlen,
+                          uint32 genid, uint32 genlo, uint32 genhi) {
+
+  //  Someone didn't call release()!!
+  //
+  if (it) {
+    fprintf(stderr, "sim4polishBuilder::create()-- WARNING:  release() not called.  Polish killed.\n");
+    delete it;
+  }
+
+  //  Exons queued for the polish we just killed must not become part
+  //  of the new one.  addExon() frees any alignment strings still
+  //  attached to a slot when it reuses it.
+  //
+  exPos = 0;
+  exAli = 0;
+
+  it = new sim4polish;
+
+  it->_estID             = estid;
+  it->_estLen            = estlen;
+  it->_estPolyA          = 0;
+  it->_estPolyT          = 0;
+
+  it->_genID             = genid;
+  it->_genRegionOffset   = genlo;
+  it->_genRegionLength   = genhi - genlo;
+
+  it->_numMatches        = 0;
+  it->_numMatchesN       = 0;
+  it->_numCovered        = 0;
+  it->_percentIdentity   = 0;
+  it->_querySeqIdentity  = 0;
+  it->_matchOrientation  = SIM4_MATCH_ERROR;
+  it->_strandOrientation = SIM4_STRAND_ERROR;
+
+  it->_comment           = 0L;
+  it->_estDefLine        = 0L;
+  it->_genDefLine        = 0L;
+
+  it->_numExons          = 0;
+  it->_exons             = 0L;
+}
+
+//  Record the poly-A ('pa') and poly-T ('pt') tail lengths of the EST.
+//  Like the other setters, complains and does nothing if create() has
+//  not been called.
+//
+void
+sim4polishBuilder::setPolyTails(uint32 pa, uint32 pt) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::setPolyTails()-- no polish to build; create() not called\n");
+    return;
+  }
+  it->_estPolyA = pa;
+  it->_estPolyT = pt;
+}
+
+//  Replace the EST defline of the polish under construction with a
+//  private copy of 'defline'.
+//
+void
+sim4polishBuilder::setESTdefline(char *defline) {
+  if (it != 0L) {
+    size_t  bytes = sizeof(char) * (strlen(defline) + 1);  //  includes the terminating NUL
+
+    delete [] it->_estDefLine;
+    it->_estDefLine = new char [strlen(defline) + 1];
+    memcpy(it->_estDefLine, defline, bytes);
+  } else {
+    fprintf(stderr, "sim4polishBuilder::setESTdefline()-- no polish to build; create() not called\n");
+  }
+}
+
+
+//  Replace the genomic defline of the polish under construction with
+//  a private copy of 'defline'.
+//
+void
+sim4polishBuilder::setGENdefline(char *defline) {
+  if (it != 0L) {
+    size_t  bytes = sizeof(char) * (strlen(defline) + 1);  //  includes the terminating NUL
+
+    delete [] it->_genDefLine;
+    it->_genDefLine = new char [strlen(defline) + 1];
+    memcpy(it->_genDefLine, defline, bytes);
+  } else {
+    fprintf(stderr, "sim4polishBuilder::setGENdefline()-- no polish to build; create() not called\n");
+  }
+}
+
+
+//  Store the two match counts (numMatches and numMatchesN) in the
+//  polish under construction.
+//
+void
+sim4polishBuilder::setNumberOfMatches(uint32 nummatches, uint32 nummatchesN) {
+  if (it != 0L) {
+    it->_numMatchesN = nummatchesN;
+    it->_numMatches  = nummatches;
+  } else {
+    fprintf(stderr, "sim4polishBuilder::setNumberOfMatches()-- no polish to build; create() not called\n");
+  }
+}
+
+
+//  Store the (integer) percent identity of the whole polish.
+//  Complains and does nothing if create() has not been called.
+//
+void
+sim4polishBuilder::setPercentIdentity(uint32 id) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::setPercentIdentity()-- no polish to build; create() not called\n");
+    return;
+  }
+  it->_percentIdentity = id;
+}
+
+
+//  Set the match orientation.  Only the three SIM4_MATCH_* values are
+//  accepted; anything else is reported and ignored.
+//
+void
+sim4polishBuilder::setMatchOrientation(char o) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::setMatchOrientation()-- no polish to build; create() not called\n");
+    return;
+  }
+
+  bool  known = false;
+
+  switch (o) {
+    case SIM4_MATCH_ERROR:
+    case SIM4_MATCH_FORWARD:
+    case SIM4_MATCH_COMPLEMENT:
+      known = true;
+      break;
+    default:
+      break;
+  }
+
+  if (known)
+    it->_matchOrientation = o;
+  else
+    fprintf(stderr, "sim4polishBuilder::setMatchOrientation()-- invalid match orientation\n");
+}
+
+//  Set the strand orientation.  Only the six SIM4_STRAND_* values
+//  are accepted; anything else is reported and ignored.
+//
+void
+sim4polishBuilder::setStrandOrientation(char o) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::setStrandOrientation()-- no polish to build; create() not called\n");
+    return;
+  }
+  switch (o) {
+    case SIM4_STRAND_ERROR:
+    case SIM4_STRAND_POSITIVE:
+    case SIM4_STRAND_NEGATIVE:
+    case SIM4_STRAND_UNKNOWN:
+    case SIM4_STRAND_INTRACTABLE:
+    case SIM4_STRAND_FAILED:
+      it->_strandOrientation = o;
+      break;
+    default:
+      fprintf(stderr, "sim4polishBuilder::setStrandOrientation()-- invalid strand orientation\n");
+      break;
+  }
+}
+
+
+//  Queue one exon for the polish under construction.  The genomic
+//  coordinates are given relative to the region passed to create();
+//  the region offset is added before they are stored.  The exon has
+//  no alignment strings until addExonAlignment() supplies them.
+//
+void
+sim4polishBuilder::addExon(uint32 estlo, uint32 esthi,
+                           uint32 genlo, uint32 genhi,
+                           uint32 nummatches, uint32 nummatchesN, uint32 percentid,
+                           char intronorientation) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::addExon()-- no polish to build; create() not called\n");
+    return;
+  }
+
+  //  Out of slots?  Double the pointer table, keeping the exons
+  //  already queued and clearing the new half.
+  //
+  if (exPos >= exMax) {
+    exMax *= 2;
+
+    sim4polishExon **bigger = new sim4polishExon* [exMax];
+
+    memcpy(bigger, ex, exPos * sizeof(sim4polishExon *));
+
+    for (uint32 slot=exPos; slot<exMax; slot++)
+      bigger[slot] = 0L;
+
+    delete [] ex;
+    ex = bigger;
+  }
+
+  //  Exon objects are recycled between polishes.  A recycled one
+  //  might still own alignment strings, which must be freed first.
+  //
+  if (ex[exPos] != 0L) {
+    delete [] ex[exPos]->_estAlignment;
+    delete [] ex[exPos]->_genAlignment;
+  } else {
+    ex[exPos] = new sim4polishExon;
+  }
+
+  sim4polishExon *exon = ex[exPos];
+
+  exon->_estFrom           = estlo;
+  exon->_estTo             = esthi;
+  exon->_genFrom           = genlo + it->_genRegionOffset;
+  exon->_genTo             = genhi + it->_genRegionOffset;
+  exon->_numMatches        = nummatches;
+  exon->_numMatchesN       = nummatchesN;
+  exon->_percentIdentity   = percentid;
+  exon->_intronOrientation = intronorientation;
+
+  exon->_estAlignment      = 0L;
+  exon->_genAlignment      = 0L;
+
+  exPos++;
+}
+
+
+//  Attach copies of the two alignment strings to the oldest queued
+//  exon that does not have them yet.  Asking for more alignments
+//  than there are exons is fatal.
+//
+void
+sim4polishBuilder::addExonAlignment(char *estalign,
+                                    char *genalign) {
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::addExonAlignment()-- no polish to build; create() not called\n");
+    return;
+  }
+
+  if (exAli >= exPos) {
+    fprintf(stderr, "sim4polishBuilder::addExonAlignment()-- tried to add alignment for exon %u which doesn't exist\n", exAli);
+    exit(1);
+  }
+
+  sim4polishExon *exon = ex[exAli];
+
+  //  Lengths include the terminating NUL.
+  size_t  estBytes = (strlen(estalign) + 1) * sizeof(char);
+  size_t  genBytes = (strlen(genalign) + 1) * sizeof(char);
+
+  exon->_estAlignment = (char *)memdup(estalign, estBytes);
+  exon->_genAlignment = (char *)memdup(genalign, genBytes);
+
+  exAli++;
+}
+
+//  Finish the polish under construction and hand it to the caller,
+//  who becomes its owner.  The queued exons are copied into it,
+//  numCovered is summed over the exons, and querySeqIdentity is
+//  computed from that.
+//
+//  Returns 0L if create() was not called, or if no exons were added.
+//  In both cases the builder is left empty, ready for the next
+//  create().
+//
+sim4polish*
+sim4polishBuilder::release(void) {
+  sim4polish *retval = it;
+
+  if (it == 0L) {
+    fprintf(stderr, "sim4polishBuilder::release()-- no polish to build; create() not called\n");
+    return(0L);
+  }
+
+  //  A polish without exons is not returned.  Destroy it here;
+  //  leaving it in place would make the next create() complain that
+  //  release() was never called.
+  //
+  if (exPos == 0) {
+    delete it;
+    it    = 0L;
+    exAli = 0;
+    return(0L);
+  }
+
+  it->_numCovered = 0;
+  it->_numExons   = exPos;
+  it->_exons      = new sim4polishExon [exPos];
+
+  for (uint32 i=0; i<exPos; i++) {
+    memcpy(it->_exons + i, ex[i], sizeof(sim4polishExon));
+    ex[i]->_estAlignment = 0L;  // Owned by 'it' now
+    ex[i]->_genAlignment = 0L;
+
+    it->_numCovered += (ex[i]->_estTo - ex[i]->_estFrom + 1);
+  }
+
+  //  Last, compute the querySeqIdentity using other fields (like our
+  //  just updated numCovered).
+  //
+  it->_querySeqIdentity = it->s4p_percentCoverageApprox();
+
+  it = 0L;
+
+  exPos = 0;
+  exAli = 0;
+
+  return(retval);
+}
diff --git a/libsim4/sim4polish/sim4polishBuilder.H b/libsim4/sim4polish/sim4polishBuilder.H
new file mode 100644
index 0000000..9ddda2d
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishBuilder.H
@@ -0,0 +1,43 @@
+#ifndef SIM4_POLISH_BUILDER_H
+#define SIM4_POLISH_BUILDER_H
+
+#include "sim4polish.H"
+
+// Builds a sim4polish piece by piece: create() starts one, the set*()
+// and addExon*() methods fill it in, and release() hands the finished
+// polish to the caller. Every method except create() needs a polish
+// started by create().
+//
+class sim4polishBuilder {
+public:
+ sim4polishBuilder();
+ ~sim4polishBuilder();
+
+ // Start a new polish for the given EST against the genomic region
+ // genlo..genhi. Warns and discards the current polish if it was
+ // never release()d.
+ //
+ void create(uint32 estid, uint32 estlen,
+ uint32 genid, uint32 genlo, uint32 genhi);
+
+ // Poly-A and poly-T tail lengths of the EST.
+ //
+ void setPolyTails(uint32 pa,
+ uint32 pt);
+
+ // The deflines are copied; the caller keeps ownership of 'defline'.
+ //
+ void setESTdefline(char *defline);
+ void setGENdefline(char *defline);
+
+ // The orientation setters accept only the SIM4_MATCH_* and
+ // SIM4_STRAND_* values, respectively; other values are reported
+ // on stderr and ignored.
+ //
+ void setNumberOfMatches(uint32 nummatches, uint32 nummatchesN);
+ void setPercentIdentity(uint32 id);
+ void setMatchOrientation(char o);
+ void setStrandOrientation(char o);
+
+ // Queue one exon. genlo/genhi are relative to the region given to
+ // create(); the region offset is added when they are stored.
+ //
+ void addExon(uint32 estlo, uint32 esthi,
+ uint32 genlo, uint32 genhi,
+ uint32 nummatches, uint32 nummatchesN, uint32 percentid,
+ char intronorientation);
+
+ // Attach copies of the alignment strings to the next queued exon
+ // that has none. Exits if there is no such exon.
+ //
+ void addExonAlignment(char *estalign,
+ char *genalign);
+
+ // Returns the finished polish (now owned by the caller), or 0L if
+ // create() was not called or no exons were added.
+ //
+ sim4polish *release(void);
+private:
+ sim4polish *it; // the polish under construction, 0L if none
+
+ uint32 exMax; // maximum number of exons available
+ uint32 exPos; // next exon
+ uint32 exAli; // next exon without alignment
+ sim4polishExon **ex; // exMax exon pointers; unused slots are 0L
+};
+
+#endif // SIM4_POLISH_BUILDER_H
diff --git a/libsim4/sim4polish/sim4polishFile.C b/libsim4/sim4polish/sim4polishFile.C
new file mode 100644
index 0000000..ac3f584
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishFile.C
@@ -0,0 +1,317 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "sim4polishFile.H"
+
+
+// Global pointer used during construction of the index.
+// (polishRecordSortArray)
+//
+sim4polishFile::polishRecord *__prsa;
+
+//  qsort() comparator.  'a' and 'b' point to uint32 indices into the
+//  global __prsa array; the indices are ordered by the EST iid of the
+//  records they refer to.
+//
+int
+__prsaEST(const void *a, const void *b) {
+  uint32  ia = *((uint32*)a);
+  uint32  ib = *((uint32*)b);
+
+  uint32  keyA = __prsa[ia]._ESTiid;
+  uint32  keyB = __prsa[ib]._ESTiid;
+
+  return((keyA < keyB) ? -1 : ((keyA > keyB) ? 1 : 0));
+}
+
+//  qsort() comparator.  'a' and 'b' point to uint32 indices into the
+//  global __prsa array; the indices are ordered by the genomic iid of
+//  the records they refer to.
+//
+int
+__prsaGEN(const void *a, const void *b) {
+  uint32  ia = *((uint32*)a);
+  uint32  ib = *((uint32*)b);
+
+  uint32  keyA = __prsa[ia]._GENiid;
+  uint32  keyB = __prsa[ib]._GENiid;
+
+  return((keyA < keyB) ? -1 : ((keyA > keyB) ? 1 : 0));
+}
+
+
+
+
+//  Open 'path' for reading polishes in the given style.  The index
+//  starts out empty; no index is loaded or built here.
+//
+sim4polishFile::sim4polishFile(char *path, sim4polishStyle style) {
+
+  _style = style;
+
+  //  Keep our own copy of the name, for messages and the index file.
+  _path  = new char [strlen(path) + 1];
+  strcpy(_path, path);
+
+  _file  = new readBuffer(path);
+
+  //  No records...
+  _polishRecordLen = 0;
+  _polishRecordMax = 0;
+  _polishRecord    = 0L;
+  _polishRecordEST = 0L;
+  _polishRecordGEN = 0L;
+
+  //  ...and no iid lookup tables.
+  _maxEST          = 0;
+  _maxGEN          = 0;
+  _ESTiidLocation  = 0L;
+  _GENiidLocation  = 0L;
+}
+
+
+//  Release the input buffer, the copy of the path and all index
+//  arrays.
+//
+sim4polishFile::~sim4polishFile() {
+  delete    _file;   //  allocated with plain new in the constructor
+  delete [] _path;
+  delete [] _polishRecord;
+  delete [] _polishRecordEST;
+  delete [] _polishRecordGEN;
+  delete [] _ESTiidLocation;
+  delete [] _GENiidLocation;
+}
+
+
+//  Returns a new list (owned by the caller) of all polishes whose EST
+//  iid is 'iid'.  The list is empty if the iid is larger than any in
+//  the file or has no polishes.  The index is loaded or built on
+//  first use.  The file position is changed.
+//
+sim4polishList*
+sim4polishFile::getEST(uint32 iid) {
+  sim4polishList *l = new sim4polishList();
+
+  //  _maxEST and the lookup tables do not exist until the index
+  //  does; without this, every iid would look out of range.
+  //
+  if (_polishRecord == 0L)
+    buildIndex();
+
+  if (iid >= _maxEST)
+    //fprintf(stderr, "Invalid EST iid "uint32FMT", max is "uint32FMT"\n", iid, _maxEST), exit(1);
+    return(l);
+
+  uint32  i = _ESTiidLocation[iid];
+
+  if (i == ~uint32ZERO)
+    return(l);
+
+  //  Walk the EST-sorted index from the first record for this iid.
+  //  The index tells us where the run ends, so we never step past
+  //  the last record.
+  //
+  for (; (i < _polishRecordLen) && (_polishRecord[ _polishRecordEST[i] ]._ESTiid == iid); i++) {
+    setPosition(_polishRecordEST[i]);
+
+    sim4polish *p = new sim4polish(_file, _style);
+
+    //  Stop if the file no longer agrees with the index.
+    if ((p->_numExons == 0) || (p->_estID != iid)) {
+      delete p;
+      break;
+    }
+
+    l->push(p);
+  }
+
+  return(l);
+}
+
+
+//  Not implemented: reports that on stderr and terminates the
+//  program.  The arguments are ignored.
+//
+sim4polishList*
+sim4polishFile::getGEN(uint32 iid, uint32 lo, uint32 hi) {
+  sim4polishList *none = 0L;
+
+  fprintf(stderr, "sim4polishFile::getGEN() not implemented.  Sorry.\n");
+  exit(1);
+
+  return(none);
+}
+
+
+//  Reads a polish at the current file position and returns it; the
+//  caller owns the result.
+//
+sim4polish*
+sim4polishFile::getNext(void) {
+  sim4polish *next = new sim4polish(_file, _style);
+
+  return(next);
+}
+
+
+//  Seek to the start of the ordinal'th polish in the file, loading or
+//  building the index first if there is none.  An ordinal past the
+//  last record is fatal.
+//
+void
+sim4polishFile::setPosition(uint32 ordinal) {
+
+  if (_polishRecord == 0L)
+    buildIndex();
+
+  if (ordinal < _polishRecordLen) {
+    _file->seek(_polishRecord[ordinal]._fileposition);
+    return;
+  }
+
+  fprintf(stderr, "Failed to reposition %s to record " uint32FMT ", only " uint32FMT " records\n", _path, ordinal, _polishRecordLen);
+  exit(1);
+}
+
+
+//  Read 'count' items of 'size' bytes each from F into 'dst'.  A
+//  short read -- an I/O error or a truncated file -- is reported
+//  against 'nam' and is fatal.
+//
+static
+void
+s4pf_readOrDie(void *dst, size_t size, size_t count, FILE *F, const char *nam) {
+  if (fread(dst, size, count, F) != count) {
+    fprintf(stderr, "Failed to read '%s': %s\n", nam, ferror(F) ? strerror(errno) : "file is truncated");
+    exit(1);
+  }
+}
+
+
+//  Load the cached index '${path}.sim4polishFile', if it exists.  If
+//  it doesn't, nothing is changed (in particular _polishRecord stays
+//  0L, which is how buildIndex() knows it must scan the file).  An
+//  index that exists but can't be opened or read, or that has the
+//  wrong magic number, is fatal.
+//
+void
+sim4polishFile::loadIndex(void) {
+  char   magic[8] = {0};
+  char   cigam[8] = { 's', '4', 'p', 'F', 'i', 'l', 'e', '1'};
+  int    len = strlen(_path) + 32;   //  room for the suffix and the NUL
+  char  *nam = new char [len];
+
+  sprintf(nam, "%s.sim4polishFile", _path);
+
+  if (fileExists(nam)) {
+    //  errno is only meaningful once fopen() has reported failure.
+    errno = 0;
+    FILE *F = fopen(nam, "rb");
+    if (F == NULL)
+      fprintf(stderr, "Failed to open '%s': %s\n", nam, strerror(errno)), exit(1);
+
+    s4pf_readOrDie(magic, sizeof(char), 8, F, nam);
+    if (strncmp(magic, cigam, 8) != 0)
+      fprintf(stderr, "Failed to open '%s': Not a sim4polishFile!\n", nam), exit(1);
+
+    s4pf_readOrDie(&_polishRecordLen, sizeof(uint32), 1, F, nam);
+
+    _polishRecord    = new polishRecord [_polishRecordLen];
+    _polishRecordEST = new uint32       [_polishRecordLen];
+    _polishRecordGEN = new uint32       [_polishRecordLen];
+
+    s4pf_readOrDie(_polishRecord,    sizeof(polishRecord), _polishRecordLen, F, nam);
+    s4pf_readOrDie(_polishRecordEST, sizeof(uint32),       _polishRecordLen, F, nam);
+    s4pf_readOrDie(_polishRecordGEN, sizeof(uint32),       _polishRecordLen, F, nam);
+
+    s4pf_readOrDie(&_maxEST, sizeof(uint32), 1, F, nam);
+    s4pf_readOrDie(&_maxGEN, sizeof(uint32), 1, F, nam);
+
+    _ESTiidLocation = new uint32 [_maxEST];
+    _GENiidLocation = new uint32 [_maxGEN];
+
+    s4pf_readOrDie(_ESTiidLocation, sizeof(uint32), _maxEST, F, nam);
+    s4pf_readOrDie(_GENiidLocation, sizeof(uint32), _maxGEN, F, nam);
+
+    fclose(F);
+  }
+
+  delete [] nam;
+}
+
+
+//  Write 'count' items of 'size' bytes each from 'src' to F.  A short
+//  write is reported against 'nam' and is fatal.
+//
+static
+void
+s4pf_writeOrDie(const void *src, size_t size, size_t count, FILE *F, const char *nam) {
+  if (fwrite(src, size, count, F) != count) {
+    fprintf(stderr, "Failed to write '%s': %s\n", nam, strerror(errno));
+    exit(1);
+  }
+}
+
+
+//  Save the index to '${path}.sim4polishFile', in the layout that
+//  loadIndex() reads: magic, record count, the three record arrays,
+//  the two iid limits, the two iid lookup tables.  Any failure is
+//  fatal.
+//
+void
+sim4polishFile::saveIndex(void) {
+  char   cigam[8] = { 's', '4', 'p', 'F', 'i', 'l', 'e', '1'};
+  int    len = strlen(_path) + 32;   //  room for the suffix and the NUL
+  char  *nam = new char [len];
+
+  sprintf(nam, "%s.sim4polishFile", _path);
+
+  //  errno is only meaningful once fopen() has reported failure.
+  errno = 0;
+  FILE *F = fopen(nam, "wb");
+  if (F == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", nam, strerror(errno)), exit(1);
+
+  s4pf_writeOrDie(cigam, sizeof(char), 8, F, nam);
+
+  s4pf_writeOrDie(&_polishRecordLen, sizeof(uint32),       1,                F, nam);
+  s4pf_writeOrDie( _polishRecord,    sizeof(polishRecord), _polishRecordLen, F, nam);
+  s4pf_writeOrDie( _polishRecordEST, sizeof(uint32),       _polishRecordLen, F, nam);
+  s4pf_writeOrDie( _polishRecordGEN, sizeof(uint32),       _polishRecordLen, F, nam);
+
+  s4pf_writeOrDie(&_maxEST,          sizeof(uint32),       1,                F, nam);
+  s4pf_writeOrDie(&_maxGEN,          sizeof(uint32),       1,                F, nam);
+  s4pf_writeOrDie( _ESTiidLocation,  sizeof(uint32),       _maxEST,          F, nam);
+  s4pf_writeOrDie( _GENiidLocation,  sizeof(uint32),       _maxGEN,          F, nam);
+
+  //  Buffered data is only known to be written once fclose() succeeds.
+  if (fclose(F) != 0)
+    fprintf(stderr, "Failed to write '%s': %s\n", nam, strerror(errno)), exit(1);
+
+  delete [] nam;
+}
+
+
+//  Make the index available: load it from disk if a cached copy
+//  exists, otherwise scan the whole file, build it and save it.
+//
+//  The scan records, for every polish, its file position, EST and
+//  genomic iids and genomic extent; then builds two permutations of
+//  the records (sorted by EST iid and by genomic iid) and, for each
+//  iid, the position of its first record in the sorted order
+//  (~uint32ZERO if the iid has no records).  The file is left
+//  positioned at the start.
+//
+void
+sim4polishFile::buildIndex(void) {
+
+  loadIndex();
+
+  if (_polishRecord != 0L)
+    return;
+
+  fprintf(stderr, "sim4polishFile::buildIndex()-- building index for '%s'\n", _path);
+
+  _file->seek(0);
+
+  //  Allocate a bunch of space for stuff
+  //
+  _polishRecordLen = 0;
+  _polishRecordMax = 3355443;  //  ~128MB for all three
+  _polishRecord    = new polishRecord [_polishRecordMax];
+
+
+  //  Read all polishes, storing stuff, reallocating more space if
+  //  needed.  A polish without exons means nothing (more) could be
+  //  read -- an empty file or trailing junk -- and ends the scan; it
+  //  has no exon to take the genomic extent from.
+  //
+  off_t       fp = _file->tell();
+  sim4polish *p  = new sim4polish(_file, _style);
+
+  while ((p) && (p->_numExons > 0)) {
+    if (_polishRecordLen >= _polishRecordMax) {
+      _polishRecordMax *= 2;
+      polishRecord *n = new polishRecord [_polishRecordMax];
+      memcpy(n, _polishRecord, sizeof(polishRecord) * _polishRecordLen);
+      delete [] _polishRecord;
+      _polishRecord = n;
+    }
+
+    _polishRecord[_polishRecordLen]._fileposition = fp;
+    _polishRecord[_polishRecordLen]._ESTiid       = p->_estID;
+    _polishRecord[_polishRecordLen]._GENiid       = p->_genID;
+    _polishRecord[_polishRecordLen]._GENlo        = p->_exons[0]._genFrom;
+    _polishRecord[_polishRecordLen]._GENhi        = p->_exons[p->_numExons-1]._genTo;
+    _polishRecordLen++;
+
+    if ((_polishRecordLen & 0xfff) == 0) {
+      fprintf(stderr, "polishes: " uint32FMT "\r", _polishRecordLen);
+      fflush(stderr);
+    }
+
+    delete p;
+
+    fp = _file->tell();
+    if (_file->eof())
+      p = NULL;
+    else
+      p = new sim4polish(_file, _style);
+  }
+
+  delete p;
+
+
+  //  Sort the indices by EST and GEN iid's.  Pain in the butt, we
+  //  need to access _polishRecord to sort *EST and *GEN, but
+  //  qsort() doesn't support that.
+  //
+  //  Three solutions:
+  //    1) use a custom sort
+  //    2) use a global pointer to _polishRecord
+  //    3) use a temporary array holding the sort key and position
+  //
+  //  We use (2).
+  //
+  _polishRecordEST = new uint32 [_polishRecordLen];
+  _polishRecordGEN = new uint32 [_polishRecordLen];
+
+  for (uint32 i=0; i<_polishRecordLen; i++)
+    _polishRecordEST[i] = _polishRecordGEN[i] = i;
+
+  __prsa = _polishRecord;
+  qsort(_polishRecordEST, _polishRecordLen, sizeof(uint32), __prsaEST);
+  qsort(_polishRecordGEN, _polishRecordLen, sizeof(uint32), __prsaGEN);
+  __prsa = 0L;
+
+
+  //  Scan the sorted lists, record the first location of each iid.
+  //  The largest iid is the last one in sorted order -- if there is
+  //  a last one.
+  //
+  _maxEST = 0;
+  _maxGEN = 0;
+
+  if (_polishRecordLen > 0) {
+    _maxEST = _polishRecord[ _polishRecordEST[_polishRecordLen-1] ]._ESTiid + 1;
+    _maxGEN = _polishRecord[ _polishRecordGEN[_polishRecordLen-1] ]._GENiid + 1;
+  }
+
+  _ESTiidLocation = new uint32 [_maxEST];
+  _GENiidLocation = new uint32 [_maxGEN];
+
+  for (uint32 i=0; i<_maxEST; i++)
+    _ESTiidLocation[i] = ~uint32ZERO;
+  for (uint32 i=0; i<_polishRecordLen; i++) {
+    uint32 iid = _polishRecord[ _polishRecordEST[i] ]._ESTiid;
+    if (_ESTiidLocation[iid] == ~uint32ZERO)
+      _ESTiidLocation[iid] = i;
+  }
+
+  for (uint32 i=0; i<_maxGEN; i++)
+    _GENiidLocation[i] = ~uint32ZERO;
+  for (uint32 i=0; i<_polishRecordLen; i++) {
+    uint32 iid = _polishRecord[ _polishRecordGEN[i] ]._GENiid;
+    if (_GENiidLocation[iid] == ~uint32ZERO)
+      _GENiidLocation[iid] = i;
+  }
+
+
+  //  Save the index
+  //
+  saveIndex();
+
+  //  Be nice, reposition the file to the start.
+  //
+  _file->seek(0);
+}
+
+
diff --git a/libsim4/sim4polish/sim4polishFile.H b/libsim4/sim4polish/sim4polishFile.H
new file mode 100644
index 0000000..3acda68
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishFile.H
@@ -0,0 +1,107 @@
+#ifndef SIM4POLISHFILE
+#define SIM4POLISHFILE
+
+#include "sim4polish.H"
+#include "sim4polishList.H"
+
+//
+// (original motivation)
+// Needed: something to return polishes from a file without reading
+// in everything, and without doing lots of I/O.
+//
+// polishFile A(name)
+// sim4polishList l = A.getEST(id);
+// sim4polishList l = A.getGEN(id, lo=0, hi=0);
+//
+// sim4polish x = A.getNext();
+//
+// A.seek(polish-ordinal to seek to);
+//
+// On first invocation, it reads the whole file, building a map of
+// ESTid, GENid to file position. Maybe also store GENlo and GENhi
+// in this map. Map is stored on disk as "file.polishFileMap"
+//
+
+
+//
+// The first call to getEST(), getGEN() or setPosition() will cause a
+// whole file scan to be performed. From this, we build a list of
+// all the polishes, and their iid's. This index is cached on disk
+// as '${path}.sim4polishFile'
+//
+
+// Access to a file of polishes: sequentially with getNext(), or by
+// polish ordinal / EST iid using an index that is cached on disk.
+//
+class sim4polishFile {
+public:
+ sim4polishFile(char *path, sim4polishStyle style);
+ ~sim4polishFile();
+
+ // One more than the largest EST iid in the index -- NOT the number
+ // of ESTs present. Zero until the index has been loaded or built.
+ //
+ uint32 maxIID(void) {
+ return(_maxEST);
+ };
+
+ // Returns all matches with:
+ // ESTid == iid
+ // GENid == iid AND that overlap the range lo...hi
+ //
+ // The caller owns the returned list. getGEN() is not implemented;
+ // calling it terminates the program.
+ //
+ // N.B. getNext() doesn't really mean much after these.
+ //
+ sim4polishList *getEST(uint32 iid);
+ sim4polishList *getGEN(uint32 iid, uint32 lo=0, uint32 hi=~uint32ZERO);
+
+ // Returns the next polish in the file. The caller owns it.
+ //
+ sim4polish *getNext(void);
+
+ // Positions the file pointer to the ordinal'th polish in the file.
+ // getNext() will then return that polish. Loads or builds the
+ // index if needed; an ordinal past the last polish is fatal.
+ //
+ void setPosition(uint32 ordinal);
+
+ // 24 bytes/record. A typical large EST set has ~10 million
+ // matches, this fits into 240MB. That's reasonable.
+ //
+ // I really hate to make this public, but sorting needs it.
+ //
+ struct polishRecord {
+ off_t _fileposition; // where the polish starts in the file
+ uint32 _ESTiid;
+ uint32 _GENiid;
+ uint32 _GENlo; // _genFrom of the first exon
+ uint32 _GENhi; // _genTo of the last exon
+ };
+
+private:
+ char *_path;
+ readBuffer *_file;
+
+ // One record for each polish, in the order they are in the file.
+ // One integer pointer into _polishRecord, sorted by either the EST
+ // or GEN iid.
+ //
+ uint32 _polishRecordLen;
+ uint32 _polishRecordMax;
+ polishRecord *_polishRecord;
+ uint32 *_polishRecordEST;
+ uint32 *_polishRecordGEN;
+
+ sim4polishStyle _style;
+
+ // One integer pointer for each iid we've seen. Pointer into
+ // _polishRecordEST or _polishRecordGEN. If memory is tight, we
+ // could binary search those arrays instead. An iid with no
+ // polishes is marked with ~uint32ZERO.
+ //
+ uint32 _maxEST;
+ uint32 _maxGEN;
+ uint32 *_ESTiidLocation;
+ uint32 *_GENiidLocation;
+
+ // loadIndex() reads the cached index if the file exists,
+ // saveIndex() writes it, and buildIndex() loads it or else scans
+ // the polish file to create (and save) it.
+ //
+ void loadIndex(void);
+ void saveIndex(void);
+ void buildIndex(void);
+};
+
+
+
+#endif // SIM4POLISHFILE
diff --git a/libsim4/sim4polish/sim4polishList.C b/libsim4/sim4polish/sim4polishList.C
new file mode 100644
index 0000000..eb25400
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishList.C
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include "bio++.H"
+
+#include "sim4polishList.H"
+#include "sim4polishReader.H"
+
+//  An empty list, with room for four polishes before it must grow.
+//
+sim4polishList::sim4polishList() {
+  max  = 4;
+  len  = 0;
+  list = new sim4polish* [max];
+}
+
+//  A list holding every alignment in 'filename', in file order.  The
+//  list owns the polishes.
+//
+sim4polishList::sim4polishList(char const *filename) {
+  len  = 0;
+  max  = 4;
+  list = new sim4polish* [max];
+
+  sim4polishReader *R = new sim4polishReader(filename);
+
+  //  Use the reader's pointer-returning nextAlignment(): the
+  //  by-reference variant deletes the polish passed in before it
+  //  reads the next one, which would free every polish we had just
+  //  stored.
+  //
+  for (sim4polish *p = R->nextAlignment(); p != 0L; p = R->nextAlignment())
+    push(p);
+
+  delete R;
+}
+
+//  The list owns its polishes: destroy each of them, then the
+//  pointer array.
+//
+sim4polishList::~sim4polishList() {
+  sim4polish **end = list + len;
+
+  for (sim4polish **p = list; p != end; p++)
+    delete *p;
+
+  delete [] list;
+}
+
+//  Append 'p' to the list, which takes ownership of it.  A null
+//  pointer is ignored.  The storage doubles whenever it is full.
+//
+void
+sim4polishList::push(sim4polish *p) {
+
+  if (p != 0L) {
+    if (len >= max) {
+      max *= 2;
+
+      sim4polish **bigger = new sim4polish* [max];
+
+      memcpy(bigger, list, len * sizeof(sim4polish*));
+      delete [] list;
+
+      list = bigger;
+    }
+
+    list[len] = p;
+    len++;
+  }
+}
+
+//  Destroy the i'th polish and close the gap by moving the following
+//  entries down one place.  An out-of-range 'i' is ignored.
+//
+void
+sim4polishList::remove(uint32 i) {
+
+  if (i < len) {
+    delete list[i];
+
+    len--;
+
+    //  The regions overlap, so memmove() rather than memcpy().
+    memmove(list + i, list + i + 1, (len - i) * sizeof(sim4polish *));
+  }
+}
+
+
+//  Sort the list in place with qsort(), ordering polishes with
+//  s4p_estIDcompare().
+//
+void
+sim4polishList::sortBycDNAIID(void) {
+  int (*byEST)(const void *, const void *) = s4p_estIDcompare;
+
+  qsort(list, len, sizeof(list[0]), byEST);
+}
+
+//  Sort the list in place with qsort(), ordering polishes with
+//  s4p_genIDcompare().
+//
+void
+sim4polishList::sortByGenomicIID(void) {
+  int (*byGEN)(const void *, const void *) = s4p_genIDcompare;
+
+  qsort(list, len, sizeof(list[0]), byGEN);
+}
+
+
+//  Keep only the polishes whose percent identity is at least 'minI'
+//  AND whose query coverage (querySeqIdentity) is at least 'minC'.
+//  Survivors are packed to the front in their original order; the
+//  others are destroyed.
+//
+void
+sim4polishList::filterByQuality(uint32 minI, uint32 minC) {
+  uint32  kept = 0;
+
+  for (uint32 idx=0; idx<len; idx++) {
+    sim4polish *cand = list[idx];
+
+    bool  good = ((cand->_percentIdentity  >= minI) &&
+                  (cand->_querySeqIdentity >= minC));
+
+    if (good) {
+      list[kept++] = cand;
+    } else {
+      delete cand;
+      list[idx] = 0L;
+    }
+  }
+
+  len = kept;
+}
diff --git a/libsim4/sim4polish/sim4polishList.H b/libsim4/sim4polish/sim4polishList.H
new file mode 100644
index 0000000..405bad4
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishList.H
@@ -0,0 +1,37 @@
+#ifndef SIM4_POLISH_LIST_H
+#define SIM4_POLISH_LIST_H
+
+#include "sim4polish.H"
+
+//
+// A list of sim4polishes
+//
+
+// A growable array of sim4polish pointers. The list owns the polishes
+// it holds and deletes them in remove(), filterByQuality() and its
+// destructor.
+//
+class sim4polishList {
+public:
+ // An empty list, or one loaded with every alignment in 'filename'.
+ sim4polishList();
+ sim4polishList(char const *filename);
+ ~sim4polishList();
+
+ // push() appends 'p' (a null pointer is ignored); remove() deletes
+ // the i'th polish and shifts the later ones down.
+ void push(sim4polish *p);
+ void remove(uint32 i);
+
+ uint32 length(void) { return(len); };
+
+ // Both return the i'th polish, or 0L if 'i' is out of range. The
+ // list keeps ownership.
+ sim4polish *operator[](uint32 i) { if (i >= len) return(0L); return(list[i]); };
+ sim4polish *get(uint32 i) { if (i >= len) return(0L); return(list[i]); };
+
+ // Sort in place using s4p_estIDcompare / s4p_genIDcompare.
+ void sortBycDNAIID(void);
+ void sortByGenomicIID(void);
+
+ // Removes polishes below the specified quality: percent identity
+ // below minI, or query coverage below minC.
+ void filterByQuality(uint32 minI, uint32 minC);
+
+private:
+ uint32 len; // number of polishes in the list
+ uint32 max; // allocated size of 'list'
+ sim4polish **list;
+};
+
+
+#endif // SIM4_POLISH_LIST_H
diff --git a/libsim4/sim4polish/sim4polishReader.C b/libsim4/sim4polish/sim4polishReader.C
new file mode 100644
index 0000000..3d170ec
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishReader.C
@@ -0,0 +1,102 @@
+#include "sim4polishReader.H"
+
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "sim4polishWriter.H"
+
+//  Open a file of alignments for sequential reading.  If 'name' is
+//  not null that file is opened; otherwise the input is whatever
+//  'writer' hands over through surrenderToReader().  The style
+//  (sim4db, GFF3 or ATAC) is decided from the first line, and an
+//  unrecognized or unsupported format is fatal.
+//
+sim4polishReader::sim4polishReader(const char *name, sim4polishWriter *writer) {
+
+  if ((name == 0L) && (writer == 0L))
+    fprintf(stderr, "sim4polishReader()-- Failed to open input: neither a file name nor a writer was supplied.\n"), exit(1);
+
+  if (name)
+    _rb = new readBuffer(name);
+  else
+    _rb = new readBuffer(writer->surrenderToReader());
+
+  //  Attempt to decide on the style of the input, based on the first line.
+  //
+  //  The buffer is zeroed and never completely filled, so it holds a
+  //  terminated (possibly empty) string whatever read() does with an
+  //  empty file or a very long line.
+
+  char          firstLine[1024] = {0};
+  splitToWords  firstWords;
+
+  _rb->read(firstLine, sizeof(firstLine) - 1, '\n');
+  _rb->seek(0);
+
+  //  This fixes a bug in split to words, that white space at the end isn't trimmed.
+  chomp(firstLine);
+
+  //fprintf(stderr, "sim4polishReader()-- '%s'\n", firstLine);
+
+  //  With nothing on the first line there are no words to examine.
+  if (firstLine[0] == 0)
+    fprintf(stderr, "sim4polishReader()-- Failed to open '%s' for reading: unknown format.\n",
+            _rb->filename()), exit(1);
+
+  firstWords.split(firstLine);
+
+  //  NOTE(review): the version tests below presume the header line
+  //  also carries a version word -- confirm what splitToWords
+  //  returns for a word that isn't there.
+
+  if (strcmp(firstWords[0], "sim4begin") == 0) {
+    _style = sim4polishS4DB;
+
+  } else if (strcmp(firstWords[0], "##gff-version") == 0) {
+    if (strcmp(firstWords[1], "3") == 0)
+      _style = sim4polishGFF3;
+    else
+      fprintf(stderr, "sim4polishReader()-- GFF format version %s not supported; only version 3 is supported.\n",
+              firstWords[1]), exit(1);
+
+  } else if ((strcmp(firstWords[0], "!format") == 0) &&
+             (strcmp(firstWords[1], "atac") == 0)) {
+    if (strcmp(firstWords[2], "1.0") == 0)
+      _style = sim4polishATAC;
+    else
+      fprintf(stderr, "sim4polishReader()-- ATAC format version %s not supported; only version 1.0 is supported.\n",
+              firstWords[2]), exit(1);
+
+  } else {
+    fprintf(stderr, "sim4polishReader()-- Failed to open '%s' for reading: unknown format.\n",
+            _rb->filename()), exit(1);
+  }
+}
+
+
+//  Close the input by destroying the read buffer.
+//
+sim4polishReader::~sim4polishReader() {
+  readBuffer *doomed = _rb;
+
+  _rb = 0L;
+
+  delete doomed;
+}
+
+
+//  Returns the next alignment, owned by the caller, or 0L when the
+//  input is exhausted or what was read has no exons.
+//
+sim4polish *
+sim4polishReader::nextAlignment(void) {
+
+  if (_rb->eof())
+    return(0L);
+
+  sim4polish *p = new sim4polish(_rb, _style);
+
+  if (p->_numExons > 0)
+    return(p);
+
+  //  Nothing usable was read.
+  delete p;
+
+  return(0L);
+}
+
+
+//  Replaces 'p' with the next alignment.  Whatever 'p' pointed to on
+//  entry is DELETED first, so the caller must not keep another
+//  pointer to it.  Returns true if an alignment was read; otherwise
+//  returns false with 'p' set to 0L.
+//
+bool
+sim4polishReader::nextAlignment(sim4polish * &p) {
+
+  delete p;
+  p = 0L;
+
+  if (_rb->eof())
+    return(false);
+
+  sim4polish *fresh = new sim4polish(_rb, _style);
+
+  if (fresh->_numExons == 0) {
+    delete fresh;
+    return(false);
+  }
+
+  p = fresh;
+
+  return(true);
+}
diff --git a/libsim4/sim4polish/sim4polishReader.H b/libsim4/sim4polish/sim4polishReader.H
new file mode 100644
index 0000000..5a03d37
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishReader.H
@@ -0,0 +1,29 @@
+#ifndef SIM4POLISHREADER
+#define SIM4POLISHREADER
+
+#include "sim4polish.H"
+#include "util++.H"
+
+// Simple class to read the contents of a file of alignments. The file can be either sim4db, gff3
+// or atac format. No support for random access is provided, just sequential access.
+
+class sim4polishWriter;
+
+class sim4polishReader {
+public:
+ // Opens 'name'; if 'name' is null, reads what 'writer' surrenders
+ // instead. The format is detected from the first line, and an
+ // unknown format terminates the program.
+ //
+ sim4polishReader(const char *name, sim4polishWriter *writer=0L);
+ ~sim4polishReader();
+
+ // Returns the next alignment in the file. NULL is returned if there are no more alignments.
+ //
+ // The second form deletes whatever 'p' points to before reading,
+ // then stores the new alignment in 'p'; it returns false (and
+ // leaves 'p' null) when there are no more alignments.
+ //
+ sim4polish *nextAlignment(void);
+ bool nextAlignment(sim4polish * &p);
+
+ // The format that was detected when the file was opened.
+ //
+ sim4polishStyle getsim4polishStyle(void) { return _style; }
+
+private:
+ readBuffer *_rb;
+ sim4polishStyle _style;
+};
+
+#endif // SIM4POLISHREADER
diff --git a/libsim4/sim4polish/sim4polishWriter.C b/libsim4/sim4polish/sim4polishWriter.C
new file mode 100644
index 0000000..e59ded0
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishWriter.C
@@ -0,0 +1,181 @@
+#include "sim4polishWriter.H"
+
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+static
+const
+char base64[65] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
+
+// Open an output stream for alignments and write the header for the chosen format.
+//
+//   name   -- output file name.  NULL or "-" selects stdout.  Not used when 'hidden' is true.
+//   style  -- output format; decides which s4p_putHeader*() is called.
+//   hidden -- write to a temporary file from makeTempFile() instead of a named file.
+//
+// If a named file cannot be opened, or its name does not fit in _otName, a message is printed
+// to stderr and the process exits.
+//
+sim4polishWriter::sim4polishWriter(const char *name, sim4polishStyle style, bool hidden) {
+
+  if (hidden) {
+    //  We are supposed to be a hidden file.
+    strcpy(_otName, "(hidden)");
+    _otFile = makeTempFile(NULL);
+
+  } else if ((name == 0L) || ((name[0] == '-') && (name[1] == 0))) {
+    //  We are stdout.
+    strcpy(_otName, "(stdout)");
+    _otFile = stdout;
+
+  } else {
+    //  Nope, just a regular ol' file.  _otName holds FILENAME_MAX bytes including the
+    //  terminating NUL, so a name of exactly FILENAME_MAX letters does not fit either.
+    if (strlen(name) >= FILENAME_MAX)
+      fprintf(stderr, "sim4polishWriter()-- Failed to open '%s' for writing: file name too long.\n",
+              name), exit(1);
+
+    strcpy(_otName, name);
+
+    //  fopen() reports failure by returning NULL; errno is only meaningful in that case.
+    errno = 0;
+    _otFile = fopen(name, "w");
+    if (_otFile == 0L)
+      fprintf(stderr, "sim4polishWriter()-- Failed to open '%s' for writing: %s\n",
+              _otName, strerror(errno)), exit(1);
+  }
+
+  _style = style;
+
+  switch (_style) {
+    case sim4polishS4DB:  s4p_putHeaderS4DB();  break;
+    case sim4polishGFF3:  s4p_putHeaderGFF3();  break;
+    case sim4polishATAC:  s4p_putHeaderATAC();  break;
+  }
+
+  memset(_sourceName,    0, sizeof(char) * 32);
+  memset(_matchIDprefix, 0, sizeof(char) * 32);
+  memset(_matchIDsalt,   0, sizeof(char) * 8);
+
+  _matchID = 0;
+
+  //  Construct a match ID salt based on the current time and process ID.  The process ID is
+  //  placed above the low 32 bits of the current time, and the low 42 bits of that number are
+  //  written out six bits per letter, least significant bits first.
+
+  uint64  saltTime    = (uint64)getTime();  //  returns a double, fraction of seconds
+  uint64  saltPID     = (uint64)getpid();
+
+  uint64  saltInteger = (saltPID << 32) | (saltTime);
+  uint64  saltMask    = uint64MASK(6);
+
+  _matchIDsalt[0] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //   6 bits
+  _matchIDsalt[1] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  12 bits
+  _matchIDsalt[2] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  18 bits
+  _matchIDsalt[3] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  24 bits
+  _matchIDsalt[4] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  30 bits
+  _matchIDsalt[5] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  36 bits
+  _matchIDsalt[6] = base64[saltInteger & saltMask];  saltInteger >>= 6;  //  42 bits
+  _matchIDsalt[7] = 0;
+
+#if DEBUG_WRITER
+  fprintf(stderr, "SALT: " uint64FMT " + " uint64FMT " = %s\n",
+          saltPID, saltTime, _matchIDsalt);
+#endif
+}
+
+// Header hook for the sim4db format; writes nothing.
+void
+sim4polishWriter::s4p_putHeaderS4DB() {
+}
+
+// Header hook for the ATAC format; writes nothing.
+void
+sim4polishWriter::s4p_putHeaderATAC() {
+}
+
+// Header hook for the GFF3 format; writes the version pragma line to the output.
+void
+sim4polishWriter::s4p_putHeaderGFF3() {
+  fputs("##gff-version 3\n", _otFile);
+}
+
+// Close the output.  A hidden file that still holds its stream (surrenderToReader() was never
+// called) draws a warning, because its contents can no longer be retrieved.
+sim4polishWriter::~sim4polishWriter() {
+
+  if (_otFile) {
+    if (strcmp(_otName, "(hidden)") == 0)
+      fprintf(stderr, "sim4polishWriter()-- WARNING: Hidden output file was lost; surrenderToReader() never called.\n");
+
+    //  The stream is closed in every case, including a lost hidden file, so it does not leak.
+    //  fclose() reports failure through its return value; errno then says why.
+    errno = 0;
+    if (fclose(_otFile) != 0)
+      fprintf(stderr, "sim4polishWriter()-- WARNING: Failed to close '%s': %s\n",
+              _otName, strerror(errno));
+  }
+
+  _otFile = NULL;
+}
+
+
+// Hand the output stream over to a reader.  The stream is flushed and rewound, and this writer
+// forgets it, so the destructor will neither close it nor complain.  Returns NULL if the stream
+// was already surrendered.
+FILE *
+sim4polishWriter::surrenderToReader(void) {
+  FILE *retval = _otFile;
+
+  _otFile = 0L;
+
+  //  fflush(NULL) would flush every open stream and rewind(NULL) is undefined, so a second
+  //  call must not reach them.
+  if (retval == 0L)
+    return(0L);
+
+  fflush(retval);
+  rewind(retval);
+
+  return(retval);
+}
+
+
+// Set the source name.  Only the part after the last '/' of sourceName is kept, and at most 31
+// letters of it; a warning is printed to stderr if that part is empty or had to be truncated.
+// A NULL sourceName is treated as an empty name.
+void
+sim4polishWriter::setSourceName(const char *sourceName) {
+
+  if (sourceName == 0L)
+    sourceName = "";
+
+  //  Find the last slash, if any.
+  const char *lastSlash = strrchr(sourceName, '/');
+
+  //  If found, advance one letter to the first letter in the name, otherwise
+  //  reset lastSlash to the first letter in the sourceName.
+  if (lastSlash)
+    lastSlash++;
+  else
+    lastSlash = sourceName;
+
+  if (lastSlash[0] == 0)
+    fprintf(stderr, "sim4polishWriter()-- source name is empty, or ends in a '/'; no source name used.\n");
+
+  //  _sourceName has room for 31 letters plus the terminating NUL, so a name of exactly 32
+  //  letters is truncated too and must be reported.
+  if (strlen(lastSlash) > 31)
+    fprintf(stderr, "sim4polishWriter()-- source name too long, truncating to 31 letters.\n");
+
+  strncpy(_sourceName, lastSlash, 31);
+  _sourceName[31] = 0;
+}
+
+
+// Set the match ID prefix.  At most 31 letters are kept; a warning is printed to stderr if the
+// prefix had to be truncated.  A NULL prefix is treated as an empty prefix.
+void
+sim4polishWriter::setMatchIDPrefix(const char *prefix) {
+
+  if (prefix == 0L)
+    prefix = "";
+
+  //  _matchIDprefix has room for 31 letters plus the terminating NUL, so a prefix of exactly
+  //  32 letters is truncated too and must be reported.
+  if (strlen(prefix) > 31)
+    fprintf(stderr, "sim4polishWriter()-- ID prefix too long, truncating to 31 letters.\n");
+
+  strncpy(_matchIDprefix, prefix, 31);
+  _matchIDprefix[31] = 0;
+}
+
+
+// Convert one alignment to text in this writer's format and append it to the output.
+// Nothing is written if 'out' is NULL, if the output stream has been surrendered, or if no
+// string was produced for the alignment.
+void
+sim4polishWriter::writeAlignment(sim4polish *out) {
+  char *str = 0L;
+
+  if (out == 0L)
+    return;
+
+  if (_otFile == 0L) {
+    fprintf(stderr, "sim4polishWriter()-- WARNING: alignment not written; '%s' is no longer open.\n", _otName);
+    return;
+  }
+
+  switch (_style) {
+    case sim4polishS4DB:
+      str = out->s4p_polishToStringS4DB();
+      break;
+    case sim4polishGFF3:
+      str = out->s4p_polishToStringGFF3();
+      break;
+    case sim4polishATAC:
+      str = out->s4p_polishToStringATAC();
+      break;
+  }
+
+  //  An unrecognized style, or a failed conversion, leaves str NULL; fputs() must not see that.
+  if (str == 0L)
+    return;
+
+  fputs(str, _otFile);
+
+  delete [] str;
+}
diff --git a/libsim4/sim4polish/sim4polishWriter.H b/libsim4/sim4polish/sim4polishWriter.H
new file mode 100644
index 0000000..934069c
--- /dev/null
+++ b/libsim4/sim4polish/sim4polishWriter.H
@@ -0,0 +1,62 @@
+#ifndef SIM4POLISHWRITER
+#define SIM4POLISHWRITER
+
+#include "sim4polish.H"
+
+// Simple class for writing a file of alignments. The file can be either sim4db, gff3 or atac
+// format. This class makes sure that the file has a header (if needed) and takes care of
+// generating unique IDs for each gff3 file.
+
+// Writer for a stream of alignments in one fixed format.
+class sim4polishWriter {
+public:
+ // 'name' is the output file; NULL or "-" writes to stdout. 'style' selects the output format.
+ // If 'hidden' is true the output goes to a temporary file instead and 'name' is not used.
+ //
+ sim4polishWriter(const char *name, sim4polishStyle style, bool hidden=false);
+ ~sim4polishWriter();
+
+private:
+ // If this was opened as a 'hidden' file, this is the only way to ever see the results again.
+ // The destructor complains if this method is never called on a hidden file.
+ //
+ // The stream is flushed and rewound before it is returned. It is private; only the friend
+ // class sim4polishReader can call it.
+ //
+ FILE *surrenderToReader(void);
+
+ // Called by the constructor to write the file header for the chosen format.
+ //
+ void s4p_putHeaderS4DB();
+ void s4p_putHeaderGFF3();
+ void s4p_putHeaderATAC();
+
+ friend class sim4polishReader;
+
+public:
+ // The source name is listed in column 2 of a GFF3. It is supposed to be the name of the
+ // program that generated these alignments.
+ //
+ // The source name MUST be shorter than 32 letters, and if it contains /'s (a path) only the
+ // last component is used.
+ //
+ void setSourceName(const char *sourceName);
+
+ // The match ID prefix is used in column 9, when constructing the file-unique ID for each
+ // alignment. If not set, it will default to the sourceName + a short salt derived from the
+ // current time and process id + an integer count starting at zero. The default is chosen so
+ // that the ID's in resulting files are more-or-less globally unique.
+ //
+ // The match ID prefix MUST be shorter than 32 letters.
+ //
+ void setMatchIDPrefix(const char *prefix);
+
+ // Add an alignment to the file.
+ //
+ void writeAlignment(sim4polish *out);
+
+private:
+ // _otName is the file name, or the marker "(hidden)" or "(stdout)". _otFile is the open
+ // stream; it is NULL after surrenderToReader().
+ char _otName[FILENAME_MAX];
+ FILE *_otFile;
+
+ sim4polishStyle _style;
+
+ // Last path component of the source name, NUL terminated.
+ char _sourceName[32];
+
+ // Pieces of the per-alignment ID: prefix, 7-letter salt built by the constructor, and a
+ // counter that the constructor sets to zero.
+ char _matchIDprefix[32];
+ char _matchIDsalt[8];
+ uint64 _matchID;
+};
+
+#endif // SIM4POLISHWRITER
diff --git a/libutil/Make.include b/libutil/Make.include
new file mode 100644
index 0000000..e3717c0
--- /dev/null
+++ b/libutil/Make.include
@@ -0,0 +1,62 @@
+# -*- makefile -*-
+
+$(eval $(call Include,$/mt19937ar/))
+$(eval $(call Include,$/kazlib/))
+
+src := $/bigQueue.C \
+ $/bigQueue.H \
+ $/bitOperations.h \
+ $/bitPackedArray.C \
+ $/bitPackedArray.H \
+ $/bitPackedFile.C \
+ $/bitPackedFile.H \
+ $/bitPacking.h \
+ $/eliasDeltaEncoding.h \
+ $/eliasGammaEncoding.h \
+ $/endianess.H \
+ $/fibonacciEncoding.h \
+ $/fibonacciNumbers.C \
+ $/file.c \
+ $/generalizedUnaryEncoding.h \
+ $/intervalList.H \
+ $/logMsg.H \
+ $/md5.c \
+ $/palloc.c \
+ $/qsort_mt.c \
+ $/readBuffer.C \
+ $/readBuffer.H \
+ $/recordFile.C \
+ $/recordFile.H \
+ $/speedCounter.C \
+ $/speedCounter.H \
+ $/splitToWords.H \
+ $/sweatShop.C \
+ $/sweatShop.H \
+ $/uint32List.H \
+ $/unaryEncoding.h \
+ $/util++.H \
+ $/util.c \
+ $/util.h
+
+# Broken
+# $/bzipBuffer.C
+# $/bzipBuffer.H
+
+# Executables
+# $/unaryEncodingTester.C
+
+
+# Partition ${src} by suffix: *.c are the C sources, *.C the C++ sources, and *.H / *.h the
+# headers.  The library listed for this directory is libutil.a.
+$/.C_SRCS :=$(filter %.c,${src})
+$/.CXX_SRCS :=$(filter %.C,${src})
+$/.CXX_INCS :=$(filter %.H,${src}) $(filter %.h,${src})
+$/.CXX_LIBS :=$/libutil.a
+
+$/.CLEAN := $/*.o
+
+$/libutil.a: ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o} \
+ $/mt19937ar/mt19937ar.o \
+ $/kazlib/dict.o \
+ $/kazlib/except.o \
+ $/kazlib/hash.o \
+ $/kazlib/list.o \
+ $/kazlib/sfx.o
diff --git a/libutil/NOTES b/libutil/NOTES
new file mode 100644
index 0000000..cb1d6f2
--- /dev/null
+++ b/libutil/NOTES
@@ -0,0 +1,10 @@
+Various notes that should be turned into real documentation
+
+merStream
+ (need to check this)
+ posInSeq() is relative to the start of the current sequence.
+ posInStream() is relative to the start of the source file.
+
+Whitespace in the sequence / source files messes up position calculations.
+
+The FastAstream positions are NOT sequence positions.
diff --git a/libutil/bigQueue.C b/libutil/bigQueue.C
new file mode 100644
index 0000000..d425f57
--- /dev/null
+++ b/libutil/bigQueue.C
@@ -0,0 +1,343 @@
+#include "bigQueue.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+// Kaz Kylheku <kaz at ashi.footprints.net> library.
+#include "kazlib/dict.h"
+#include "kazlib/except.h"
+#include "kazlib/hash.h"
+#include "kazlib/list.h"
+#include "kazlib/sfx.h"
+
+// qsort and kazlib are incompatible. qsort passes a pointer to the data, kazlib passes
+// the data (which it assumes is a pointer to begin with).
+
+
+// Common body of all the constructors.
+//
+//   sortfcn     -- comparison function.  If non-NULL, added objects are buffered in memory so
+//                  they can be sorted before being written.
+//   readfcn     -- reads one object from a file; if NULL, fread() of objectSize bytes is used.
+//   writfcn     -- writes one object to a file; if NULL, fwrite() of objectSize bytes is used.
+//   killfcn     -- destroys one object; if NULL, free() is used.
+//   objectSize  -- size of one object, in bytes.
+//   memoryToUse -- size of the sort buffer, in MB; only used when sortfcn is given.
+//   tmppath     -- passed to makeTempFile(); copied.
+//   filename    -- copied into _saveFile.
+//
+void
+bigQueue::_initialize(int    (*sortfcn)(const void *a, const void *b),
+                      bool   (*readfcn)(FILE *f, void *a),
+                      bool   (*writfcn)(FILE *f, void *a),
+                      void   (*killfcn)(void *a),
+                      uint32   objectSize,
+                      uint32   memoryToUse,
+                      char    *tmppath,
+                      char    *filename) {
+  _saveFile = 0L;
+  _tmpPath  = 0L;
+
+  if (filename) {
+    _saveFile = new char [strlen(filename) + 1];
+    strcpy(_saveFile, filename);
+  }
+  if (tmppath) {
+    _tmpPath = new char [strlen(tmppath) + 1];
+    strcpy(_tmpPath, tmppath);
+  }
+
+  _sortFunction = sortfcn;
+  _writFunction = writfcn;
+  _readFunction = readfcn;
+  _killFunction = killfcn;
+
+  _objectSize  = objectSize;
+  _memoryToUse = memoryToUse;
+
+  _maxOpenFiles      = getdtablesize() - 8;
+  _numTemporaryFiles = 0;
+  _numMergeFiles     = 0;
+
+  _temporaryFiles = new FILE* [_maxOpenFiles];
+
+  for (uint32 i=0; i<_maxOpenFiles; i++)
+    _temporaryFiles[i] = 0L;
+
+  //  Open the first temporary file for writing.
+  //
+  _temporaryFiles[_numTemporaryFiles++] = makeTempFile(_tmpPath);
+
+  //  XXX: It would be rather convenient if we could get another file
+  //  handle given an existing handle (no, dup(2) doesn't do that).
+  //  In particular, we want two file pointers, one for read, one for
+  //  write.
+  //
+  //_inputFile = fdopen(dup(fileno(_temporaryFiles[0])), "w+");
+
+  //  Not used at present, but never leave the member holding garbage.
+  _inputFile = 0L;
+
+  _thingBuffer = new uint64 [_objectSize / 8 + 1];
+
+  _bufferMax = 0;
+  _bufferLen = 0;
+  _buffer    = 0L;
+
+  if (_sortFunction) {
+    //  The object count is computed in 64 bits but stored in 32.  Clamp it so that it can
+    //  neither wrap around nor become zero -- add() always stores at least one pointer.
+    uint64 maxObjects = (uint64)memoryToUse * 1024 * 1024 / ((uint64)sizeof(void *) + objectSize);
+
+    if (maxObjects < 1)
+      maxObjects = 1;
+    if (maxObjects > (uint64)(~(uint32)0))
+      maxObjects = (uint64)(~(uint32)0);
+
+    _bufferMax = (uint32)maxObjects;
+    _bufferLen = 0;
+    _buffer    = new void* [_bufferMax];
+  }
+}
+
+
+
+// Close every temporary file, destroy any objects still waiting in the sort buffer, and
+// release all storage allocated by _initialize().
+bigQueue::~bigQueue() {
+  delete [] _saveFile;
+  delete [] _tmpPath;
+
+  //  Slots can be NULL; fclose(NULL) is undefined.
+  for (uint32 i=0; i<_numTemporaryFiles; i++)
+    if (_temporaryFiles[i])
+      fclose(_temporaryFiles[i]);
+
+  delete [] _temporaryFiles;
+
+  //fclose(_inputFile);
+
+  //  Destroy the buffered objects first, then the array of pointers that held them.
+  clearBuffer();
+
+  delete [] _buffer;
+
+  //  Allocated as an array of uint64 but stored in a void*; delete it as what it is.
+  delete [] (uint64 *)_thingBuffer;
+}
+
+
+
+
+
+
+// Append one object to the queue.
+//
+// Without a sort buffer the object is written immediately to the newest temporary file.  With
+// a sort buffer the pointer itself is stored in the buffer; when the buffer is full it is first
+// sorted and written out, and a new temporary file is started.
+void
+bigQueue::add(void *thing) {
+
+  //  Unsorted: straight to disk.
+  if (_buffer == 0L) {
+    FILE *out = _temporaryFiles[_numTemporaryFiles-1];
+
+    if (_writFunction)
+      (*_writFunction)(out, thing);
+    else
+      fwrite(thing, _objectSize, 1, out);
+
+    return;
+  }
+
+  //  Sorted: make room if the buffer is full, merging first if we are about to run out of
+  //  file slots.
+  if (_bufferLen >= _bufferMax) {
+    sortAndWriteBuffer();
+
+    if (_numTemporaryFiles+1 >= _maxOpenFiles)
+      mergeTemporaryFiles();
+
+    _temporaryFiles[_numTemporaryFiles++] = makeTempFile(_tmpPath);
+  }
+
+  _buffer[_bufferLen++] = thing;
+}
+
+
+
+// Sort the buffered objects, write them to the newest temporary file, rewind that file, and
+// empty the buffer.  Does nothing when the buffer is empty.
+void
+bigQueue::sortAndWriteBuffer(void) {
+
+  if (_bufferLen == 0)
+    return;
+
+  FILE *out = _temporaryFiles[_numTemporaryFiles-1];
+
+  qsort(_buffer, _bufferLen, sizeof(void *), _sortFunction);
+
+  for (uint32 i=0; i<_bufferLen; i++) {
+    if (_writFunction)
+      (*_writFunction)(out, _buffer[i]);
+    else
+      fwrite(_buffer[i], _objectSize, 1, out);
+  }
+
+  //  Leave the file positioned at its start.
+  fflush(out);
+  ::rewind(out);
+
+  clearBuffer();
+}
+
+
+// Destroy every object in the sort buffer -- with the kill function if one was supplied, with
+// free() otherwise -- and mark the buffer empty.
+void
+bigQueue::clearBuffer(void) {
+
+  for (uint32 i=0; i<_bufferLen; i++) {
+    if (_killFunction)
+      (*_killFunction)(_buffer[i]);
+    else
+      free(_buffer[i]);
+  }
+
+  _bufferLen = 0;
+}
+
+
+// Merge all the temporary files into one new temporary file, which becomes
+// _temporaryFiles[0].  With a single temporary file there is nothing to merge.  In both cases
+// _temporaryFiles[0] is rewound before returning.
+//
+// NOTE(review): _sortFunction is handed both to qsort() (which passes pointers to the array
+// elements) and to dict_create() (which passes the keys themselves) -- confirm the comparator
+// copes with both conventions.
+//
+// NOTE(review): kazlib dictionaries presumably reject duplicate keys unless dict_allow_dupes()
+// has been called; objects that compare equal at the head of two files would then collide --
+// confirm.
+//
+void
+bigQueue::mergeTemporaryFiles(void) {
+
+  if (_numTemporaryFiles > 1) {
+    dict_t    *sorted;
+    dnode_t   *nodes = new dnode_t [_maxOpenFiles];
+
+    //  To be efficient, we need to maintain a sorted queue of the head
+    //  elements of each temporary file.  A red-black tree would do
+    //  nicely, eh?
+    //
+    sorted = dict_create(DICTCOUNT_T_MAX, _sortFunction);
+
+    //  Grab the first thing off each file, insert it into the dictionary.
+    //  The 'key' is our chunk of data, and the 'value' is the file number
+    //  it came from.
+    //
+    for (uint32 i=0; i<_numTemporaryFiles; i++) {
+      if (_temporaryFiles[i]) {
+
+        //  Rewind all the temporary files.  XXXX This is probably done
+        //  already.
+        //
+        ::rewind(_temporaryFiles[i]);
+
+        void *thing = malloc(_objectSize);
+
+        if (_readFunction)
+          (*_readFunction)(_temporaryFiles[i], thing);
+        else
+          fread(thing, _objectSize, 1, _temporaryFiles[i]);
+
+        if (feof(_temporaryFiles[i])) {
+          //  Empty file.  Nothing was read into 'thing', so release it here; it is never
+          //  inserted into the tree and nobody else would free it.
+          fclose(_temporaryFiles[i]);
+          _temporaryFiles[i] = 0L;
+          free(thing);
+        } else {
+          //  initialize the node with the value
+          dnode_init(&nodes[i], (void *)(unsigned long)i);
+
+          //  insert the node into the tree using the key
+          dict_insert(sorted, &nodes[i], thing);
+        }
+      }
+    }
+
+    FILE *mergeFile = makeTempFile(_tmpPath);
+
+    //  while there is stuff in the tree
+
+    while (dict_isempty(sorted) == 0) {
+
+      //  pop the head element off, and print it
+      dnode_t  *head = dict_first(sorted);
+
+      //  XXX: should be const thing
+
+      void  *thing  = (void *)dnode_getkey(head);
+      long   fileid = (long)dnode_get(head);
+
+      if (_writFunction)
+        (*_writFunction)(mergeFile, thing);
+      else
+        fwrite(thing, _objectSize, 1, mergeFile);
+
+      //  delete the node from the tree
+      dict_delete(sorted, head);
+
+      //  destroy the thing
+      if (_killFunction)
+        (*_killFunction)(thing);
+      else
+        free(thing);
+
+      //  load the next element from the same file that the head was
+      //  from (that's stored as the value of the head element)
+
+      thing = malloc(_objectSize);
+
+      if (_readFunction)
+        (*_readFunction)(_temporaryFiles[fileid], thing);
+      else
+        fread(thing, _objectSize, 1, _temporaryFiles[fileid]);
+
+      //  if there was a next element in that file, insert it
+      //  into the tree.  if not, close the temporary file.
+      //
+      if (feof(_temporaryFiles[fileid])) {
+        fclose(_temporaryFiles[fileid]);
+        _temporaryFiles[fileid] = 0;
+        free(thing);
+      } else {
+        //  initialize the node with the value
+        dnode_init(&nodes[fileid], (void *)fileid);
+
+        //  insert the node into the tree using the key
+        dict_insert(sorted, &nodes[fileid], thing);
+      }
+    }
+
+    //  The tree is empty now.  dict_destroy() releases the dictionary object that
+    //  dict_create() allocated; dict_free() would only release nodes, of which none remain.
+    dict_destroy(sorted);
+    delete [] nodes;
+
+    _numTemporaryFiles = 1;
+    _temporaryFiles[0] = mergeFile;
+  }
+
+  ::rewind(_temporaryFiles[0]);
+
+#if 0
+  fclose(_inputFile);
+  errno = 0;
+  _inputFile = fdopen(dup(fileno(_temporaryFiles[0])), "w+");
+  if (errno)
+    fprintf(stderr, "bigQueue::mergeTemporaryFiles()-- _inputFile = fdopen() failed: %s\n", strerror(errno)), exit(1);
+
+  ::rewind(_inputFile);
+#endif
+}
+
+
+// Read the next object from the first temporary file into _thingBuffer.  Returns false once
+// that file reports end-of-file, true otherwise.
+bool
+bigQueue::next(void) {
+  FILE *in = _temporaryFiles[0];
+
+  if (_readFunction)
+    (*_readFunction)(in, _thingBuffer);
+  else
+    fread(_thingBuffer, _objectSize, 1, in);
+
+  return(feof(in) ? false : true);
+}
+
+
+// Return the buffer that next() reads objects into.
+void*
+bigQueue::get(void) {
+  void *current = _thingBuffer;
+
+  return(current);
+}
+
+// Go back to the start of the first temporary file and load its first object.
+void
+bigQueue::rewind(void) {
+  FILE *in = _temporaryFiles[0];
+
+  ::rewind(in);
+
+  next();
+}
+
+// Placeholder: saving is not implemented.  Only prints a message to stderr; 'filepath' is
+// not used.
+void
+bigQueue::save(char *filepath) {
+  fputs("bigQueue::save()-- not implemented.\n", stderr);
+}
+
+// Finish sorting: write out whatever is still buffered, then merge the temporary files.
+void
+bigQueue::sort(void) {
+
+  sortAndWriteBuffer();
+
+  mergeTemporaryFiles();
+}
+
+// Flush the newest temporary file.
+void
+bigQueue::flush(void) {
+  FILE *newest = _temporaryFiles[_numTemporaryFiles-1];
+
+  fflush(newest);
+}
diff --git a/libutil/bigQueue.H b/libutil/bigQueue.H
new file mode 100644
index 0000000..acef4e6
--- /dev/null
+++ b/libutil/bigQueue.H
@@ -0,0 +1,150 @@
+#ifndef BIGQUEUE_H
+#define BIGQUEUE_H
+
+#include "util++.H"
+
+// A disk-backed list of user-defined objects.
+//
+// At creation time, you can opt to have it sorted, using a
+// user-defined function.
+
+// A list based on a variable length object (let alone a sort!)
+// must use some form of dereferencing scheme. So, if you want to
+// use variable length records, you have to use pointers, and supply
+// functions to do everything (compare, read, write).
+//
+// On the other hand, it would be much more convenient (to use) if we
+// used objects (would need copy, compare, read, write).
+//
+// 1) Restrict to void*, fixed block size, functions for compare,
+// destroy. read, write and copy done with fread(), fwrite() and
+// memcpy().
+//
+// 2) Restrict to void*, functions for compare, read, write and
+// destroy. I allocate an array of pointers. Assume shallow copies
+// are ok (qsort will be used). On construct, we need to know the
+// size of the data so we know how many objects to buffer before
+// sorting and writing. It's also possible to use fread() and
+// fwrite().
+//
+// 3) Restrict to objects, operators for copy, compare, read, write,
+// default construct, destroy. I allocate an array of objects.
+//
+// 1 is the easiest to write, 2 and 3 are conceptually the same. 1
+// cannot write out deep data (pointer to string). 2 is a trivial
+// extension to 1, and fixes that. 3 is the correct version, but I
+// don't want to deal with stream I/O. So, 2 it is.
+//
+
+// Disk-backed queue of objects, optionally sorted. Objects are handled through void pointers;
+// wherever a read, write or kill function is NULL, fread(), fwrite() or free() is used instead.
+class bigQueue {
+public:
+ // Initialize the bigQueue for anonymous storage, with an
+ // option to later save the array.
+ //
+ bigQueue(bool (*readfcn)(FILE *, void *),
+ bool (*writfcn)(FILE *, void *),
+ void (*killfcn)(void *),
+ uint32 objectSize,
+ char *tmpPath) {
+ _initialize(0L, readfcn, writfcn, killfcn, objectSize, 0, tmpPath, 0L);
+ };
+
+
+ // Initialize the bigQueue with a file of objects, presumably from
+ // a previous invocation of bigQueue.
+ //
+ // NOTE(review): _initialize() only stores a copy of 'filename'; nothing visible in
+ // bigQueue.C reads that file -- confirm whether loading is implemented.
+ //
+ bigQueue(bool (*readfcn)(FILE *, void *),
+ bool (*writfcn)(FILE *, void *),
+ void (*killfcn)(void *),
+ uint32 objectSize,
+ char *tmpPath,
+ char *filename) {
+ _initialize(0L, readfcn, writfcn, killfcn, objectSize, 0, tmpPath, filename);
+ };
+
+
+ // Initialize the bigQueue for sorting.
+ //
+ // memoryToUse is in MB; it sizes the in-memory buffer of object pointers.
+ //
+ bigQueue(int (*sortfcn)(const void *a, const void *b),
+ bool (*readfcn)(FILE *, void *),
+ bool (*writfcn)(FILE *, void *),
+ void (*killfcn)(void *),
+ uint32 objectSize,
+ uint32 memoryToUse,
+ char *tmpPath) {
+ _initialize(sortfcn, readfcn, writfcn, killfcn, objectSize, memoryToUse, tmpPath, 0L);
+ };
+
+private:
+ // Shared body of the three constructors.
+ void _initialize(int (*sortfcn)(const void *a, const void *b),
+ bool (*readfcn)(FILE *f, void *a),
+ bool (*writfcn)(FILE *f, void *a),
+ void (*killfcn)(void *),
+ uint32 objectSize,
+ uint32 memoryToUse,
+ char *tmppath,
+ char *filename);
+
+public:
+ ~bigQueue();
+
+ // Add elements to the end of the array.
+ //
+ // When sorting, the pointer itself is kept in the buffer and the object is later destroyed
+ // with the kill function (or free()). When not sorting, the object is written immediately
+ // and the pointer is not kept.
+ void add(void *);
+
+ // We are designed for streaming access.
+ //
+ // next() loads the next object and returns false at end of file; get() returns a pointer to
+ // the loaded object.
+ bool next(void);
+ void *get(void);
+
+ // Rewind to the start. Sortable must be sorted.
+ void rewind(void);
+
+ // Save the anonymous array into a real file.
+ // (Not implemented in bigQueue.C; it only prints a message.)
+ void save(char *filepath);
+
+ // Sort the sortable. Flush the flushable.
+ void sort(void);
+ void flush(void);
+
+private:
+ void sortAndWriteBuffer(void);
+ void clearBuffer(void);
+ void mergeTemporaryFiles(void);
+
+ char *_saveFile;
+ char *_tmpPath;
+
+ int (*_sortFunction)(const void *a, const void *b);
+ bool (*_writFunction)(FILE *f, void *a);
+ bool (*_readFunction)(FILE *f, void *a);
+ void (*_killFunction)(void *a);
+
+ uint32 _objectSize;
+ uint32 _memoryToUse;
+
+ uint32 _maxOpenFiles;
+ uint32 _numTemporaryFiles;
+ uint32 _numMergeFiles;
+
+ // _temporaryFiles is all the opened output files. If we aren't
+ // sorting, then only the first one is opened.
+ //
+ // _inputFile is a dup of the first temporary file. If we are
+ // sorting, and you start reading before you sort, then you'll get
+ // a very short read.
+ //
+ // NOTE(review): in bigQueue.C every use of _inputFile is commented out or under #if 0;
+ // reads go through _temporaryFiles[0] instead.
+ //
+ FILE **_temporaryFiles;
+ FILE *_inputFile;
+
+ // Stores things read back from disk, for return to the user.
+ // Currently just one, but should be extended to many.
+ //
+ // Allocated in _initialize() as an array of uint64.
+ //
+ void *_thingBuffer;
+
+ // Sort buffer: _buffer holds up to _bufferMax object pointers, _bufferLen of them in use.
+ // _buffer is NULL when not sorting.
+ uint32 _bufferMax;
+ uint32 _bufferLen;
+ void **_buffer;
+};
+
+
+
+#endif // BIGQUEUE_H
diff --git a/libutil/bitOperations.h b/libutil/bitOperations.h
new file mode 100644
index 0000000..c5e20d9
--- /dev/null
+++ b/libutil/bitOperations.h
@@ -0,0 +1,157 @@
+#ifndef BRI_BITS_H
+#define BRI_BITS_H
+
+// For dealing with the bits in bytes.
+
+// I wish I could claim these.
+//
+// Freed, Edwin E. 1983. "Binary Magic Number" Dr. Dobbs Journal
+// Vol. 78 (April) pp. 24-37
+//
+// Supposedly tells us how to reverse the bits in a word, count the number
+// of set bits in a word and more.
+//
+// A bit of verbiage on counting the number of set bits. The naive way
+// is to loop and shift:
+//
+// uint32 r = uint32ZERO;
+// while (x) {
+// r++;
+// x >>= 1;
+// }
+// return(r);
+//
+// http://remus.rutgers.edu/~rhoads/Code/bitcount3.c has an optimized
+// method:
+//
+// x -= (0xaaaaaaaa & x) >> 1;
+// x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+// x += x >> 4;
+// x &= 0x0f0f0f0f;
+// x += x >> 8;
+// x += x >> 16;
+// x &= 0x000000ff;
+// return(x);
+//
+// No loops!
+//
+// Freed's methods are easier to understand, and just as fast.
+//
+// Using our bit counting routines, Ross Lippert suggested a nice
+// way of computing log2 -- use log2 shifts to fill up the lower
+// bits, then count bits. See logBaseTwo*()
+//
+
+
+// Reverse the order of the bits in a 32-bit word: adjacent bits are swapped, then adjacent
+// pairs, nibbles, bytes, and finally the two 16-bit halves.
+inline
+uint32
+reverseBits32(uint32 x) {
+  x = ((x & uint32NUMBER(0xaaaaaaaa)) >>  1) | ((x & uint32NUMBER(0x55555555)) <<  1);
+  x = ((x & uint32NUMBER(0xcccccccc)) >>  2) | ((x & uint32NUMBER(0x33333333)) <<  2);
+  x = ((x & uint32NUMBER(0xf0f0f0f0)) >>  4) | ((x & uint32NUMBER(0x0f0f0f0f)) <<  4);
+  x = ((x & uint32NUMBER(0xff00ff00)) >>  8) | ((x & uint32NUMBER(0x00ff00ff)) <<  8);
+  x = ((x & uint32NUMBER(0xffff0000)) >> 16) | ((x & uint32NUMBER(0x0000ffff)) << 16);
+
+  return(x);
+}
+
+// Reverse the order of the bits in a 64-bit word: adjacent bits are swapped, then adjacent
+// pairs, nibbles, bytes, 16-bit groups, and finally the two 32-bit halves.
+inline
+uint64
+reverseBits64(uint64 x) {
+  x = ((x & uint64NUMBER(0xaaaaaaaaaaaaaaaa)) >>  1) | ((x & uint64NUMBER(0x5555555555555555)) <<  1);
+  x = ((x & uint64NUMBER(0xcccccccccccccccc)) >>  2) | ((x & uint64NUMBER(0x3333333333333333)) <<  2);
+  x = ((x & uint64NUMBER(0xf0f0f0f0f0f0f0f0)) >>  4) | ((x & uint64NUMBER(0x0f0f0f0f0f0f0f0f)) <<  4);
+  x = ((x & uint64NUMBER(0xff00ff00ff00ff00)) >>  8) | ((x & uint64NUMBER(0x00ff00ff00ff00ff)) <<  8);
+  x = ((x & uint64NUMBER(0xffff0000ffff0000)) >> 16) | ((x & uint64NUMBER(0x0000ffff0000ffff)) << 16);
+  x = ((x & uint64NUMBER(0xffffffff00000000)) >> 32) | ((x & uint64NUMBER(0x00000000ffffffff)) << 32);
+
+  return(x);
+}
+
+
+// PREFETCH(x) hints that the memory at address x is about to be read. With GCC 3.4 or newer
+// it expands to __builtin_prefetch() with rw=0 and locality=0; with any other compiler it
+// expands to nothing.
+#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+#define PREFETCH(x) __builtin_prefetch((x), 0, 0)
+#else
+#define PREFETCH(x)
+#endif
+
+
+
+
+// Amazingly, this is slower. From what I can google, the builtin
+// is using the 2^16 lookup table method - so a 64-bit popcount does
+// 4 lookups in the table and sums. Bad cache performance in codes
+// that already have bad cache performance, I'd guess.
+//
+//#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+//#define BUILTIN_POPCOUNT
+//#endif
+
+#ifdef BUILTIN_POPCOUNT
+
+// Number of set bits in x, using the compiler builtin. Only compiled when BUILTIN_POPCOUNT
+// is defined.
+inline
+uint32
+countNumberOfSetBits32(uint32 x) {
+ return(__builtin_popcount(x));
+}
+
+// Number of set bits in x, using the compiler builtin. Only compiled when BUILTIN_POPCOUNT
+// is defined.
+inline
+uint64
+countNumberOfSetBits64(uint64 x) {
+ return(__builtin_popcountll(x));
+}
+
+#else
+
+// Number of set bits in a 32-bit word.  Each step adds neighbouring fields together: single
+// bits into 2-bit sums, those into 4-bit sums, and so on up to one 32-bit sum.
+inline
+uint32
+countNumberOfSetBits32(uint32 x) {
+  x = (x & uint32NUMBER(0x55555555)) + ((x >>  1) & uint32NUMBER(0x55555555));
+  x = (x & uint32NUMBER(0x33333333)) + ((x >>  2) & uint32NUMBER(0x33333333));
+  x = (x & uint32NUMBER(0x0f0f0f0f)) + ((x >>  4) & uint32NUMBER(0x0f0f0f0f));
+  x = (x & uint32NUMBER(0x00ff00ff)) + ((x >>  8) & uint32NUMBER(0x00ff00ff));
+  x = (x & uint32NUMBER(0x0000ffff)) + ((x >> 16) & uint32NUMBER(0x0000ffff));
+
+  return(x);
+}
+
+// Number of set bits in a 64-bit word.  Each step adds neighbouring fields together: single
+// bits into 2-bit sums, those into 4-bit sums, and so on up to one 64-bit sum.
+inline
+uint64
+countNumberOfSetBits64(uint64 x) {
+  x = (x & uint64NUMBER(0x5555555555555555)) + ((x >>  1) & uint64NUMBER(0x5555555555555555));
+  x = (x & uint64NUMBER(0x3333333333333333)) + ((x >>  2) & uint64NUMBER(0x3333333333333333));
+  x = (x & uint64NUMBER(0x0f0f0f0f0f0f0f0f)) + ((x >>  4) & uint64NUMBER(0x0f0f0f0f0f0f0f0f));
+  x = (x & uint64NUMBER(0x00ff00ff00ff00ff)) + ((x >>  8) & uint64NUMBER(0x00ff00ff00ff00ff));
+  x = (x & uint64NUMBER(0x0000ffff0000ffff)) + ((x >> 16) & uint64NUMBER(0x0000ffff0000ffff));
+  x = (x & uint64NUMBER(0x00000000ffffffff)) + ((x >> 32) & uint64NUMBER(0x00000000ffffffff));
+
+  return(x);
+}
+
+#endif
+
+
+
+// Copies the highest set bit of x into every lower position, then counts the set bits.  The
+// result is the position of the highest set bit plus one: 0 for x == 0, 1 for x == 1,
+// 4 for x == 8.
+inline
+uint32
+logBaseTwo32(uint32 x) {
+
+  for (uint32 shift=1; shift<32; shift<<=1)
+    x |= x >> shift;
+
+  return(countNumberOfSetBits32(x));
+}
+
+// Copies the highest set bit of x into every lower position, then counts the set bits.  The
+// result is the position of the highest set bit plus one: 0 for x == 0, 1 for x == 1,
+// 4 for x == 8.
+inline
+uint64
+logBaseTwo64(uint64 x) {
+
+  for (uint32 shift=1; shift<64; shift<<=1)
+    x |= x >> shift;
+
+  return(countNumberOfSetBits64(x));
+}
+
+
+
+
+#endif // BRI_BITS_H
diff --git a/libutil/bitPackedArray.C b/libutil/bitPackedArray.C
new file mode 100644
index 0000000..e3318b7
--- /dev/null
+++ b/libutil/bitPackedArray.C
@@ -0,0 +1,100 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+
+#include "util++.H"
+
+// Create an empty array of valueWidth-bit values, stored in segments of segmentSize KB each.
+// No segment is allocated yet; only the table of 16 segment pointers.
+bitPackedArray::bitPackedArray(uint32 valueWidth, uint32 segmentSize) {
+  uint64  bitsPerSegment = (uint64)segmentSize * 1024 * 8;
+
+  _valueWidth       = valueWidth;
+  _segmentSize      = segmentSize;
+  _nextElement      = 0;
+  _valuesPerSegment = bitsPerSegment / (uint64)valueWidth;
+
+  _maxSegments      = 16;
+  _numSegments      = 0;
+  _segments         = new uint64 * [_maxSegments];
+}
+
+
+// Release every allocated segment, then the table of segment pointers.
+bitPackedArray::~bitPackedArray() {
+  uint32 seg = 0;
+
+  while (seg < _numSegments)
+    delete [] _segments[seg++];
+
+  delete [] _segments;
+}
+
+
+// Return element idx.  If idx is at or beyond the first invalid element, a message is printed
+// to stderr and the marker value 0xdeadbeefdeadbeef is returned instead.
+uint64
+bitPackedArray::get(uint64 idx) {
+  uint64 s = idx / _valuesPerSegment;
+  uint64 p = _valueWidth * (idx % _valuesPerSegment);
+
+  if (idx >= _nextElement) {
+    //  _nextElement is the number of elements; reporting _nextElement-1 was off by one and
+    //  wrapped around to a huge number for an empty array.
+    fprintf(stderr, "bitPackedArray::get()-- element index " uint64FMT " is out of range, only " uint64FMT " elements.\n",
+            idx, _nextElement);
+    return(0xdeadbeefdeadbeefULL);
+  }
+
+  return(getDecodedValue(_segments[s], p, _valueWidth));
+}
+
+
+// Store val as element idx, growing the array as needed.  Every segment up to and including
+// the one holding idx is allocated, and new segments start out as all zero bits, so an element
+// that is within range but was never set reads back as zero.
+void
+bitPackedArray::set(uint64 idx, uint64 val) {
+  uint64 s = idx / _valuesPerSegment;
+  uint64 p = _valueWidth * (idx % _valuesPerSegment);
+
+  //fprintf(stderr, "s=" uint64FMT " p=" uint64FMT " segments=" uint64FMT "/" uint64FMT "\n", s, p, _numSegments, _maxSegments);
+
+  if (idx >= _nextElement)
+    _nextElement = idx+1;
+
+  //  Grow the table of segment pointers; existing segments are carried over.
+  if (s >= _maxSegments) {
+    _maxSegments = s + 16;
+    uint64 **S = new uint64 * [_maxSegments];
+    for (uint32 i=0; i<_numSegments; i++)
+      S[i] = _segments[i];
+    delete [] _segments;
+    _segments = S;
+  }
+
+  //  The trailing () value-initializes the words to zero.
+  while (_numSegments <= s)
+    _segments[_numSegments++] = new uint64 [_segmentSize * 1024 / 8]();
+
+  setDecodedValue(_segments[s], p, _valueWidth, val);
+}
+
+
+// Zero the contents of every allocated segment.  No storage is released.
+void
+bitPackedArray::clear(void) {
+  uint32 seg = 0;
+
+  while (seg < _numSegments) {
+    bzero(_segments[seg], _segmentSize * 1024);
+    seg++;
+  }
+}
+
+
+////////////////////////////////////////
+
+// Create an empty bit array stored in segments of segmentSize KB each.  No segment is
+// allocated yet; only the table of 16 segment pointers.
+bitArray::bitArray(uint32 segmentSize) {
+  uint64  bitsPerSegment = (uint64)segmentSize * 1024 * 8;
+
+  _segmentSize      = segmentSize;
+  _valuesPerSegment = bitsPerSegment;
+
+  _maxSegments      = 16;
+  _numSegments      = 0;
+  _segments         = new uint64 * [_maxSegments];
+}
+
+
+// Release every allocated segment, then the table of segment pointers.
+bitArray::~bitArray() {
+  uint32 seg = 0;
+
+  while (seg < _numSegments)
+    delete [] _segments[seg++];
+
+  delete [] _segments;
+}
+
+
+// Zero the contents of every allocated segment.  No storage is released.
+void
+bitArray::clear(void) {
+  uint32 seg = 0;
+
+  while (seg < _numSegments) {
+    bzero(_segments[seg], _segmentSize * 1024);
+    seg++;
+  }
+}
diff --git a/libutil/bitPackedArray.H b/libutil/bitPackedArray.H
new file mode 100644
index 0000000..8298ee9
--- /dev/null
+++ b/libutil/bitPackedArray.H
@@ -0,0 +1,318 @@
+#ifndef BITPACKEDARRAY_H
+#define BITPACKEDARRAY_H
+
+#undef DEBUG_BPH_ADD
+#undef DEBUG_BPH_GET
+
+////////////////////////////////////////
+//
+// bitPackedArray
+//
+// implements an integer array using bit-widths less than word-sizes,
+// e.g., a memory efficient way to store 23 bit numbers. Numbers may
+// be up to 64 bits wide.
+//
+// The array is variable length, and it is implemented as an array,
+// not a list or tree -- accessing element 1,000,000 will allocate
+// elements 0 through 999,999.
+//
+// Growable array of fixed-width unsigned values, packed into segments of 64-bit words.
+class bitPackedArray {
+public:
+
+ // Create a bitpacked array with elements of width 'width' using
+ // 'segmentSize' KB per segment. If you know your array is going
+ // to be much bigger or smaller, crank this value.
+ //
+ bitPackedArray(uint32 valueWidth, uint32 segmentSize = 1024);
+ ~bitPackedArray();
+
+ // No array operator is provided, because we cannot return a
+ // reference to a value that is split across two words (or even a
+ // reference to a value that is not bit aligned in the word).
+ //
+ // get() of an index at or beyond the first invalid element prints a message to stderr and
+ // returns 0xdeadbeefdeadbeef. set() grows the array to include idx.
+ //
+ uint64 get(uint64 idx);
+ void set(uint64 idx, uint64 val);
+
+ // Clear the array. Since the array is variable sized, you must add
+ // things to a new array before clearing it.
+ void clear(void);
+
+private:
+ uint32 _valueWidth; // bits per value
+ uint32 _segmentSize; // KB per segment
+ uint64 _nextElement; // the first invalid element
+ uint64 _valuesPerSegment;
+
+ // _segments has room for _maxSegments pointers, of which the first _numSegments are allocated.
+ uint64 _numSegments;
+ uint64 _maxSegments;
+ uint64 **_segments;
+};
+
+
+// An array of bits. Exactly the same as the bitPackedArray, but
+// optimized for width=1.
+//
+// Growable array of single bits, stored in segments of 64-bit words.
+class bitArray {
+public:
+
+ // 'segmentSize' is in KB per segment.
+ bitArray(uint32 segmentSize = 1024);
+ ~bitArray();
+
+ // Returns bit idx (0 or 1).
+ uint64 get(uint64 idx);
+
+ // Returns the previous value of bit idx, and sets the bit.
+ uint64 getAndSet(uint64 idx);
+
+ // Set or clear bit idx. Both grow the array to include idx.
+ void set(uint64 idx);
+ void clr(uint64 idx);
+
+ // Zero every allocated segment.
+ void clear(void);
+
+private:
+ // Make sure segment s is allocated.
+ void resize(uint64 s);
+
+ uint32 _segmentSize; // KB per segment
+ uint64 _valuesPerSegment; // bits per segment
+
+ // _segments has room for _maxSegments pointers, of which the first _numSegments are allocated.
+ uint64 _numSegments;
+ uint64 _maxSegments;
+ uint64 **_segments;
+};
+
+
+// Uses the bitPackedArray to implement a heap. The bitPackedArray is dynamically sized,
+// so this can be too.
+//
+// Binary heap of unsigned values stored in a bitPackedArray. The smallest value is kept at
+// index 0: add() swaps a value upward while its parent is larger, and get() swaps downward
+// while the smaller child is smaller than the parent. The children of the element at index i
+// are at 2i+1 and 2i+2.
+class bitPackedHeap {
+public:
+ // 'width' is the number of bits per value; 'size' is passed to the bitPackedArray as its
+ // segment size.
+ bitPackedHeap(uint32 width, uint64 size=16) {
+ _array = new bitPackedArray(width, size);
+ _array->set(0, 0);
+ _lastVal = 0;
+ };
+
+ ~bitPackedHeap() {
+ delete _array;
+ };
+
+ // Remove and return the value at the top of the heap. Returns all one bits (~uint64ZERO) if
+ // the heap is empty. Despite its name, 'biggestVal' holds the value from index 0, which the
+ // swaps below and in add() keep as the smallest.
+ uint64 get(void) {
+ uint64 biggestVal = ~uint64ZERO;
+
+ if (_lastVal == 0)
+ return(biggestVal);
+
+ biggestVal = _array->get(0);
+ _lastVal--;
+
+ if (_lastVal == 0)
+ return(biggestVal);
+
+ // Move the last value to the top, then push it down to where it belongs.
+ uint64 t = _array->get(_lastVal);
+
+ _array->set(0, t);
+
+ uint64 pidx = 0;
+ uint64 pval = t;
+ uint64 cidx = 1;
+ uint64 cval = 0; // set below
+
+ while (cidx < _lastVal) {
+ // Set cval here, so we can first test if cidx is in range.
+ cval = _array->get(cidx);
+
+ // Pick the smallest of the two kids
+ if (cidx+1 < _lastVal) {
+ t = _array->get(cidx+1);
+ if (cval > t) {
+ cidx++;
+ cval = t;
+ }
+ }
+
+#ifdef DEBUG_BPH_GET
+ fprintf(stderr, "test c="uint64FMT" and p="uint64FMT" lastVal="uint64FMT"\n",
+ cidx, pidx, _lastVal);
+ fprintf(stderr, "test c="uint64FMT"="uint64FMT"\n",
+ cidx, cval);
+ fprintf(stderr, "test p="uint64FMT"="uint64FMT"\n",
+ pidx, pval);
+ fprintf(stderr, "test c="uint64FMT"="uint64FMT" and p="uint64FMT"="uint64FMT"\n",
+ cidx, cval, pidx, pval);
+#endif
+
+ if (cval < pval) {
+
+#ifdef DEBUG_BPH_GET
+ fprintf(stderr, "swap c="uint64FMT"="uint64FMT" and p="uint64FMT"="uint64FMT"\n",
+ cidx, cval, pidx, pval);
+#endif
+
+ // Swap p and c
+ _array->set(pidx, cval);
+ _array->set(cidx, pval);
+
+ // Move down the tree -- pval doesn't change, we moved it into cidx!
+ pidx = cidx;
+ cidx = cidx * 2 + 1;
+ } else {
+ // Heap order holds; setting cidx out of range ends the loop.
+ cidx = _lastVal;
+ }
+ }
+
+ return(biggestVal);
+ };
+
+ // Insert a value: place it after the last element, then swap it upward while its parent is
+ // larger.
+ void add(uint64 value) {
+ uint64 cidx = _lastVal;
+ uint64 cval = value;
+ uint64 pidx = 0;
+ uint64 pval = 0;
+ bool more = false;
+
+#ifdef DEBUG_BPH_ADD
+ fprintf(stderr, "add c="uint64FMT"="uint64FMT" -- lastVal="uint64FMT"\n",
+ cidx, cval, _lastVal);
+#endif
+
+ _array->set(cidx, cval);
+
+ if (cidx > 0)
+ more = true;
+
+ while (more) {
+ pidx = (cidx-1) / 2;
+
+#ifdef DEBUG_BPH_ADD
+ fprintf(stderr, "more c="uint64FMT" and p="uint64FMT"\n", cidx, pidx);
+#endif
+
+ pval = _array->get(pidx);
+
+#ifdef DEBUG_BPH_ADD
+ fprintf(stderr, "test c="uint64FMT"="uint64FMT" and p="uint64FMT"="uint64FMT"\n",
+ cidx, cval, pidx, pval);
+#endif
+
+ if (pval > cval) {
+
+#ifdef DEBUG_BPH_ADD
+ fprintf(stderr, "swap c="uint64FMT"="uint64FMT" and p="uint64FMT"="uint64FMT"\n",
+ cidx, cval, pidx, pval);
+#endif
+
+ // Swap p and c
+ _array->set(cidx, pval);
+ _array->set(pidx, cval);
+
+ // Move up the tree -- cval doesn't change, we moved it into pidx!
+ cidx = pidx;
+ } else {
+ more = false;
+ }
+ // Reached the top of the heap.
+ if (cidx == 0)
+ more = false;
+ }
+
+ _lastVal++;
+
+ //dump();
+ };
+
+ // Print every element, in array order, to stderr.
+ void dump(void) {
+ for (uint32 i=0; i<_lastVal; i++)
+ fprintf(stderr, "HEAP["uint32FMT"]="uint64FMT"\n", i, _array->get(i));
+ }
+
+ // Empty the heap; the underlying array is zeroed but keeps its storage.
+ void clear(void) {
+ _array->clear();
+ _lastVal = 0;
+ };
+
+private:
+ bitPackedArray *_array;
+ uint64 _lastVal; // number of values in the heap; also the index where add() stores the next one
+};
+
+
+
+// Return bit idx (0 or 1).  A bit in a segment that has not been allocated has never been set,
+// and reads as 0.
+inline
+uint64
+bitArray::get(uint64 idx) {
+  uint64 s = idx / _valuesPerSegment;
+  uint64 p = idx % _valuesPerSegment;
+
+  //  Without this check, an index past the allocated segments would read beyond _segments.
+  if (s >= _numSegments)
+    return(0);
+
+  //  NOTE(review): the word mask has gaps (0x0000cfff...); it looks like a typo, but p >> 6
+  //  is already the word index, so it is kept unchanged here -- confirm and simplify.
+  uint64 wrd = (p >> 6) & 0x0000cfffffffffffllu;
+  uint64 bit = (p     ) & 0x000000000000003fllu;
+
+  return((_segments[s][wrd] >> bit) & 0x0000000000000001llu);
+}
+
+
+// Make sure segment s exists: grow the table of segment pointers if s does not fit, then
+// allocate every missing segment up to and including s.  New segments start out all zero.
+inline
+void
+bitArray::resize(uint64 s) {
+
+  if (s < _numSegments)
+    return;
+
+  //  Valid slots are 0 .. _maxSegments-1, so s == _maxSegments needs a bigger table too;
+  //  testing with '>' let the loop below write one slot past the end.
+  if (s >= _maxSegments) {
+    _maxSegments = s + 16;
+    uint64 **S = new uint64 * [_maxSegments];
+    for (uint32 i=0; i<_numSegments; i++)
+      S[i] = _segments[i];
+    delete [] _segments;
+    _segments = S;
+  }
+
+  //  The trailing () value-initializes the words to zero, so no bit starts out set.
+  while (_numSegments <= s)
+    _segments[_numSegments++] = new uint64 [_segmentSize * 1024 / 8]();
+}
+
+
+// Set bit idx and return its previous value (0 or 1).  The array is grown to include idx.
+inline
+uint64
+bitArray::getAndSet(uint64 idx) {
+  uint64 s = idx / _valuesPerSegment;
+  uint64 p = idx % _valuesPerSegment;
+
+  //  Like set() and clr(), make sure the segment exists before touching it.
+  resize(s);
+
+  uint64 wrd = (p >> 6) & 0x0000cfffffffffffllu;
+  uint64 bit = (p     ) & 0x000000000000003fllu;
+
+  uint64 ret = (_segments[s][wrd] >> bit) & 0x0000000000000001llu;
+
+  _segments[s][wrd] |= uint64ONE << bit;
+
+  return(ret);
+}
+
+
+// Set bit idx to 1, growing the array to include it.
+inline
+void
+bitArray::set(uint64 idx) {
+  uint64 seg    = idx / _valuesPerSegment;
+  uint64 offset = idx % _valuesPerSegment;
+
+  resize(seg);
+
+  //  Word within the segment, and bit within that word.
+  uint64 word  = (offset >> 6) & 0x0000cfffffffffffllu;
+  uint64 shift = (offset     ) & 0x000000000000003fllu;
+
+  uint64 *words = _segments[seg];
+
+  words[word] |= uint64ONE << shift;
+}
+
+
+// Set bit idx to 0, growing the array to include it.
+inline
+void
+bitArray::clr(uint64 idx) {
+  uint64 seg    = idx / _valuesPerSegment;
+  uint64 offset = idx % _valuesPerSegment;
+
+  resize(seg);
+
+  //  Word within the segment, and bit within that word.
+  uint64 word  = (offset >> 6) & 0x0000cfffffffffffllu;
+  uint64 shift = (offset     ) & 0x000000000000003fllu;
+
+  uint64 *words = _segments[seg];
+
+  words[word] &= ~(0x0000000000000001llu << shift);
+}
+
+
+#endif // BITPACKEDARRAY_H
diff --git a/libutil/bitPackedFile.C b/libutil/bitPackedFile.C
new file mode 100644
index 0000000..278cc56
--- /dev/null
+++ b/libutil/bitPackedFile.C
@@ -0,0 +1,473 @@
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+// N.B. any read() / write() pair (either order) must have a seek (or
+// a fflush) in between.
+
+// Opens the bitPackedFile 'name', whose data starts 'offset' bytes into
+// the file.
+//
+//   forceTruncate set  -- open read/write, creating and truncating
+//   file exists        -- open read-only
+//   otherwise          -- create and open read/write
+//
+// A 32-byte header (a 16-byte magic string and two 64-bit endianess check
+// words) is written to an empty file, or read back and verified from an
+// existing one; the header is hidden from the user by endianess_offset.
+// Any failure prints a message to stderr and calls exit(1).
+//
+// System calls are checked through their return values; errno is only
+// meaningful after a call has reported failure.
+//
+bitPackedFile::bitPackedFile(char const *name, uint64 offset, bool forceTruncate) {
+
+  _file = 0;
+  _name = new char [strlen(name) + 1];
+  strcpy(_name, name);
+
+#ifdef WITH_BZIP2
+  _bzFILE = 0L;
+  _bzerr  = 0;
+  _bzfile = 0L;
+#endif
+
+  _bfrmax = 1048576 / 8;
+  _bfr    = new uint64 [_bfrmax];
+  _pos    = uint64ZERO;
+  _bit    = uint64ZERO;
+
+  memset(_bfr, 0, sizeof(uint64) * _bfrmax);
+
+  _inCore         = false;
+  _bfrDirty       = false;
+  _forceFirstLoad = false;
+  _isReadOnly     = false;
+  _isBzip2        = false;
+
+  stat_seekInside   = uint64ZERO;
+  stat_seekOutside  = uint64ZERO;
+  stat_dirtyFlushes = uint64ZERO;
+
+  file_offset       = 0;
+  endianess_offset  = 0;
+  endianess_flipped = false;
+
+
+  // Try to open the original name -- we don't support compressed
+  // files for rewrite.  We just fail with a can't open message.
+  //
+  // To get read/write and create we have to use open(2), as mode
+  // "r+" of fopen(3) will not create.  (Yes, but w+ does, sigh.)
+  //
+  const mode_t perm = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
+
+  if (forceTruncate) {
+    errno = 0;
+    _file = open(_name, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, perm);
+    if (_file < 0)
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to open and truncate '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+  } else if (fileExists(_name)) {
+    errno = 0;
+    _file = open(_name, O_RDONLY | O_LARGEFILE, perm);
+    if (_file < 0)
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to open '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+    _isReadOnly = true;
+  } else {
+    errno = 0;
+    _file = open(_name, O_RDWR | O_CREAT | O_LARGEFILE, perm);
+    if (_file < 0)
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to open '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+  }
+
+  // Move to the correct position in the file.
+  //
+  file_offset = offset;
+  if (file_offset > 0) {
+    errno = 0;
+    if (lseek(_file, file_offset, SEEK_SET) == (off_t)-1)
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to seek to the start offset: %s\n",
+              _name, strerror(errno)), exit(1);
+  }
+
+  // Deal with endianess.  We write out some bytes (or read back some bytes) to the start of
+  // the file, and then hide them from the user.
+  //
+  endianess_offset  = 32 + file_offset;
+  endianess_flipped = false;
+
+  char    t[16] = { 'b', 'i', 't', 'P', 'a', 'c', 'k', 'e', 'd', 'F', 'i', 'l', 'e', 0, 0, 1 };
+  char    c[16] = { 0 };
+  uint64  at = uint64NUMBER(0xdeadbeeffeeddada );
+  uint64  bt = uint64NUMBER(0x0abeadedbabed8f8);
+  uint64  ac = uint64NUMBER(0);
+  uint64  bc = uint64NUMBER(0);
+
+  // Read the header.  Each read() is checked on its own: adding a -1
+  // error return into an unsigned byte count would make it look like
+  // a large successful read.
+  //
+  errno = 0;
+  ssize_t r1 = read(_file,  c,  sizeof(char) * 16);
+  ssize_t r2 = read(_file, &ac, sizeof(uint64));
+  ssize_t r3 = read(_file, &bc, sizeof(uint64));
+
+  if ((r1 < 0) || (r2 < 0) || (r3 < 0))
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to read the header: %s\n",
+            _name, strerror(errno)), exit(1);
+
+  size_t nr = (size_t)r1 + (size_t)r2 + (size_t)r3;
+
+  if (nr == 0) {
+    // Empty file!  Write the magic number and our endianess check.
+
+    errno = 0;
+    if ((write(_file,  t,  sizeof(char) * 16) != (ssize_t)(sizeof(char) * 16)) ||
+        (write(_file, &at, sizeof(uint64))    != (ssize_t)sizeof(uint64)) ||
+        (write(_file, &bt, sizeof(uint64))    != (ssize_t)sizeof(uint64)))
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' failed to write the header: %s\n", _name, strerror(errno)), exit(1);
+
+    return;
+  }
+
+
+  if ((c[0] == 'B') && (c[1] == 'Z') && (c[2] == 'h')) {
+#ifdef WITH_BZIP2
+    // Looks like a bzip2 file!
+
+    errno = 0;
+    _bzFILE = fopen(_name, "r");
+    if (_bzFILE == 0L) {
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to open bzip2 file '%s'\n", _name);
+      exit(1);
+    }
+
+    _bzerr  = 0;
+    _bzfile = BZ2_bzReadOpen(&_bzerr, _bzFILE, 0, 0, 0L, 0);
+    if ((_bzfile == 0L) || (_bzerr != BZ_OK)) {
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to init bzip2 file '%s'\n", _name);
+      exit(1);
+    }
+
+    BZ2_bzRead(&_bzerr, _bzfile,  c,  sizeof(char) * 16);
+    BZ2_bzRead(&_bzerr, _bzfile, &ac, sizeof(uint64));
+    BZ2_bzRead(&_bzerr, _bzfile, &bc, sizeof(uint64));
+
+    // The stream may legitimately end right after the header; anything
+    // else other than BZ_OK means the header could not be decompressed.
+    //
+    if ((_bzerr != BZ_OK) && (_bzerr != BZ_STREAM_END)) {
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- failed to read the header from bzip2 file '%s'\n", _name);
+      exit(1);
+    }
+
+    _isReadOnly = true;
+    _isBzip2    = true;
+#else
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' looks like a bzip2 file, but bzip2 support not available!\n", _name);
+    exit(1);
+#endif
+  }
+
+
+  // Check the magic number, decide on an endianess to use.
+  //
+  // NOTE(review): strncmp() stops at the NUL at t[13], so the last two
+  // bytes of the magic are never compared -- confirm whether that is
+  // intended before switching to memcmp().
+  //
+  if (strncmp(t, c, 16) == 0) {
+    if ((at == ac) && (bt == bc)) {
+      endianess_flipped = false;
+    } else if ((at == uint64Swap(ac)) && (bt == uint64Swap(bc))) {
+      endianess_flipped = true;
+    } else {
+      fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' looked like a bitPackedFile, but failed the endianess check, not opened.\n", _name);
+      exit(1);
+    }
+  } else {
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' doesn't appear to be a bitPackedFile, not opened.\n", _name);
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- found ");
+    for (uint32 i=0; i<16; i++)
+      fprintf(stderr, "%c", isascii(c[i]) ? c[i] : '.');
+    fprintf(stderr, " at position "uint64HEX"\n", file_offset);
+    exit(1);
+  }
+
+  // Force seek() to really load the first block.
+  //
+  _forceFirstLoad = true;
+  seek(0);
+}
+
+
+// Flushes any pending modifications, then releases the bzip2 reader, the
+// file handles and the buffers.
+//
+bitPackedFile::~bitPackedFile() {
+
+  // Must come first: flushDirty() needs _file, _bfr and _name.
+  //
+  flushDirty();
+
+#ifdef WITH_BZIP2
+  // Close the bzip2 reader before the stdio stream it was opened on,
+  // the same order seekBzip2() uses at end of stream.
+  //
+  if (_bzfile)
+    BZ2_bzReadClose(&_bzerr, _bzfile);
+
+  if (_bzFILE)
+    fclose(_bzFILE);
+#endif
+
+  close(_file);
+
+  delete [] _bfr;
+  delete [] _name;
+}
+
+
+
+// If the page is dirty, flush it to disk
+//
+// If the buffer has been modified, writes it back to the file at the
+// position it was loaded from (_pos words past the hidden header) and
+// clears the dirty flag.  A dirty read-only file, or any seek/write
+// failure, prints a message and calls exit(1).
+//
+void
+bitPackedFile::flushDirty(void) {
+
+  if (_bfrDirty == false)
+    return;
+
+  if (_isReadOnly) {
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' is readonly, but is dirty!\n", _name);
+    exit(1);
+  }
+
+  stat_dirtyFlushes++;
+
+  errno = 0;
+  if (lseek(_file, _pos * sizeof(uint64) + endianess_offset, SEEK_SET) == (off_t)-1) {
+    fprintf(stderr, "bitPackedFile::seek()-- '%s' failed: %s\n",
+            _name, strerror(errno));
+    exit(1);
+  }
+
+  // If we need to, flip all the words we are going to write
+  //
+  if (endianess_flipped)
+    for (uint64 i=0; i<_bfrmax; i++)
+      _bfr[i] = uint64Swap(_bfr[i]);
+
+  // The whole buffer is written, not just the words up to _bit: we
+  // don't know if the block is being flushed because we're finished
+  // with it or because we are moving on to the next block, and
+  // writing everything covers the word containing _bit either way.
+  //
+  // write() may legitimately write fewer bytes than asked, so keep
+  // going until the whole buffer is out.
+  //
+  const char *src  = (const char *)_bfr;
+  size_t      left = sizeof(uint64) * _bfrmax;
+
+  while (left > 0) {
+    errno = 0;
+    ssize_t w = write(_file, src, left);
+
+    if ((w < 0) && (errno == EINTR))
+      continue;
+
+    if (w <= 0) {
+      fprintf(stderr, "bitPackedFile::write()-- '%s' failed: %s\n",
+              _name, strerror(errno));
+      exit(1);
+    }
+
+    src  += w;
+    left -= (size_t)w;
+  }
+
+  // And then flip them back
+  //
+  if (endianess_flipped)
+    for (uint64 i=0; i<_bfrmax; i++)
+      _bfr[i] = uint64Swap(_bfr[i]);
+
+  _bfrDirty = false;
+}
+
+
+
+// Moves a bzip2-backed file forward to bit position 'bitpos'.
+//
+// A bzip2 stream can only be read forward, so the unread tail of the
+// current buffer is shifted to the front and the rest of the buffer is
+// refilled from the decompressor.  A target more than one buffer ahead,
+// or a decompression error, prints a message and calls exit(1).
+//
+void
+bitPackedFile::seekBzip2(uint64 bitpos) {
+
+#ifdef WITH_BZIP2
+  // All we can do here is check that bitpos is
+  //   a) in our current buffer
+  //   b) would be in the next buffer once we read it
+
+  uint64  newpos = bitpos >> 6;
+
+  if (_pos + _bfrmax < newpos) {
+    // nope, not in the buffer -- we could probably handle this by just reading and
+    // discarding from the file until we get to the correct bitpos.
+    fprintf(stderr, "bitPackedFile::seekBzip2()-- '%s' seek was not contiguous!\n", _name);
+    exit(1);
+  }
+
+  // Copy the remaining bits of the current buffer to the start.  Or
+  // not, if this is the first load.
+
+  uint64  lastpos = _bit >> 6;             //  The word we are currently in
+  uint64  lastlen = (_bfrmax - lastpos);   //  The number of words left in the buffer
+
+  if (_forceFirstLoad == true) {
+    lastpos = 0;
+    lastlen = 0;
+  } else {
+    // Source and destination overlap whenever more than half of the
+    // buffer is left, so this has to be memmove(), not memcpy().
+    memmove(_bfr, _bfr + lastpos, sizeof(uint64) * lastlen);
+  }
+
+  // Update _bit and _pos -- lastlen is now the first invalid word
+  //
+  _bit = bitpos & 0x3f;
+  _pos = bitpos >> 6;
+
+  // Fill the rest of the buffer.  BZ2_bzRead() counts in BYTES, so
+  // everything is tracked in bytes until the very end.
+
+  char    *fill        = (char *)(_bfr + lastlen);
+  size_t   bytesWanted = sizeof(uint64) * (_bfrmax - lastlen);
+  size_t   bytesRead   = 0;
+
+  if (_bzfile) {
+    _bzerr = 0;
+    int got = BZ2_bzRead(&_bzerr, _bzfile, fill, (int)bytesWanted);
+
+    if (_bzerr == BZ_STREAM_END) {
+      //fprintf(stderr, "bitPackedFile::seekBzip2() file ended.\n");
+      BZ2_bzReadClose(&_bzerr, _bzfile);
+      fclose(_bzFILE);
+      _bzfile = 0L;
+      _bzFILE = 0L;
+    } else if (_bzerr != BZ_OK) {
+      fprintf(stderr, "bitPackedFile::seekBzip2() '%s' read failed.\n", _name);
+      exit(1);
+    }
+
+    if (got > 0)
+      bytesRead = (size_t)got;
+  }
+
+  // Clear everything we didn't read (supposedly, because we hit EOF),
+  // including the unread bytes of a partially read last word.
+  //
+  memset(fill + bytesRead, 0, bytesWanted - bytesRead);
+
+  // Flip all the words we just read, if needed
+  //
+  uint64  wordsread = (bytesRead + sizeof(uint64) - 1) / sizeof(uint64);
+
+  if (endianess_flipped)
+    for (uint64 i=lastlen; i<lastlen + wordsread; i++)
+      _bfr[i] = uint64Swap(_bfr[i]);
+#else
+  fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s'\n", _name);
+  fprintf(stderr, "bitPackedFile::bitPackedFile()-- bzip2 support not present, but still tried to read it??\n");
+  exit(1);
+#endif
+}
+
+
+
+// Loads the block containing bit position 'bitpos' from an ordinary
+// (uncompressed) file into the buffer, setting _pos and _bit to match.
+// Words past the end of the file read as zero.  A seek or read failure
+// prints a message and calls exit(1).
+//
+void
+bitPackedFile::seekNormal(uint64 bitpos) {
+
+  if (_inCore) {
+    fprintf(stderr, "bitPackedFile::bitPackedFile()-- '%s' is in core, but still needed to seek??\n",
+            _name);
+    exit(1);
+  }
+
+  // Somewhat of a gross hack to allow sequential access backwards.
+  //
+  // If the new position (bitpos >> 6) is just before the old
+  // position (_pos), assume that we are being accessed iteratively
+  // backwards and load a full buffer so that the position we want to
+  // access is at the end.
+  //
+  // Easy to think of bone-headed ways to break this (e.g., seek to
+  // the second element in a structure, access the first, then access
+  // the third).  Not so easy to think of a logical reason someone
+  // would want to do that.
+  //
+  uint64  target = bitpos >> 6;
+
+  if ((target < _pos) && (_pos <= target + 32)) {
+    if (target > _bfrmax)
+      _pos = target - _bfrmax + 32;
+    else
+      _pos = 0;
+  } else {
+    _pos = target;
+  }
+
+  _bit = bitpos - (_pos << 6);
+
+
+  errno = 0;
+  if (lseek(_file, _pos * 8 + endianess_offset, SEEK_SET) == (off_t)-1) {
+    fprintf(stderr, "bitPackedFile::seekNormal() '%s' seek to pos="uint64FMT" failed: %s\n",
+            _name,
+            _pos * 8 + endianess_offset, strerror(errno));
+    exit(1);
+  }
+
+  // Fill the buffer.  read() counts in BYTES and may return less than
+  // asked for, so loop until the buffer is full or we hit end of file.
+  //
+  char    *dst       = (char *)_bfr;
+  size_t   bytesMax  = sizeof(uint64) * _bfrmax;
+  size_t   bytesRead = 0;
+
+  while (bytesRead < bytesMax) {
+    errno = 0;
+    ssize_t got = read(_file, dst + bytesRead, bytesMax - bytesRead);
+
+    if (got < 0) {
+      if (errno == EINTR)
+        continue;
+      fprintf(stderr, "bitPackedFile::seekNormal() '%s' read of "uint64FMT" bytes failed': %s\n",
+              _name,
+              (uint64)bytesMax,
+              strerror(errno));
+      exit(1);
+    }
+
+    if (got == 0)
+      break;
+
+    bytesRead += (size_t)got;
+  }
+
+  // Clear everything we didn't read (supposedly, because we hit EOF),
+  // including the unread bytes of a partially read last word.
+  //
+  memset(dst + bytesRead, 0, bytesMax - bytesRead);
+
+  // Flip all the words we just read, if needed
+  //
+  uint64  wordsread = (bytesRead + sizeof(uint64) - 1) / sizeof(uint64);
+
+  if (endianess_flipped)
+    for (uint64 i=0; i<wordsread; i++)
+      _bfr[i] = uint64Swap(_bfr[i]);
+}
+
+
+
+
+
+// Seeks to bitposition pos in the file, reads in a new block.
+//
+// Positions the file at bit 'bitpos'.  A target inside the block already
+// in memory only moves _bit; anything else flushes the current block and
+// loads a new one through seekBzip2() or seekNormal().
+//
+void
+bitPackedFile::seek(uint64 bitpos) {
+
+  // Cheap case: the target word is inside the loaded block, leaving 32
+  // words of slack at the end.  Never taken when a load is being forced.
+  //
+  if (!_forceFirstLoad) {
+    const uint64 word   = bitpos >> 6;
+    const bool   inside = (word >= _pos) && (word <= _pos + _bfrmax - 32);
+
+    if (inside) {
+      stat_seekInside++;
+      _bit = bitpos - (_pos << 6);
+      return;
+    }
+  }
+
+  // An in-core file holds everything; having to load a block is a bug.
+  //
+  if (_inCore) {
+    fprintf(stderr, "bitPackedFile::seek()-- file '%s' is in core, but still needed to seek??\n",
+            _name);
+    exit(1);
+  }
+
+  stat_seekOutside++;
+
+  // Write back pending changes before the buffer is overwritten.
+  //
+  flushDirty();
+
+  if (_isBzip2) {
+    seekBzip2(bitpos);
+  } else {
+    seekNormal(bitpos);
+  }
+
+  _forceFirstLoad = false;
+}
+
+
+
+
+// Converts this disk-based, read/write bitPackedFile to a memory-based
+// one: the buffer is replaced by one sized from the file's length and
+// loaded with a single seek(0).  Returns the size of the new buffer, in
+// bytes.  If the file size cannot be determined, prints a message and
+// calls exit(1).
+//
+uint64
+bitPackedFile::loadInCore(void) {
+  struct stat  sb;
+
+  // Write back pending changes before the buffer is thrown away.
+
+  flushDirty();
+
+  // Without a valid size the new buffer would be sized from garbage.
+
+  errno = 0;
+  if (fstat(_file, &sb) != 0) {
+    fprintf(stderr, "bitPackedFile::loadInCore()-- '%s' failed to get the file size: %s\n",
+            _name, strerror(errno));
+    exit(1);
+  }
+
+  // The extra 1024 words is to keep seek() from attempting to grab
+  // the next block (there isn't a next block, we've got it all!)
+  // when we're near the end of this block.  We just make the block
+  // a little bigger than it really is.
+
+  delete [] _bfr;
+
+  _bfrmax = sb.st_size / 8 + 1024;
+  _bfr    = new uint64 [_bfrmax];
+  _pos    = 0;
+  _bit    = 0;
+
+  // Tada!  All we need to do now is load the block!  _inCore is set only
+  // afterwards, as seek() refuses to load a block for an in-core file.
+
+  _forceFirstLoad = true;
+
+  seek(0);
+
+  _inCore = true;
+
+  return(_bfrmax * 8);
+}
diff --git a/libutil/bitPackedFile.H b/libutil/bitPackedFile.H
new file mode 100644
index 0000000..2d0d614
--- /dev/null
+++ b/libutil/bitPackedFile.H
@@ -0,0 +1,127 @@
+#ifndef BITPACKEDFILE_H
+#define BITPACKEDFILE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "util.h"
+
+//#define WITH_BZIP2
+
+#ifdef WITH_BZIP2
+#include <bzlib.h>
+#endif
+
+// A file of bit-packed values, accessed through a buffer of 64-bit words
+// holding one block of the file.  Values are read and written at the
+// current bit position, which advances past each value.
+//
+class bitPackedFile {
+public:
+ // Open 'name', with the data starting 'offset' bytes into the file.
+ bitPackedFile(char const *name, uint64 offset=0, bool forceTruncate=false);
+ ~bitPackedFile();
+
+ // Read a fixed-width value of 'size' bits, or a Fibonacci-encoded number.
+ uint64 getBits(uint32 size);
+ uint64 getNumber(void);
+
+ // Write the low 'size' bits of 'bits', or 'val' as a Fibonacci-encoded number.
+ void putBits(uint64 bits, uint32 size);
+ void putNumber(uint64 val);
+
+ // Current position, and repositioning, both in bits.
+ uint64 tell(void) { return((_pos << 6) + _bit); };
+ void seek(uint64 pos);
+
+ // Load the whole file into memory; returns the buffer size in bytes.
+ uint64 loadInCore(void);
+
+ // Print the counts of seeks that stayed inside / went outside the buffer.
+ void showStats(FILE *f) {
+ fprintf(f, "inside: "uint64FMT" outside: "uint64FMT"\n", stat_seekInside, stat_seekOutside);
+ fflush(f);
+ };
+private:
+
+ // Ensure that the buffer has enough space for any future
+ // operation. This constant, currently 31 words, must be strictly
+ // less than the constant used in deciding if seek() is moving
+ // forward or backwards.
+ //
+ // When the current word is within 31 words of the end of the buffer,
+ // seek() to the current position so a new block gets loaded.
+ //
+ void sync(void) {
+ if (((_bit >> 6) + 31) >= _bfrmax)
+ seek((_pos << 6) + _bit);
+ };
+
+ void flushDirty(void);
+ void seekBzip2(uint64 bitpos);
+ void seekNormal(uint64 bitpos);
+
+ int _file; // File descriptor from open(2)
+ char *_name; // Private copy of the file name, used in messages
+
+#ifdef WITH_BZIP2
+ FILE *_bzFILE;
+ int _bzerr;
+ BZFILE *_bzfile;
+#endif
+
+ uint64 _bfrmax; // Number of words in the buffer
+ uint64 *_bfr; // A chunk of the bitPackedFile in core
+ uint64 _pos; // The location this chunk is from (in words)
+ uint64 _bit; // The bit position we are modifying relative to _pos
+
+ bool _inCore; // Set by loadInCore(); seek() then refuses to load blocks
+ bool _bfrDirty; // Buffer was modified and must be written by flushDirty()
+ bool _forceFirstLoad; // Makes the next seek() load a block unconditionally
+ bool _isReadOnly;
+ bool _isBzip2;
+
+ // For collecting statistics on our usage
+ //
+ uint64 stat_seekInside;
+ uint64 stat_seekOutside;
+ uint64 stat_dirtyFlushes;
+
+ // For converting between hardware of different endianess.
+ //
+ uint64 file_offset; // Byte offset of the bitPackedFile within the file
+ uint64 endianess_offset; // file_offset plus the 32-byte header
+ bool endianess_flipped; // Words are byte-swapped when read and written
+};
+
+
+// Reads the 'siz'-bit value at the current position and advances past it.
+//
+inline
+uint64
+bitPackedFile::getBits(uint32 siz) {
+  sync();
+
+  const uint64 start = _bit;   // where the value begins, relative to _pos
+
+  _bit = start + siz;
+
+  return(getDecodedValue(_bfr, start, siz));
+}
+
+// Reads the Fibonacci-encoded number at the current position and advances
+// past it by the number of bits the decoder reports.
+//
+inline
+uint64
+bitPackedFile::getNumber(void) {
+  sync();
+
+  uint64 nbits  = 0;
+  uint64 number = getFibonacciEncodedNumber(_bfr, _bit, &nbits);
+
+  _bit += nbits;
+
+  return(number);
+}
+
+
+// Writes the low 'siz' bits of 'bits' at the current position, advances
+// past them and marks the buffer dirty.  Asserts the file is writable.
+//
+inline
+void
+bitPackedFile::putBits(uint64 bits, uint32 siz) {
+  assert(_isReadOnly == false);
+  sync();
+
+  const uint64 start = _bit;   // where the value goes, relative to _pos
+
+  setDecodedValue(_bfr, start, siz, bits);
+
+  _bfrDirty = true;
+  _bit      = start + siz;
+}
+
+// Writes 'val' as a Fibonacci-encoded number at the current position,
+// advances by the number of bits the encoder reports and marks the buffer
+// dirty.  Asserts the file is writable.
+//
+inline
+void
+bitPackedFile::putNumber(uint64 val) {
+  assert(_isReadOnly == false);
+  sync();
+
+  uint64 nbits = 0;
+
+  setFibonacciEncodedNumber(_bfr, _bit, &nbits, val);
+
+  _bfrDirty = true;
+  _bit     += nbits;
+}
+
+
+#endif // BITPACKEDFILE_H
diff --git a/libutil/bitPacking.h b/libutil/bitPacking.h
new file mode 100644
index 0000000..e1e1752
--- /dev/null
+++ b/libutil/bitPacking.h
@@ -0,0 +1,510 @@
+#ifndef BRI_BITPACKING_H
+#define BRI_BITPACKING_H
+
+#include <stdio.h>
+#include <assert.h>
+
+// Routines used for stuffing bits into a word array.
+
+// Define this to enable testing that the width of the data element
+// is greater than zero. The uint64MASK() macro (bri.h) does not
+// generate a mask for 0. Compiler warnings are issued, because you
+// shouldn't use this in production code.
+//
+//#define CHECK_WIDTH
+
+// As CHECK_WIDTH is kind of expensive, we'll warn.
+#ifdef CHECK_WIDTH
+#warning libutil/bitPacking.h defined CHECK_WIDTH
+#endif
+
+// Returns 'siz' bits from the stream based at 'ptr' and currently at
+// location 'pos'. The position of the stream is not changed.
+//
+// Retrieves a collection of values; the number of bits advanced in
+// the stream is returned.
+//
+// Copies the lowest 'siz' bits in 'val' to the stream based at 'ptr'
+// and currently at 'pos'. The position of the stream is not
+// changed.
+//
+// Sets a collection of values; the number of bits advanced in the
+// stream is returned.
+//
+uint64 getDecodedValue (uint64 *ptr, uint64 pos, uint64 siz);
+uint64 getDecodedValues(uint64 *ptr, uint64 pos, uint64 num, uint64 *sizs, uint64 *vals);
+void setDecodedValue (uint64 *ptr, uint64 pos, uint64 siz, uint64 val);
+uint64 setDecodedValues(uint64 *ptr, uint64 pos, uint64 num, uint64 *sizs, uint64 *vals);
+
+
+// Like getDecodedValue(), but also increments or decrements the value
+// stored in the stream.  The pre- forms return the updated value, the
+// post- forms return the original value.
+//
+// preIncrementDecodedValue(ptr, pos, siz) === x = getDecodedValue(ptr, pos, siz) + 1;
+// setDecodedValue(ptr, pos, siz, x);
+//
+// preDecrementDecodedValue(ptr, pos, siz) === x = getDecodedValue(ptr, pos, siz) - 1;
+// setDecodedValue(ptr, pos, siz, x);
+//
+// postIncrementDecodedValue(ptr, pos, siz) === x = getDecodedValue(ptr, pos, siz);
+// setDecodedValue(ptr, pos, siz, x + 1);
+//
+// postDecrementDecodedValue(ptr, pos, siz) === x = getDecodedValue(ptr, pos, siz);
+// setDecodedValue(ptr, pos, siz, x - 1);
+//
+uint64 preIncrementDecodedValue(uint64 *ptr, uint64 pos, uint64 siz);
+uint64 preDecrementDecodedValue(uint64 *ptr, uint64 pos, uint64 siz);
+uint64 postIncrementDecodedValue(uint64 *ptr, uint64 pos, uint64 siz);
+uint64 postDecrementDecodedValue(uint64 *ptr, uint64 pos, uint64 siz);
+
+
+
+// N.B. - I assume the bits in words are big-endian, which is
+// backwards from the way we shift things around.
+//
+// I define the "addresses" of bits in two consecutive words as
+// [0123][0123]. When adding words to the bit array, they're added
+// from left to right:
+//
+// setDecodedValue(bitstream, %0abc, 3)
+// setDecodedValue(bitstream, %0def, 3)
+//
+// results in [abcd][ef00]
+//
+// But when shifting things around, we typically do it from the right
+// side, since that is where the machine places numbers.
+//
+// A picture or two might help.
+//
+//
+// |----b1-----|
+// |-bit-||-sz-|
+// XXXXXX
+// [0---------------63]
+// ^
+// pos
+//
+//
+// If the bits span two words, it'll look like this; b1 is smaller
+// than siz, and we update bit to be the "uncovered" piece of XXX
+// (all the stuff in word2). The first word is masked, then those
+// bits are shifted onto the result in the correct place. The second
+// word has the correct bits shifted to the right, then those are
+// appended to the result.
+//
+// |b1-|
+// |-----bit-----||---sz---|
+// XXXXXXXXXX
+// [0------------word1][0-------------word2]
+// ^
+// pos
+//
+
+
+// Returns the 'siz'-bit value stored at bit position 'pos' of the stream
+// based at 'ptr'.  The stream is not modified.
+//
+// Bit positions count from the most significant end of each 64-bit word,
+// and a value may straddle two consecutive words.  'siz' is expected to
+// be 1..64; this is only checked when CHECK_WIDTH is defined.
+//
+inline
+uint64
+getDecodedValue(uint64 *ptr,
+                uint64  pos,
+                uint64  siz) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  //PREFETCH(ptr + wrd);  makes it worse
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+  uint64 ret = 0;
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: getDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: getDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  if (b1 >= siz) {
+    // Entirely inside one word.
+    ret = ptr[wrd] >> (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit  = siz - b1;
+    ret  = (ptr[wrd] & uint64MASK(b1)) << bit;
+    wrd++;
+    ret |= (ptr[wrd] >> (64 - bit)) & uint64MASK(bit);
+  }
+
+  ret &= uint64MASK(siz);
+
+  return(ret);
+}
+
+
+// Stores the low 'siz' bits of 'val' at bit position 'pos' of the stream
+// based at 'ptr'; the bits around the value are left unchanged.
+//
+// Bit positions count from the most significant end of each 64-bit word,
+// and a value may straddle two consecutive words.  'siz' is expected to
+// be 1..64; this is only checked when CHECK_WIDTH is defined.
+//
+inline
+void
+setDecodedValue(uint64 *ptr,
+                uint64  pos,
+                uint64  siz,
+                uint64  val) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: setDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: setDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  val &= uint64MASK(siz);
+
+  if (b1 >= siz) {
+    // Entirely inside one word: clear the field, then OR the value in.
+    ptr[wrd] &= ~( uint64MASK(siz) << (b1 - siz) );
+    ptr[wrd] |= val << (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit = siz - b1;
+    ptr[wrd] &= ~uint64MASK(b1);
+    ptr[wrd] |= (val & (uint64MASK(b1) << (bit))) >> (bit);
+    wrd++;
+    ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+    ptr[wrd] |= (val & (uint64MASK(bit))) << (64 - bit);
+  }
+}
+
+
+// Reads 'num' consecutive values from the stream based at 'ptr', starting
+// at bit position 'pos'.  Value i is sizs[i] bits wide and is stored in
+// vals[i].  Returns the bit position just past the last value read.
+//
+// Each width is expected to be 1..64; this is only checked when
+// CHECK_WIDTH is defined.
+//
+inline
+uint64
+getDecodedValues(uint64 *ptr,
+                 uint64  pos,
+                 uint64  num,
+                 uint64 *sizs,
+                 uint64 *vals) {
+
+  // Compute the location of the start of the encoded words once, then
+  // just walk through to get the remaining words.
+  //
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  //PREFETCH(ptr + wrd);  makes it worse
+  uint64 bit = pos & 0x000000000000003fllu;
+
+  for (uint64 i=0; i<num; i++) {
+    uint64 b1 = 64 - bit;   // bits left in the current word
+
+#ifdef CHECK_WIDTH
+    // (This used to reference an undeclared 'siz' and did not compile.)
+    if (sizs[i] == 0) {
+      fprintf(stderr, "ERROR: getDecodedValues() called with zero size!\n");
+      abort();
+    }
+    if (sizs[i] > 64) {
+      fprintf(stderr, "ERROR: getDecodedValues() called with huge size ("uint64FMT")!\n", sizs[i]);
+      abort();
+    }
+#endif
+
+    if (b1 >= sizs[i]) {
+      // Entirely inside the current word.
+      vals[i] = ptr[wrd] >> (b1 - sizs[i]);
+      bit    += sizs[i];
+    } else {
+      // Spans into the next word; 'bit' becomes the offset in that word.
+      bit      = sizs[i] - b1;
+      vals[i]  = (ptr[wrd] & uint64MASK(b1)) << bit;
+      wrd++;
+      vals[i] |= (ptr[wrd] >> (64 - bit)) & uint64MASK(bit);
+    }
+
+    // A value that ended exactly on a word boundary.
+    if (bit == 64) {
+      wrd++;
+      bit = 0;
+    }
+
+    assert(bit < 64);
+
+    vals[i] &= uint64MASK(sizs[i]);
+    pos     += sizs[i];
+  }
+
+  return(pos);
+}
+
+
+// Writes 'num' consecutive values to the stream based at 'ptr', starting
+// at bit position 'pos'.  Value i is the low sizs[i] bits of vals[i].
+// Returns the bit position just past the last value written.
+//
+// Note that vals[i] is masked to sizs[i] bits IN PLACE, so the caller's
+// array is modified.  Each width is expected to be 1..64; this is only
+// checked when CHECK_WIDTH is defined.
+//
+inline
+uint64
+setDecodedValues(uint64 *ptr,
+                 uint64  pos,
+                 uint64  num,
+                 uint64 *sizs,
+                 uint64 *vals) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+
+  for (uint64 i=0; i<num; i++) {
+    vals[i] &= uint64MASK(sizs[i]);
+
+    uint64 b1 = 64 - bit;   // bits left in the current word
+
+#ifdef CHECK_WIDTH
+    // (This used to reference an undeclared 'siz' and did not compile.)
+    if (sizs[i] == 0) {
+      fprintf(stderr, "ERROR: setDecodedValues() called with zero size!\n");
+      abort();
+    }
+    if (sizs[i] > 64) {
+      fprintf(stderr, "ERROR: setDecodedValues() called with huge size ("uint64FMT")!\n", sizs[i]);
+      abort();
+    }
+#endif
+
+    if (b1 >= sizs[i]) {
+      // Entirely inside the current word: clear the field, OR the value in.
+      ptr[wrd] &= ~( uint64MASK(sizs[i]) << (b1 - sizs[i]) );
+      ptr[wrd] |= vals[i] << (b1 - sizs[i]);
+      bit      += sizs[i];
+    } else {
+      // Spans into the next word; 'bit' becomes the offset in that word.
+      bit = sizs[i] - b1;
+      ptr[wrd] &= ~uint64MASK(b1);
+      ptr[wrd] |= (vals[i] & (uint64MASK(b1) << (bit))) >> (bit);
+      wrd++;
+      ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+      ptr[wrd] |= (vals[i] & (uint64MASK(bit))) << (64 - bit);
+    }
+
+    // A value that ended exactly on a word boundary.
+    if (bit == 64) {
+      wrd++;
+      bit = 0;
+    }
+
+    assert(bit < 64);
+
+    pos += sizs[i];
+  }
+
+  return(pos);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+// Adds one (modulo 2^siz) to the 'siz'-bit value stored at bit position
+// 'pos' of the stream based at 'ptr', and returns the NEW value.
+//
+// 'siz' is expected to be 1..64; this is only checked when CHECK_WIDTH is
+// defined.
+//
+inline
+uint64
+preIncrementDecodedValue(uint64 *ptr,
+                         uint64  pos,
+                         uint64  siz) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+  uint64 ret = 0;
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: preIncrementDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: preIncrementDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  if (b1 >= siz) {
+    // Entirely inside one word.
+    ret = ptr[wrd] >> (b1 - siz);
+
+    ret++;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~( uint64MASK(siz) << (b1 - siz) );
+    ptr[wrd] |= ret << (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit = siz - b1;
+
+    ret  = (ptr[wrd] & uint64MASK(b1)) << bit;
+    ret |= (ptr[wrd+1] >> (64 - bit)) & uint64MASK(bit);
+
+    ret++;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~uint64MASK(b1);
+    ptr[wrd] |= (ret & (uint64MASK(b1) << (bit))) >> (bit);
+    wrd++;
+    ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+    ptr[wrd] |= (ret & (uint64MASK(bit))) << (64 - bit);
+  }
+
+  return(ret);
+}
+
+
+
+// Subtracts one (modulo 2^siz) from the 'siz'-bit value stored at bit
+// position 'pos' of the stream based at 'ptr', and returns the NEW value.
+//
+// 'siz' is expected to be 1..64; this is only checked when CHECK_WIDTH is
+// defined.
+//
+inline
+uint64
+preDecrementDecodedValue(uint64 *ptr,
+                         uint64  pos,
+                         uint64  siz) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+  uint64 ret = 0;
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: preDecrementDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: preDecrementDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  if (b1 >= siz) {
+    // Entirely inside one word.
+    ret = ptr[wrd] >> (b1 - siz);
+
+    ret--;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~( uint64MASK(siz) << (b1 - siz) );
+    ptr[wrd] |= ret << (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit = siz - b1;
+
+    ret  = (ptr[wrd] & uint64MASK(b1)) << bit;
+    ret |= (ptr[wrd+1] >> (64 - bit)) & uint64MASK(bit);
+
+    ret--;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~uint64MASK(b1);
+    ptr[wrd] |= (ret & (uint64MASK(b1) << (bit))) >> (bit);
+    wrd++;
+    ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+    ptr[wrd] |= (ret & (uint64MASK(bit))) << (64 - bit);
+  }
+
+  return(ret);
+}
+
+
+
+// Adds one (modulo 2^siz) to the 'siz'-bit value stored at bit position
+// 'pos' of the stream based at 'ptr', and returns the ORIGINAL value.
+//
+// 'siz' is expected to be 1..64; this is only checked when CHECK_WIDTH is
+// defined.
+//
+inline
+uint64
+postIncrementDecodedValue(uint64 *ptr,
+                          uint64  pos,
+                          uint64  siz) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+  uint64 ret = 0;
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: postIncrementDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: postIncrementDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  if (b1 >= siz) {
+    // Entirely inside one word.
+    ret = ptr[wrd] >> (b1 - siz);
+
+    ret++;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~( uint64MASK(siz) << (b1 - siz) );
+    ptr[wrd] |= ret << (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit = siz - b1;
+
+    ret  = (ptr[wrd] & uint64MASK(b1)) << bit;
+    ret |= (ptr[wrd+1] >> (64 - bit)) & uint64MASK(bit);
+
+    ret++;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~uint64MASK(b1);
+    ptr[wrd] |= (ret & (uint64MASK(b1) << (bit))) >> (bit);
+    wrd++;
+    ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+    ptr[wrd] |= (ret & (uint64MASK(bit))) << (64 - bit);
+  }
+
+  // Undo the increment on the local copy to recover the original value.
+  ret--;
+  ret &= uint64MASK(siz);
+
+  return(ret);
+}
+
+
+
+
+
+// Subtracts one (modulo 2^siz) from the 'siz'-bit value stored at bit
+// position 'pos' of the stream based at 'ptr', and returns the ORIGINAL
+// value.
+//
+// 'siz' is expected to be 1..64; this is only checked when CHECK_WIDTH is
+// defined.
+//
+inline
+uint64
+postDecrementDecodedValue(uint64 *ptr,
+                          uint64  pos,
+                          uint64  siz) {
+
+  // The word index used to be ANDed with 0x0000cfffffffffff, a mask with
+  // bits 44 and 45 clear, which would alias very large positions onto
+  // the wrong word.  The shift alone is sufficient and correct.
+  //
+  uint64 wrd = pos >> 6;
+  uint64 bit = pos & 0x000000000000003fllu;
+  uint64 b1  = 64 - bit;   // bits from 'pos' to the end of the word
+  uint64 ret = 0;
+
+#ifdef CHECK_WIDTH
+  if (siz == 0) {
+    fprintf(stderr, "ERROR: postDecrementDecodedValue() called with zero size!\n");
+    abort();
+  }
+  if (siz > 64) {
+    fprintf(stderr, "ERROR: postDecrementDecodedValue() called with huge size ("uint64FMT")!\n", siz);
+    abort();
+  }
+#endif
+
+  if (b1 >= siz) {
+    // Entirely inside one word.
+    ret = ptr[wrd] >> (b1 - siz);
+
+    ret--;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~( uint64MASK(siz) << (b1 - siz) );
+    ptr[wrd] |= ret << (b1 - siz);
+  } else {
+    // Spans two words; 'bit' becomes the number of bits in the second.
+    bit = siz - b1;
+
+    ret  = (ptr[wrd] & uint64MASK(b1)) << bit;
+    ret |= (ptr[wrd+1] >> (64 - bit)) & uint64MASK(bit);
+
+    ret--;
+    ret &= uint64MASK(siz);
+
+    ptr[wrd] &= ~uint64MASK(b1);
+    ptr[wrd] |= (ret & (uint64MASK(b1) << (bit))) >> (bit);
+    wrd++;
+    ptr[wrd] &= ~(uint64MASK(bit) << (64 - bit));
+    ptr[wrd] |= (ret & (uint64MASK(bit))) << (64 - bit);
+  }
+
+  // Undo the decrement on the local copy to recover the original value.
+  ret++;
+  ret &= uint64MASK(siz);
+
+  return(ret);
+}
+
+
+
+#endif // BRI_BITPACKING_H
diff --git a/libutil/bzipBuffer.C b/libutil/bzipBuffer.C
new file mode 100644
index 0000000..52147d3
--- /dev/null
+++ b/libutil/bzipBuffer.C
@@ -0,0 +1,238 @@
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+
+// This is probably correct, it just cannot read a normal *.bz file;
+// it probably reads an unpackaged raw bzip stream.
+
+
+// Opens 'filename' read-only, allocates input and output buffers of
+// 'bufferMax' bytes each (32 KiB if 'bufferMax' is zero), initializes the
+// bzip2 decompressor and loads the first buffer of data.  Failure to open
+// the file or to initialize the decompressor prints a message and calls
+// exit(1).
+//
+bzipBuffer::bzipBuffer(const char *filename, uint32 bufferMax) {
+
+  _filename = new char [strlen(filename) + 1];
+  strcpy(_filename, filename);
+
+  if (bufferMax == 0)
+    bufferMax = 32 * 1024;
+
+  // Failure is reported by the return value of open(); errno is only
+  // meaningful once the call has failed.
+  //
+  errno = 0;
+  _file = open(filename, O_RDONLY | O_LARGEFILE);
+  if (_file < 0) {
+    fprintf(stderr, "bzipBuffer()-- couldn't open the file '%s': %s\n",
+            filename, strerror(errno));
+    exit(1);
+  }
+
+  _filePos = 0;
+  _eof     = false;
+
+  _bzip2bufferMax = bufferMax;
+  _bzip2inPos     = 0;
+  _bzip2outPos    = 0;
+
+  _bzip2in  = new char [_bzip2bufferMax];
+  _bzip2out = new char [_bzip2bufferMax];
+
+  _bzip2streamEnd = false;
+
+  // Null bzalloc/bzfree/opaque ask bzlib to use its default allocator.
+  //
+  _bzip2stream.next_in        = _bzip2in;
+  _bzip2stream.avail_in       = 0;
+  _bzip2stream.total_in_lo32  = 0;
+  _bzip2stream.total_in_hi32  = 0;
+  _bzip2stream.next_out       = _bzip2out;
+  _bzip2stream.avail_out      = 0;
+  _bzip2stream.total_out_lo32 = 0;
+  _bzip2stream.total_out_hi32 = 0;
+  _bzip2stream.state          = 0L;
+  _bzip2stream.bzalloc        = 0L;
+  _bzip2stream.bzfree         = 0L;
+  _bzip2stream.opaque         = 0L;
+
+  int res = BZ2_bzDecompressInit(&_bzip2stream, 0, 0);
+  if (res != BZ_OK) {
+    //  BZ_CONFIG_ERROR, BZ_PARAM_ERROR, BZ_MEM_ERROR
+    fprintf(stderr, "bzipBuffer::bzipBuffer()-- Failed to initialize the decompressor.\n");
+    exit(1);
+  }
+
+  fillBuffer();
+}
+
+
+// Releases the decompressor, the buffers, the copy of the file name and
+// the file descriptor.
+//
+bzipBuffer::~bzipBuffer() {
+
+  // fillBuffer() ends the decompressor itself when it reaches the end of
+  // the stream; otherwise its state still has to be released here.
+  //
+  if (_bzip2streamEnd == false)
+    BZ2_bzDecompressEnd(&_bzip2stream);
+
+  delete [] _bzip2in;
+  delete [] _bzip2out;
+  delete [] _filename;   // allocated in the constructor, was never freed
+
+  close(_file);
+}
+
+
+// Refills the output buffer: decompresses whatever input is pending and,
+// if that yields nothing, reads more compressed input from the file and
+// tries again.  Sets _eof when the bzip2 stream has ended or the file has
+// no more input.
+//
+// NOTE(review): this code reads _bzip2stream.avail_out as the number of
+// decompressed bytes available.  The bzlib documentation describes
+// avail_out as the free space remaining at next_out -- confirm before
+// relying on this function.  next() in bzipBuffer.H makes the same use of
+// avail_out, so the two would have to change together.
+//
+// NOTE(review): the progress messages below are printed to stderr
+// unconditionally; they look like leftover debugging output.
+//
+void
+bzipBuffer::fillBuffer(void) {
+
+ // Once the stream has ended there is nothing more to produce.
+ if (_bzip2streamEnd) {
+ _eof = true;
+ return;
+ }
+
+ // Scream and holler if the bzip2 buffer isn't exhausted!
+ //
+ if (_bzip2outPos < _bzip2stream.avail_out) {
+ fprintf(stderr, "bzipBuffer::fillBuffer()-- Buffer isn't empty! Still %d bytes!\n",
+ (int)(_bzip2stream.avail_out - _bzip2outPos));
+ return;
+ }
+
+ _bzip2outPos = 0;
+
+ again:
+
+ // If there is stuff in the input, run the decompressor. If it
+ // decompresses anything, return.
+ //
+ if (_bzip2stream.avail_in > 0) {
+
+ // NOTE(review): the pointer differences below are passed to %d.
+ fprintf(stderr, "about to decompress %d bytes in input\n", (int)_bzip2stream.avail_in);
+ fprintf(stderr, "in is bzip2:%p and real:%p (diff %d)\n", _bzip2stream.next_in, _bzip2in, _bzip2stream.next_in - _bzip2in);
+ fprintf(stderr, "out is bzip2:%p and real:%p (diff %d)\n", _bzip2stream.next_out, _bzip2out, _bzip2stream.next_out - _bzip2out);
+
+
+ int res = BZ2_bzDecompress(&_bzip2stream);
+ if (res == BZ_STREAM_END) {
+ fprintf(stderr, "GOT STREAM END!\n");
+
+ // The decompressor is released here, as soon as the stream ends.
+ BZ2_bzDecompressEnd(&_bzip2stream);
+
+ _bzip2streamEnd = true;
+ res = BZ_OK;
+ }
+ if (res != BZ_OK) {
+ fprintf(stderr, "bzipBuffer::fillBuffer()-- Failed to decompress.\n"), exit(1);
+ }
+
+ fprintf(stderr, "decompressed %d bytes; still have %d in input\n", (int)_bzip2stream.avail_out, (int)_bzip2stream.avail_in);
+ fprintf(stderr, "in is bzip2:%p and real:%p (diff %d)\n", _bzip2stream.next_in, _bzip2in, _bzip2stream.next_in - _bzip2in);
+ fprintf(stderr, "out is bzip2:%p and real:%p (diff %d)\n", _bzip2stream.next_out, _bzip2out, _bzip2stream.next_out - _bzip2out);
+
+ if (_bzip2stream.avail_out > 0) {
+ fprintf(stderr, "----------------------------------------\n");
+ fwrite(_bzip2stream.next_out, sizeof(char), _bzip2stream.avail_out, stderr);
+ fprintf(stderr, "\n----------------------------------------\n");
+ return;
+ }
+ }
+
+ // If we're here and _bzip2streamEnd is true, we hit the end of the
+ // stream at the same time we hit the end of the input data.
+ //
+ if (_bzip2streamEnd) {
+ _eof = true;
+ return;
+ }
+
+ // Otherwise, we need to read some input.
+ //
+ // Both stream pointers are reset to the start of their buffers.
+ // NOTE(review): failure is detected through errno only; a -1 returned
+ // by ::read() would first be cast to uint32 and stored in avail_in.
+ //
+ errno = 0;
+ _bzip2stream.next_in = _bzip2in;
+ _bzip2stream.avail_in = (uint32)::read(_file, _bzip2in, sizeof(char) * _bzip2bufferMax);
+ _bzip2stream.next_out = _bzip2out;
+ _bzip2stream.avail_out = _bzip2bufferMax;
+ if (errno) {
+ fprintf(stderr, "bzipBuffer::fillBuffer()-- read failed: %s\n", strerror(errno));
+ exit(1);
+ }
+
+ fprintf(stderr, "read %d bytes\n", (int)_bzip2stream.avail_in);
+
+ // Nothing read: the file is exhausted.
+ if (_bzip2stream.avail_in == 0) {
+ fprintf(stderr, "bzipBuffer::fillBuffer()-- hit end of file?\n");
+ _eof = true;
+ return;
+ }
+
+ // And now try to decompress it again
+ //
+ goto again;
+}
+
+
+// Seeking is not supported on a bzipBuffer: reports that on stderr and
+// always returns false.  'pos' is ignored.
+//
+bool
+bzipBuffer::seek(off_t pos) {
+  const bool supported = false;
+
+  fprintf(stderr, "bzipBuffer()-- seek() not available for file '%s'.\n", _filename);
+
+  return(supported);
+}
+
+
+// Intended to read the next 'len' bytes into 'buf'.
+//
+// As it stands this is a stub: the whole implementation below is disabled
+// with '#if 0', so nothing is copied into 'buf' and the function always
+// returns 0.  The disabled code refers to names (_fileType, _buffer,
+// _bufferPos, _bufferLen) that are not members of the bzipBuffer class as
+// declared in bzipBuffer.H.
+//
+size_t
+bzipBuffer::read(char *buf, size_t len) {
+
+#if 0
+ if (_fileType == 2) {
+ size_t c = 0;
+
+ while ((_bufferPos < _bufferLen) && (c < len))
+ buf[c++] = _buffer[_bufferPos++];
+
+ return(c);
+ } else {
+ // The trick here is to use the existing buffered input first,
+ // then do a direct read to get the rest.
+ //
+ // We fill the buffer again if it is empty.
+ //
+ // The number of bytes actually put into buf is returned.
+
+ size_t bCopied = 0; // Number of bytes copied into the buffer
+ size_t bRead = 0; // Number of bytes read into the buffer
+ size_t bAct = 0; // Number of bytes actually read from disk
+
+ // Easy case; the next len bytes are already in the buffer; just
+ // copy and move the position.
+ //
+ // XXX: Check the zero-left-in-buffer case
+ //
+ if (_bufferLen - _bufferPos > len) {
+ bCopied = len;
+ bRead = 0;
+
+ memcpy(buf, _buffer + _bufferPos, sizeof(char) * len);
+ _bufferPos += (uint32)len;
+ } else {
+
+ // Existing buffer not big enough. Copy what's there, then finish
+ // with a read.
+ //
+ memcpy(buf, _buffer + _bufferPos, (_bufferLen - _bufferPos) * sizeof(char));
+ bCopied = _bufferLen - _bufferPos;
+ _bufferPos = _bufferLen;
+
+ while (bCopied + bRead < len) {
+ errno = 0;
+ bAct = (uint32)::read(_file, buf + bCopied + bRead, (len - bCopied - bRead) * sizeof(char));
+ if (errno) {
+ fprintf(stderr, "bzipBuffer()-- couldn't read %d bytes from '%s': n%s\n",
+ (uint32)len * sizeof(char), _filename, strerror(errno));
+ exit(1);
+ }
+
+ // If we hit EOF, return a short read
+ if (bAct == 0) {
+ len = 0;
+ }
+ bRead += bAct;
+ }
+ }
+
+ if (_bufferPos == _bufferLen)
+ fillBuffer();
+
+ return(bCopied + bRead);
+ }
+#endif
+
+ // Nothing was read.
+ return(0);
+}
diff --git a/libutil/bzipBuffer.H b/libutil/bzipBuffer.H
new file mode 100644
index 0000000..87360a1
--- /dev/null
+++ b/libutil/bzipBuffer.H
@@ -0,0 +1,92 @@
+#ifndef BZIP_BUFFER_H
+#define BZIP_BUFFER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <bzlib.h>
+
+#include "util.h"
+
+// Character-at-a-time reader over a file, built on the bzlib stream
+// interface (bz_stream).  eof()/next()/get()/getnext() walk the data,
+// tell() reports how far the walk has gone.
+//
+// NOTE(review): the constructor, destructor, init() and fillBuffer()
+// are defined outside this header; anything said below about the state
+// they manage is an assumption to confirm against bzipBuffer.C.
+//
+class bzipBuffer {
+public:
+ // bufferMax defaults to 32 * 1024; presumably a buffer size in bytes.
+ bzipBuffer(const char *filename, uint32 bufferMax = 32 * 1024);
+ ~bzipBuffer();
+
+ bool eof(void); // returns the _eof flag
+ bool next(void); // advance one character; returns the _eof flag
+ char get(void); // character at the current position
+ char getnext(void); // get(), then next()
+ bool seek(off_t pos);
+ size_t read(char *buf, size_t len); // read the next len bytes into the user buffer buf
+ off_t tell(void); // returns _filePos
+
+private:
+ void fillBuffer(void);
+ void init(int fileptr, const char *filename, uint32 bufferMax);
+
+ char *_filename; // printed in diagnostics
+
+ int _file;
+ off_t _filePos; // incremented by every next() that is not at eof
+ bool _eof;
+
+ uint32 _bzip2bufferMax;
+ uint32 _bzip2inPos;
+ uint32 _bzip2outPos; // index into _bzip2out of the current character
+
+ char *_bzip2in;
+ char *_bzip2out; // get() returns _bzip2out[_bzip2outPos]
+
+ bool _bzip2streamEnd;
+ bz_stream _bzip2stream; // next() refills when _bzip2outPos >= avail_out
+};
+
+
+// Reports the end-of-file flag; no state is changed.
+inline
+bool
+bzipBuffer::eof(void) {
+  return _eof;
+}
+
+
+// Steps to the next character and returns the end-of-file flag.
+//
+// Once the flag is set nothing moves.  Otherwise both the position in
+// the output buffer and the logical file position advance by one, and
+// fillBuffer() is called when the buffer position reaches or passes
+// _bzip2stream.avail_out.
+//
+inline
+bool
+bzipBuffer::next(void) {
+
+  if (_eof == false) {
+    _bzip2outPos++;
+    _filePos++;
+
+    if (_bzip2outPos >= _bzip2stream.avail_out)
+      fillBuffer();
+  }
+
+  return(_eof);
+}
+
+
+// Returns the character at the current output-buffer position without
+// moving.  The end-of-file flag is not consulted.
+inline
+char
+bzipBuffer::get(void) {
+  const char  *current = _bzip2out + _bzip2outPos;
+
+  return(*current);
+}
+
+
+// Returns the character at the current position, then advances with
+// next().  The character is captured before advancing because next()
+// may call fillBuffer().
+inline
+char
+bzipBuffer::getnext(void) {
+  const char  current = _bzip2out[_bzip2outPos];
+
+  (void)next();
+
+  return(current);
+}
+
+
+// Returns the logical position, i.e. the counter that next() bumps
+// once per character consumed.
+inline
+off_t
+bzipBuffer::tell(void) {
+  return _filePos;
+}
+
+
+#endif // BZIP_BUFFER_H
diff --git a/libutil/eliasDeltaEncoding.h b/libutil/eliasDeltaEncoding.h
new file mode 100644
index 0000000..f034534
--- /dev/null
+++ b/libutil/eliasDeltaEncoding.h
@@ -0,0 +1,33 @@
+#ifndef ELIAS_DELTA_ENCODING_H
+#define ELIAS_DELTA_ENCODING_H
+
+#include "bitPacking.h"
+
+// Writes 'val' at bit position 'pos' of the packed array 'ptr' as an
+// Elias-delta code: the value logBaseTwo64(val) is written with the
+// Elias-gamma encoder, followed by 'val' stored in one bit fewer than
+// that.  On return *siz is the gamma prefix size plus those bits.
+//
+// NOTE(review): presumably logBaseTwo64() is the bit length of val and
+// setDecodedValue() keeps only the low bits requested -- confirm in
+// bitPacking.h.
+//
+inline
+void
+setEliasDeltaEncodedNumber(uint64 *ptr,
+                           uint64  pos,
+                           uint64 *siz,
+                           uint64  val) {
+  const uint64  nbits = logBaseTwo64(val);
+
+  setEliasGammaEncodedNumber(ptr, pos, siz, nbits);
+
+  //  The payload starts right after the prefix just written.
+  setDecodedValue(ptr, pos + *siz, nbits - 1, val);
+
+  *siz += nbits - 1;
+}
+
+
+// Reads the Elias-delta code at bit position 'pos' of 'ptr' and
+// returns the decoded value; *siz receives the total number of bits
+// consumed (gamma prefix plus payload).
+//
+// The gamma-coded prefix, less one, is the number of payload bits; the
+// leading one bit of the value is not stored and is put back here.
+//
+inline
+uint64
+getEliasDeltaEncodedNumber(uint64 *ptr,
+                           uint64  pos,
+                           uint64 *siz) {
+  const uint64  nbits   = getEliasGammaEncodedNumber(ptr, pos, siz) - 1;
+  const uint64  payload = getDecodedValue(ptr, pos + *siz, nbits);
+
+  *siz += nbits;
+
+  return((uint64ONE << nbits) | payload);
+}
+
+
+
+#endif // ELIAS_DELTA_ENCODING_H
diff --git a/libutil/eliasGammaEncoding.h b/libutil/eliasGammaEncoding.h
new file mode 100644
index 0000000..de88f0a
--- /dev/null
+++ b/libutil/eliasGammaEncoding.h
@@ -0,0 +1,33 @@
+#ifndef ELIAS_GAMMA_ENCODING_H
+#define ELIAS_GAMMA_ENCODING_H
+
+#include "bitPacking.h"
+
+// Writes 'val' at bit position 'pos' of the packed array 'ptr' as an
+// Elias-gamma code: logBaseTwo64(val) in unary, followed by 'val' in
+// that many bits.  On return *siz is the unary prefix size plus the
+// payload bits.
+//
+inline
+void
+setEliasGammaEncodedNumber(uint64 *ptr,
+                           uint64  pos,
+                           uint64 *siz,
+                           uint64  val) {
+  const uint64  nbits = logBaseTwo64(val);
+
+  setUnaryEncodedNumber(ptr, pos, siz, nbits);
+
+  //  The payload starts right after the unary prefix just written.
+  setDecodedValue(ptr, pos + *siz, nbits, val);
+
+  *siz += nbits;
+}
+
+
+// Reads the Elias-gamma code at bit position 'pos' of 'ptr': a unary
+// count followed by that many payload bits, which are returned.  *siz
+// receives the total number of bits consumed.
+//
+inline
+uint64
+getEliasGammaEncodedNumber(uint64 *ptr,
+                           uint64  pos,
+                           uint64 *siz) {
+  const uint64  nbits   = getUnaryEncodedNumber(ptr, pos, siz);
+  const uint64  payload = getDecodedValue(ptr, pos + *siz, nbits);
+
+  *siz += nbits;
+
+  return(payload);
+}
+
+
+
+#endif // ELIAS_GAMMA_ENCODING_H
diff --git a/libutil/endianess.H b/libutil/endianess.H
new file mode 100644
index 0000000..72f8036
--- /dev/null
+++ b/libutil/endianess.H
@@ -0,0 +1,64 @@
+#ifndef ENDIANESS_H
+#define ENDIANESS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+// We need to test how to swap off_t and size_t
+
+// See also test/endianess.c
+
+// If we wanted to convert to network order for everything, rather
+// than convert only when needed, this would be useful.
+//
+#if 0
+// Returns true when this host does not store multi-byte integers in
+// network (big-endian) order, i.e. when values would have to be byte
+// swapped to be written in network order.
+//
+// A known 64-bit pattern is stored and its first byte in memory is
+// examined: a big-endian host sees the most significant byte, 0x01,
+// there.  (The earlier version compared against 0x0f, which is the
+// first byte on neither byte order, and did not compile.)
+//
+// NOTE(review): "swap needed" is read here as "host is not network
+// order", following the comment above this block -- confirm before
+// enabling.
+//
+inline
+bool
+checkEndianessSwapNeeded(void) {
+  union {
+    uint64         u;
+    unsigned char  c[8];
+  } u64t;
+
+  u64t.u = uint64NUMBER(0x0123456789abcdef);
+
+  return(u64t.c[0] != 0x01);
+}
+#endif
+
+
+// Reverses the byte order of a 64-bit value in three passes: swap the
+// bytes within each 16-bit pair, then the 16-bit halves within each
+// 32-bit word, then the two 32-bit words.
+inline
+uint64
+uint64Swap(uint64 x) {
+  const uint64  lo8  = uint64NUMBER(0x00ff00ff00ff00ff);
+  const uint64  hi8  = uint64NUMBER(0xff00ff00ff00ff00);
+  const uint64  lo16 = uint64NUMBER(0x0000ffff0000ffff);
+  const uint64  hi16 = uint64NUMBER(0xffff0000ffff0000);
+  const uint64  lo32 = uint64NUMBER(0x00000000ffffffff);
+  const uint64  hi32 = uint64NUMBER(0xffffffff00000000);
+
+  x = ((x >>  8) & lo8)  | ((x <<  8) & hi8);
+  x = ((x >> 16) & lo16) | ((x << 16) & hi16);
+  x = ((x >> 32) & lo32) | ((x << 32) & hi32);
+
+  return(x);
+}
+
+// Reverses the byte order of a 32-bit value: swap the bytes within
+// each 16-bit half, then swap the two halves.
+inline
+uint32
+uint32Swap(uint32 x) {
+  const uint32  lo8  = uint32NUMBER(0x00ff00ff);
+  const uint32  hi8  = uint32NUMBER(0xff00ff00);
+  const uint32  lo16 = uint32NUMBER(0x0000ffff);
+  const uint32  hi16 = uint32NUMBER(0xffff0000);
+
+  x = ((x >>  8) & lo8)  | ((x <<  8) & hi8);
+  x = ((x >> 16) & lo16) | ((x << 16) & hi16);
+
+  return(x);
+}
+
+// Exchanges the two bytes of a 16-bit value.
+inline
+uint16
+uint16Swap(uint16 x) {
+  return((uint16)(((x >> 8) & 0x00ff) | ((x << 8) & 0xff00)));
+}
+
+#endif // ENDIANESS_H
diff --git a/libutil/fibonacciEncoding.h b/libutil/fibonacciEncoding.h
new file mode 100644
index 0000000..e4c0c3b
--- /dev/null
+++ b/libutil/fibonacciEncoding.h
@@ -0,0 +1,171 @@
+#ifndef FIBONACCI_ENCODING_H
+#define FIBONACCI_ENCODING_H
+
+#include "bitPacking.h"
+
+// Routines to store and retrieve a Fibonacci encoded number to/from a
+// bit packed word array based at 'ptr' and currently at location
+// 'pos'. Both routines return the size of the encoded number in
+// 'siz'.
+//
+// FibEncoding can store values up to 17,167,680,177,565 (slightly
+// below 2^44, so at most a 44-bit number) in a 64-bit quantity.
+//
+// 93 bits (92 + 1) are needed to store up to 64-bit values.
+//
+// Remember that since we can't store 0, we increment all incoming
+// values, so the actual space used is:
+//
+// #### bits
+// 0 2
+// 1 3
+// 2 4
+// 3 4
+// 4 5
+// 5 5
+// 6 5
+// 7 6
+// 8 6
+// 9 6
+// 10 6
+// 11 6
+// 12 7
+// 20 8
+// 33 9
+// 54 10
+// 88 11
+// 143 12
+// 232 13
+// 376 14
+// 609 15
+// 986 16
+// 1596 17
+// 2583 18
+// 4180 19
+// 6764 20
+// 10945 21
+// 17710 22
+// 28656 23
+// 46387 24
+// 75024 25
+// 121392 26
+
+extern uint32 fibonacciValuesLen;
+extern uint64 fibonacciValues[92];
+
+// Writes val+1 as a Fibonacci code at bit position 'pos' of the packed
+// array 'ptr', and sets *siz to the number of bits written.
+//
+// The 128-bit code is assembled in out1 (table indices 0..63) and out2
+// (indices 64..127); the bit for table index i is bit (63 - i) of out1
+// or bit (127 - i) of out2, so index 0 is emitted first.  The table is
+// scanned from the largest entry down, taking every entry that still
+// fits in the remaining value.  The first entry taken also sets the
+// bit one index higher, which is the terminating 1 of the code.
+//
+// NOTE(review): if val is the all-ones value, val++ wraps to zero
+// (assuming uint64 is an unsigned 64-bit type); no entry is then taken
+// and a single 0 bit is written with *siz = 1.
+//
+inline
+void
+setFibonacciEncodedNumber(uint64 *ptr,
+ uint64 pos,
+ uint64 *siz,
+ uint64 val) {
+ uint64 out1 = uint64ZERO;
+ uint64 out2 = uint64ZERO;
+ uint32 fib = fibonacciValuesLen;
+ uint32 fibmax = uint64ZERO;
+
+ // We cannot store zero as a fibonacci number, so we simply
+ // increase everything by one.
+ //
+ val++;
+
+ // Estimate a starting point for our search; we need a function
+ // that is always slightly more than fib()
+ //
+ // Find the highest bit set, do a lookup
+ //
+ // XXX: Still need this!
+
+ // Without that estimate, the scan below always starts at the last
+ // table entry.
+ while (fib-- > 0) {
+ if (val >= fibonacciValues[fib]) {
+ if (fib >= 64)
+ out2 |= uint64ONE << (127 - fib);
+ else
+ out1 |= uint64ONE << (63 - fib);
+
+ val -= fibonacciValues[fib];
+
+ // fibmax is still zero only for the first (largest) entry taken;
+ // record the terminator index and set the terminator bit.
+ if (fibmax == uint64ZERO) {
+ fibmax = fib + 1;
+ if (fibmax >= 64)
+ out2 |= uint64ONE << (127 - fibmax);
+ else
+ out1 |= uint64ONE << (63 - fibmax);
+ }
+ }
+ }
+
+ // Convert the terminator index into a bit count.
+ fibmax++;
+
+ // Write the encoded numbers to the stream
+ //
+ // The code is left-justified in out1/out2, so it is shifted down to
+ // the low bits before being handed to setDecodedValue().
+ //
+ if (fibmax > 64) {
+ setDecodedValue(ptr, pos, 64, out1);
+ pos += 64;
+ out2 >>= (128 - fibmax);
+ setDecodedValue(ptr, pos, fibmax - 64, out2);
+ } else {
+ out1 >>= (64 - fibmax);
+ setDecodedValue(ptr, pos, fibmax, out1);
+ }
+
+ *siz = fibmax;
+}
+
+
+
+
+
+// Reads the Fibonacci code that starts at bit position 'pos' of the
+// packed array 'ptr' and returns the decoded value.  *siz receives the
+// number of bits in the code, including the terminating 1.
+//
+// Bits are taken most-significant first from each 64-bit word.  The
+// code ends at the first pair of adjacent 1 bits; every 1 bit before
+// the terminator adds the table entry for its index.  The encoder
+// stores val+1, so one is subtracted before returning.
+//
+// The word index is simply pos / 64.  It used to be masked with
+// 0x0000cfffffffffff, a typo that cleared valid index bits (44 and 45,
+// and everything from bit 48 up).
+//
+// NOTE(review): nothing bounds 'fib' by the table length, so a stream
+// with no terminating pair of 1 bits reads past fibonacciValues[].
+//
+inline
+uint64
+getFibonacciEncodedNumber(uint64 *ptr,
+                          uint64  pos,
+                          uint64 *siz) {
+  uint64 wrd = pos >> 6;
+  uint64 sft = 0x8000000000000000llu >> (pos & 0x000000000000003fllu);
+  uint64 val = 0;
+  uint32 fib = 0;
+  uint64 newbit;
+  uint64 oldbit;
+
+  //  Prime the two-bit window with the first two bits of the code.
+
+  oldbit = ptr[wrd] & sft;
+  sft >>= 1;
+  if (sft == uint64ZERO) {
+    wrd++;
+    sft = 0x8000000000000000llu;
+  }
+
+  newbit = ptr[wrd] & sft;
+  sft >>= 1;
+  if (sft == uint64ZERO) {
+    wrd++;
+    sft = 0x8000000000000000llu;
+  }
+
+  //  Slide the window until both bits are set.
+
+  while (!oldbit || !newbit) {
+    if (oldbit)
+      val += fibonacciValues[fib];
+
+    fib++;
+
+    oldbit = newbit;
+    newbit = ptr[wrd] & sft;
+    sft >>= 1;
+    if (sft == uint64ZERO) {
+      wrd++;
+      sft = 0x8000000000000000llu;
+    }
+  }
+
+  //  oldbit is the last value bit, newbit the terminator.
+
+  val += fibonacciValues[fib];
+
+  (*siz) = fib + 2;
+
+  //  Undo the +1 applied by setFibonacciEncodedNumber().
+  //
+  return(val - 1);
+}
+
+
+#endif // FIBONACCI_ENCODING_H
diff --git a/libutil/fibonacciNumbers.C b/libutil/fibonacciNumbers.C
new file mode 100644
index 0000000..6102aa1
--- /dev/null
+++ b/libutil/fibonacciNumbers.C
@@ -0,0 +1,108 @@
+#include "util.h"
+
+//
+// Argh, 64-bit guys use LU as their modifier, but 32-bit guys use LLU.
+//
+
+// _(VAL) pastes the unsigned 64-bit literal suffix onto VAL: LU when
+// TRUE64BIT is defined, LLU otherwise.
+#ifdef TRUE64BIT
+#define _(VAL) VAL ## LU
+#else
+#define _(VAL) VAL ## LLU
+#endif
+
+// Number of entries in fibonacciValues[].
+uint32
+fibonacciValuesLen = 92;
+
+// The table used by the Fibonacci encoder and decoder.  It starts
+// 1, 2, 3, 5, ... (no repeated 1) and each entry is the sum of the two
+// before it; the final entry, 12200160415121876738, is the last one
+// that fits in 64 bits.
+uint64
+fibonacciValues[92] = { _(1),
+ _(2),
+ _(3),
+ _(5),
+ _(8),
+ _(13),
+ _(21),
+ _(34),
+ _(55),
+ _(89),
+ _(144),
+ _(233),
+ _(377),
+ _(610),
+ _(987),
+ _(1597),
+ _(2584),
+ _(4181),
+ _(6765),
+ _(10946),
+ _(17711),
+ _(28657),
+ _(46368),
+ _(75025),
+ _(121393),
+ _(196418),
+ _(317811),
+ _(514229),
+ _(832040),
+ _(1346269),
+ _(2178309),
+ _(3524578),
+ _(5702887),
+ _(9227465),
+ _(14930352),
+ _(24157817),
+ _(39088169),
+ _(63245986),
+ _(102334155),
+ _(165580141),
+ _(267914296),
+ _(433494437),
+ _(701408733),
+ _(1134903170),
+ _(1836311903),
+ _(2971215073),
+ _(4807526976),
+ _(7778742049),
+ _(12586269025),
+ _(20365011074),
+ _(32951280099),
+ _(53316291173),
+ _(86267571272),
+ _(139583862445),
+ _(225851433717),
+ _(365435296162),
+ _(591286729879),
+ _(956722026041),
+ _(1548008755920),
+ _(2504730781961),
+ _(4052739537881),
+ _(6557470319842),
+ _(10610209857723),
+ _(17167680177565),
+ _(27777890035288),
+ _(44945570212853),
+ _(72723460248141),
+ _(117669030460994),
+ _(190392490709135),
+ _(308061521170129),
+ _(498454011879264),
+ _(806515533049393),
+ _(1304969544928657),
+ _(2111485077978050),
+ _(3416454622906707),
+ _(5527939700884757),
+ _(8944394323791464),
+ _(14472334024676221),
+ _(23416728348467685),
+ _(37889062373143906),
+ _(61305790721611591),
+ _(99194853094755497),
+ _(160500643816367088),
+ _(259695496911122585),
+ _(420196140727489673),
+ _(679891637638612258),
+ _(1100087778366101931),
+ _(1779979416004714189),
+ _(2880067194370816120),
+ _(4660046610375530309),
+ _(7540113804746346429),
+ _(12200160415121876738) };
diff --git a/libutil/file.c b/libutil/file.c
new file mode 100644
index 0000000..6c3fbc3
--- /dev/null
+++ b/libutil/file.c
@@ -0,0 +1,446 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include <sys/statvfs.h>
+
+#include "util.h"
+
+
+// Returns non-zero when the stream F is attached to a terminal, as
+// reported by isatty() on its file descriptor.
+int
+isHuman(FILE *F) {
+  const int  fd = fileno(F);
+
+  return(isatty(fd));
+}
+
+
+// MMAPFLAGS is the 'flags' argument mapFile() passes to mmap().  Every
+// platform asks for a shared mapping; the extra flags are per-platform
+// spellings.
+
+#ifdef __alpha
+unsigned long __sbrk_override = 1;  // See malloc(3) for details.
+
+#define MMAPFLAGS    (MAP_FILE | MAP_VARIABLE | MAP_SHARED)
+#endif
+
+#ifdef _AIX
+#define MMAPFLAGS    (MAP_FILE | MAP_VARIABLE | MAP_SHARED)
+#endif
+
+#ifdef __CYGWIN__
+#define MMAPFLAGS    (MAP_FILE | MAP_SHARED)
+#endif
+
+#ifdef __linux
+#define MMAPFLAGS    (MAP_FILE | MAP_SHARED)
+#endif
+
+#ifdef __FreeBSD__
+#define MMAPFLAGS    (MAP_FILE | MAP_SHARED)
+#endif
+
+#ifdef __sun
+#define MMAPFLAGS    (MAP_SHARED)
+#endif
+
+#ifdef __APPLE__
+#define MMAPFLAGS    (MAP_FILE | MAP_SHARED)
+#endif
+
+//  Platforms not listed above previously failed to compile, because
+//  MMAPFLAGS was left undefined.  MAP_SHARED alone is the portable
+//  POSIX request.
+//
+#ifndef MMAPFLAGS
+#define MMAPFLAGS    (MAP_SHARED)
+#endif
+
+
+
+
+// Creates an anonymous temporary file and returns it opened "w+".
+//
+// The file is created with mkstemp() as 'path'/XXXXXX, or as XXXXXX in
+// the current directory when path is NULL, and is unlinked before
+// returning, so it vanishes once the stream is closed.  Any failure
+// prints a message and exits.
+//
+// The name is built with a bounded snprintf() (an over-long 'path' used
+// to overflow the buffer), and each call is judged by its return value
+// rather than by errno, which a successful call may leave non-zero.
+//
+FILE*
+makeTempFile(char *path) {
+  char   template[PATH_MAX + 1];
+  int    len;
+  int    fildes;
+  FILE  *F;
+
+  if (path)
+    len = snprintf(template, sizeof(template), "%s/XXXXXX", path);
+  else
+    len = snprintf(template, sizeof(template), "XXXXXX");
+
+  if ((len < 0) || ((size_t)len >= sizeof(template))) {
+    fprintf(stderr, "Failed to create temporary file in '%s': path too long\n", path ? path : ".");
+    exit(1);
+  }
+
+  fildes = mkstemp(template);
+  if (fildes == -1) {
+    fprintf(stderr, "Failed to create temporary file '%s': %s\n", template, strerror(errno));
+    exit(1);
+  }
+
+  F = fdopen(fildes, "w+");
+  if (F == NULL) {
+    fprintf(stderr, "Failed to open temporary file '%s': %s\n", template, strerror(errno));
+    exit(1);
+  }
+
+  if (unlink(template) == -1) {
+    fprintf(stderr, "Failed to hide temporary file '%s': %s\n", template, strerror(errno));
+    exit(1);
+  }
+
+  return(F);
+}
+
+
+
+
+
+
+
+
+
+
+// Maps the whole of 'filename' into memory and returns the address;
+// *length receives the file size in bytes.
+//
+//   mode 'r' -- file opened read-only, pages mapped PROT_READ
+//   mode 'w' -- file opened read-write, pages mapped PROT_READ | PROT_WRITE
+//
+// Any other mode, or any failure, prints a message and exits.  The
+// descriptor is closed before returning; release the mapping with
+// unmapFile().
+//
+// open(), fstat() and mmap() are judged by their return values.  The
+// earlier code tested errno, which was cleared only before open() and
+// which a successful call may leave non-zero.
+//
+void*
+mapFile(const char *filename, uint64 *length, char mode) {
+  void        *ptr      = 0L;
+  struct stat  sb;
+  int          f;
+  int          openMode = O_RDONLY | O_LARGEFILE;
+  int          mapMode  = PROT_READ;
+
+  switch (mode) {
+    case 'r':
+      openMode = O_RDONLY | O_LARGEFILE;
+      mapMode  = PROT_READ;
+      break;
+    case 'w':
+      openMode = O_RDWR | O_LARGEFILE;
+      mapMode  = PROT_READ | PROT_WRITE;
+      break;
+    default:
+      fprintf(stderr, "Invalid mode to mapFile; must be 'r' or 'w'\n");
+      exit(1);
+      break;
+  }
+
+  f = open(filename, openMode);
+  if (f == -1) {
+    fprintf(stderr, "Couldn't open() '%s'\n%s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  if (fstat(f, &sb) == -1) {
+    fprintf(stderr, "Couldn't fstat() '%s'\n%s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  *length = sb.st_size;
+
+  ptr = mmap(0L, *length, mapMode, MMAPFLAGS, f, (off_t)0);
+  if (ptr == MAP_FAILED) {
+    fprintf(stderr, "Couldn't mmap() '%s'\n%s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  close(f);
+
+  return(ptr);
+}
+
+
+
+// Releases 'length' bytes of mapping starting at 'addr'.  The result
+// of munmap() is deliberately discarded.
+void
+unmapFile(void *addr, uint64 length) {
+#ifdef __sun
+  //  Sun's munmap() wants a caddr_t; the cast may be harmless
+  //  elsewhere, but it is only applied where it is known to be needed.
+  caddr_t  base = (caddr_t)addr;
+
+  (void)munmap(base, length);
+#else
+  (void)munmap(addr, length);
+#endif
+}
+
+
+
+
+
+// Copies all of srcFile to dstFile, returns the number of bytes written
+//
+// Appends the whole of the file named srcName to the already-open
+// stream dstFile, in pieces of at most 1MB, and returns the size of
+// the source as reported by sizeOfFile().  Any failure -- allocation,
+// open, read error, source shorter than its reported size, or an
+// incomplete write -- prints a message and exits.
+//
+// Failures are detected from the stdio results (a NULL stream,
+// ferror(), the item count from fwrite()) rather than from errno, so a
+// failed fopen() can no longer fall through with a NULL stream and a
+// short fwrite() is no longer ignored.
+//
+off_t
+copyFile(char *srcName, FILE *dstFile) {
+  off_t   srcSize     = 0;
+  off_t   bytesRemain = 0;
+  size_t  bytesWanted = 0;
+  size_t  bytesRead   = 0;
+  int     bufferSize  = 1024 * 1024;
+  char   *buffer      = 0L;
+  FILE   *srcFile     = 0L;
+
+  buffer = (char *)malloc(sizeof(char) * bufferSize);
+  if (buffer == 0L) {
+    fprintf(stderr, "copyFile()-- Can't allocate buffer.\n");
+    exit(1);
+  }
+
+  srcSize     = sizeOfFile(srcName);
+  bytesRemain = srcSize;
+
+  errno = 0;
+  srcFile = fopen(srcName, "r");
+  if (srcFile == 0L) {
+    fprintf(stderr, "copyFile()-- failed to open the '%s' during merge: %s\n", srcName, strerror(errno));
+    exit(1);
+  }
+
+  while (bytesRemain > 0) {
+    bytesWanted = (bytesRemain > bufferSize) ? (size_t)bufferSize : (size_t)bytesRemain;
+
+    errno = 0;
+    bytesRead = fread(buffer, sizeof(char), bytesWanted, srcFile);
+
+    if (ferror(srcFile)) {
+      fprintf(stderr, "copyFile()-- Error reading source: %s\n", strerror(errno));
+      exit(1);
+    }
+
+    //  No error but no data: the source ended before its reported size.
+    if (bytesRead == 0) {
+      fprintf(stderr, "copyFile()-- Short read (%d bytes) on source: %s\n", (int)bytesRead, strerror(errno));
+      exit(1);
+    }
+
+    errno = 0;
+    if (fwrite(buffer, sizeof(char), bytesRead, dstFile) != bytesRead) {
+      fprintf(stderr, "copyFile()-- Error writing %d bytes to destination: %s\n", (int)bytesRead, strerror(errno));
+      exit(1);
+    }
+
+    bytesRemain -= (off_t)bytesRead;
+  }
+
+  fclose(srcFile);
+  free(buffer);
+
+  return(srcSize);
+}
+
+
+
+
+
+// Takes a path to a file (that possibly doesn't exist) and returns
+// the number of MB (1048576 bytes) free in the directory of that
+// file.
+//
+// Returns the free space, in MB (1048576 bytes), on the filesystem
+// holding 'path'.  The result is f_frsize * f_bavail from statvfs(),
+// shifted down by 20 bits and truncated to 32 bits.
+//
+// If 'path' exists it is queried directly.  Otherwise its last
+// component is removed and the remaining directory is queried:
+//
+//   "dir/name"  ->  "dir"
+//   "/name"     ->  "/"    (used to become "", which statvfs rejects)
+//   "name", ""  ->  "."
+//
+// A statvfs() or allocation failure prints a message and exits.
+//
+uint32
+freeDiskSpace(char *path) {
+  char           *p, *t;
+  struct statvfs  dst;
+  struct stat     fst;
+  uint64          ret = 0;
+
+  if (stat(path, &fst) == 0) {
+    if (statvfs(path, &dst) == -1) {
+      perror("statvfs");
+      exit(1);
+    }
+  } else {
+    //  Two bytes beyond the length, so that "." and its terminator
+    //  fit even when 'path' is the empty string.
+    //
+    p = (char *)malloc(sizeof(char) * (strlen(path) + 2));
+    if (p == 0L) {
+      fprintf(stderr, "freeDiskSpace()-- Can't allocate path buffer.\n");
+      exit(1);
+    }
+
+    strcpy(p, path);
+    t = strrchr(p, '/');
+
+    if (t == 0L) {
+      p[0] = '.';
+      p[1] = 0;
+    } else if (t == p) {
+      p[1] = 0;
+    } else {
+      *t = 0;
+    }
+
+    if (statvfs(p, &dst) == -1) {
+      perror("statvfs");
+      exit(1);
+    }
+
+    free(p);
+  }
+
+  ret   = dst.f_frsize;
+  ret  *= dst.f_bavail;
+  ret >>= 20;
+
+  return((uint32)ret);
+}
+
+
+
+
+
+
+// Split writes/reads into smaller pieces, check the result of each
+// piece. Really needed by OSF1 (V5.1).
+//
+// Writes nbytes from 'buffer' to the descriptor 'filedes' in pieces of
+// at most 32MB, checking each piece.  A piece that sets errno or is
+// not written in full prints a diagnostic naming 'desc' and exits.
+void
+safeWrite(int filedes, const void *buffer, const char *desc, size_t nbytes) {
+  const size_t  pieceMax = 32 * 1024 * 1024;
+  const char   *bytes    = (const char *)buffer;
+  size_t        done     = 0;
+
+  while (done < nbytes) {
+    size_t  towrite = pieceMax;
+    size_t  written = 0;
+
+    if (done + towrite > nbytes)
+      towrite = nbytes - done;
+
+    errno   = 0;
+    written = write(filedes, bytes + done, towrite);
+
+    if ((errno != 0) || (written != towrite)) {
+      fprintf(stderr, "safeWrite()-- Write failure on %s: %s\n", desc, strerror(errno));
+      fprintf(stderr, "safeWrite()-- Wanted to write "int64FMT" bytes, wrote "int64FMT".\n", (int64)towrite, (int64)written);
+      exit(1);
+    }
+
+    done += written;
+  }
+}
+
+// Reads up to nbytes from the descriptor 'filedes' into 'buffer', in
+// pieces of at most 32MB, and returns the number of bytes stored.  The
+// count is less than nbytes only if end of file is reached first.
+//
+// A read error other than EINTR prints a diagnostic naming 'desc' and
+// exits.  When VERY_SAFE is defined, a piece that comes back shorter
+// than requested is treated as an error too.
+//
+// An interrupted read (EINTR) is retried.  It used to fall through
+// with read()'s -1 held in an unsigned variable and added to the
+// position, which moved the position back by one byte.
+//
+// NOTE(review): 'buffer' is written through although it is declared
+// const; the signature is kept for existing callers.
+//
+int
+safeRead(int filedes, const void *buffer, const char *desc, size_t nbytes) {
+  size_t   position = 0;
+  size_t   length   = 32 * 1024 * 1024;
+  size_t   toread   = 0;
+  ssize_t  nread    = 0;
+  int      failed   = 0;
+
+  while (position < nbytes) {
+    toread = length;
+    if (position + toread > nbytes)
+      toread = nbytes - position;
+
+    errno = 0;
+    nread = read(filedes, ((char *)buffer) + position, toread);
+
+    //  Interrupted before any data was transferred; try again.
+    if ((nread == -1) && (errno == EINTR))
+      continue;
+
+    failed = (nread == -1);
+#ifdef VERY_SAFE
+    if ((nread >= 0) && (toread != (size_t)nread))
+      failed = 1;
+#endif
+
+    if (failed) {
+      fprintf(stderr, "safeRead()-- Read failure on %s: %s.\n", desc, strerror(errno));
+      fprintf(stderr, "safeRead()-- Wanted to read "int64FMT" bytes, read "int64FMT".\n", (int64)toread, (int64)nread);
+      exit(1);
+    }
+
+    //  End of file.
+    if (nread == 0)
+      break;
+
+    position += (size_t)nread;
+  }
+
+  return(position);
+}
+
+
+
+// Closes a stream obtained from openFile().
+//
+// When 'path' is supplied and ends in ".bz2" or ".gz" the stream is
+// closed with pclose(), since openFile() opens such names with
+// popen(); otherwise, or when path is NULL, fclose() is used.
+//
+// The suffix is compared only when the path is at least as long as the
+// suffix.  Short paths used to be compared starting from an address
+// before the beginning of the string.
+//
+void
+closeFile(FILE *F, const char *path) {
+  size_t  len   = (path) ? strlen(path) : 0;
+  int     piped = 0;
+
+  if ((len >= 4) && (strcmp(path + len - 4, ".bz2") == 0))
+    piped = 1;
+  if ((len >= 3) && (strcmp(path + len - 3, ".gz") == 0))
+    piped = 1;
+
+  if (piped)
+    pclose(F);
+  else
+    fclose(F);
+}
+
+// Opens 'path' with stdio mode 'mode' and returns the stream.
+//
+// A name ending in ".bz2" or ".gz" is read or written through a pipe
+// to "bzip2" or "gzip"; only the exact modes "r" and "w" are accepted
+// for such names.  Any other name is handed to fopen() unchanged.
+// Failure prints a message and exits.  Close the result with
+// closeFile(), which chooses between pclose() and fclose() by the same
+// suffix rule.
+//
+// Suffixes are compared only when the path is long enough to hold
+// them, the command is built with a bounded snprintf(), and fopen() is
+// judged by its return value rather than by errno.
+//
+// NOTE(review): 'path' is pasted unquoted into a shell command.  Names
+// containing spaces or shell metacharacters will not work, and names
+// from an untrusted source must not be passed here.
+//
+FILE*
+openFile(const char *path, const char *mode) {
+  FILE   *F       = 0L;
+  size_t  len     = strlen(path);
+  int     isBz    = 0;
+  int     isGz    = 0;
+  int     isRead  = 0;
+  int     isWrite = 0;
+  int     cmdLen  = 0;
+  char    cmd[1024] = { 0 };
+
+  if ((len >= 4) && (strcmp(path + len - 4, ".bz2") == 0))
+    isBz = 1;
+  if ((len >= 3) && (strcmp(path + len - 3, ".gz") == 0))
+    isGz = 1;
+
+  if (strcmp(mode, "w") == 0)
+    isWrite = 1;
+  if (strcmp(mode, "r") == 0)
+    isRead = 1;
+
+  if (isBz) {
+    if (isRead) {
+      cmdLen = snprintf(cmd, sizeof(cmd), "bzip2 -dc %s", path);
+    } else if (isWrite) {
+      cmdLen = snprintf(cmd, sizeof(cmd), "bzip2 -9c > %s", path);
+    } else {
+      fprintf(stderr, "openFile()-- Error!  Requested mode '%s' unavailable for bzip2 file '%s'\n", mode, path);
+      exit(1);
+    }
+  } else if (isGz) {
+    if (isRead) {
+      cmdLen = snprintf(cmd, sizeof(cmd), "gzip -dc %s", path);
+    } else if (isWrite) {
+      cmdLen = snprintf(cmd, sizeof(cmd), "gzip -9c > %s", path);
+    } else {
+      fprintf(stderr, "openFile()-- Error!  Requested mode '%s' unavailable for gzip file '%s'\n", mode, path);
+      exit(1);
+    }
+  } else {
+    //  Must be a normal file!
+  }
+
+  if ((cmdLen < 0) || ((size_t)cmdLen >= sizeof(cmd))) {
+    fprintf(stderr, "openFile()-- Failed to open pipe for '%s': path too long\n", path);
+    exit(1);
+  }
+
+  if (cmd[0]) {
+    //  popen() doesn't reliably set errno, so only the return value is
+    //  checked and no system error text is printed.
+    F = popen(cmd, mode);
+    if (F == 0L)
+      fprintf(stderr, "openFile()-- Failed to open pipe '%s'\n", cmd), exit(1);
+  } else {
+    errno = 0;
+    F = fopen(path, mode);
+    if (F == 0L)
+      fprintf(stderr, "openFile()-- Failed to open '%s': %s\n", path, strerror(errno)), exit(1);
+  }
+
+  return(F);
+}
+
diff --git a/libutil/generalizedUnaryEncoding.h b/libutil/generalizedUnaryEncoding.h
new file mode 100644
index 0000000..1392f7a
--- /dev/null
+++ b/libutil/generalizedUnaryEncoding.h
@@ -0,0 +1,116 @@
+#ifndef GENERALIZED_UNARY_ENCODING_H
+#define GENERALIZED_UNARY_ENCODING_H
+
+#include "bitPacking.h"
+
+// Lots and lots of semi-useless debugging information
+//#define DEBUG_GENERALIZEDUNARYENCODING
+
+
+// Generalized unary encodings. Defined by (start, step, stop).
+// This implementation uses stop=infinity to encode all possible
+// numbers. If you know the highest number possible, you'll get a
+// slight decrease in space used ...
+
+// The method:
+//
+// The mth code word consists of 'm' unary encoded, followed by w =
+// start + m * step binary encoded bits. If w == stop, then the
+// terminator in the unary code is dropped.
+//
+// Encoding is tricky. Take the 3,2,9 example:
+// m w template # vals #'s
+// 0 3 1xxx 8 0- 7
+// 1 5 01xxxxx 32 8- 39
+// 2 7 001xxxxxxx 128 40-167
+// 3 9 000xxxxxxxxx 512 168-679
+//
+// I don't see a nice way of mapping our number n to the prefix m,
+// short of some sort of search. The implementation below is
+// probably very slow.
+//
+// On the bright side, decoding is trivial. Read back the unary
+// encoded number, then read that many bits to get the value.
+//
+
+static const uint64 _genunary_start = 3;
+static const uint64 _genunary_step = 2;
+//static const uint64 _genunary_stop = ~uint64ZERO;
+
+
+// Writes 'val' at bit position 'pos' of the packed array 'ptr' as a
+// generalized unary code with start _genunary_start, step
+// _genunary_step and no stop value.
+//
+// The prefix m is found by search: starting with a width of 'start'
+// bits, each narrower class of 2^width values is subtracted from val
+// until what is left fits in the current width.  m is then written in
+// unary, followed by the remainder in 'width' bits.  *siz is set to
+// m + 1 + width.
+//
+inline
+void
+setGeneralizedUnaryEncodedNumber(uint64 *ptr,
+                                 uint64  pos,
+                                 uint64 *siz,
+                                 uint64  val) {
+  uint64  m = uint64ZERO;
+  uint64  w = _genunary_start;
+  uint64  n = uint64ONE << w;
+
+#ifdef DEBUG_GENERALIZEDUNARYENCODING
+  fprintf(stderr, " val="uint64FMT" try n="uint64FMT" for m="uint64FMT"\n", val, n, m);
+#endif
+
+  //  n is the number of values the current width can hold.
+
+  for (; val >= n; ) {
+    val -= n;
+    m   += 1;
+    w   += _genunary_step;
+    n    = uint64ONE << w;
+#ifdef DEBUG_GENERALIZEDUNARYENCODING
+    fprintf(stderr, " val="uint64FMT" try n="uint64FMT" for m="uint64FMT"\n", val, n, m);
+#endif
+  }
+
+#ifdef DEBUG_GENERALIZEDUNARYENCODING
+  fprintf(stderr, "val="uint64FMT" found m="uint64FMT"\n", val, m);
+#endif
+
+  //  Prefix first, then the remainder directly after it.
+
+  setUnaryEncodedNumber(ptr, pos, siz, m);
+  setDecodedValue(ptr, pos + *siz, w, val);
+
+  *siz = m + 1 + w;
+}
+
+
+
+// Reads the generalized unary code at bit position 'pos' of 'ptr' and
+// returns the decoded value; *siz is set to m + 1 + w, the size the
+// encoder reports.
+//
+// The unary prefix m gives the payload width w = start + m * step.
+// The stored payload excludes every value representable with a
+// narrower width, so 2^width is added back for each narrower width.
+//
+inline
+uint64
+getGeneralizedUnaryEncodedNumber(uint64 *ptr,
+                                 uint64  pos,
+                                 uint64 *siz) {
+  const uint64  m   = getUnaryEncodedNumber(ptr, pos, siz);
+  const uint64  w   = _genunary_start + m * _genunary_step;
+  uint64        val = getDecodedValue(ptr, pos + *siz, w);
+
+  *siz = m + 1 + w;
+
+#ifdef DEBUG_GENERALIZEDUNARYENCODING
+  fprintf(stderr, "m="uint64FMT" w="uint64FMT" val="uint64FMT"\n", m, w, val);
+#endif
+
+  //  Add in the implicitly stored pieces of the number.
+
+  uint64  width = w;
+
+  for (uint64 k = 0; k < m; k++) {
+    width -= _genunary_step;
+    val   += uint64ONE << width;
+  }
+
+  return(val);
+}
+
+
+
+
+#endif // GENERALIZED_UNARY_ENCODING_H
diff --git a/libutil/intervalList.H b/libutil/intervalList.H
new file mode 100644
index 0000000..315cace
--- /dev/null
+++ b/libutil/intervalList.H
@@ -0,0 +1,675 @@
+#ifndef INTERVALLIST_H
+#define INTERVALLIST_H
+
+#include <algorithm>
+
+// iNum - lo, hi - coordinates of the interval
+// iVal - va - data stored at each interval
+// uint32 - ct - number of elements in this interval
+// - when merged, needs function that converts multiple iVal and a uint32 into a single iVal
+
+// One entry of an intervalList: the coordinates lo and hi, the number
+// of source intervals it stands for, and a caller-supplied value.
+// Entries order by lo, with ties broken by hi.
+template <class iNum, class iVal=int32>
+class _intervalPair {
+public:
+  iNum    lo;
+  iNum    hi;
+  uint32  ct;  //  Number of source intervals
+  iVal    va;  //  Value attached to this interval
+
+  bool  operator<(const _intervalPair &that) const {
+    return((lo != that.lo) ? (lo < that.lo) : (hi < that.hi));
+  };
+};
+
+
+// A single depth-change event: at 'pos' the value 'change' starts
+// (open == true) or stops applying.  Events order by position; at the
+// same position an opening event sorts before a closing one.
+template <class iNum, class iVal=int32>
+class intervalDepthRegions {
+public:
+  iNum  pos;     //  Position of the change in depth
+  iVal  change;  //  The value associated with this object; added or subtracted from 'va'.
+  bool  open;    //  If true, the start of a new interval
+
+  bool  operator<(const intervalDepthRegions &that) const {
+    return((pos != that.pos) ? (pos < that.pos) : (open > that.open));
+  };
+};
+
+
+
+
+
+// A growable array of intervals (lo, hi), each carrying a source count
+// and a value.  Intervals are appended with add(); sort() and merge()
+// put the list in order and combine overlapping entries, and two flags
+// remember whether that work is already done.
+//
+// The object owns _list (freed in the destructor).  Note that the
+// constructor taking another intervalList does not copy it -- it
+// builds a depth list from it; use operator= to copy.
+//
+template <class iNum, class iVal=int32>
+class intervalList {
+public:
+ // An empty list with room for initialSize intervals.
+ intervalList(uint32 initialSize=32) {
+ _isSorted = true;
+ _isMerged = true;
+ _listLen = 0;
+ _listMax = initialSize;
+ _list = new _intervalPair<iNum, iVal> [_listMax];
+ };
+
+ // Takes as input an unmerged intervalList and builds a new set of
+ // intervals, one for each 'depth'. Two intervals, (1,4) and (2,6),
+ // would return 'depths':
+ // 1,2,1 bgn=1, end=2, depth=1
+ // 2,4,2
+ // 4,6,1
+ //
+ intervalList(intervalList<iNum, iVal> &IL) {
+ _isSorted = false;
+ _isMerged = false;
+ _listLen = 0;
+ _listMax = 0;
+ _list = 0L;
+
+ depth(IL);
+ };
+
+ // Builds a depth list from idlen depth-change events. The caller's
+ // array 'id' is sorted in place.
+ intervalList(intervalDepthRegions<iNum, iVal> *id, uint32 idlen) {
+ _isSorted = false;
+ _isMerged = false;
+ _listLen = 0;
+ _listMax = 0;
+ _list = 0L;
+
+#ifdef _GLIBCXX_PARALLEL
+ // Don't use the parallel sort, not with the expense of starting threads.
+ __gnu_sequential::sort(id, id + idlen);
+#else
+ std::sort(id, id + idlen);
+#endif
+
+ computeDepth(id, idlen);
+ };
+
+ ~intervalList() {
+ delete [] _list;
+ };
+
+ intervalList<iNum, iVal> &operator=(intervalList<iNum, iVal> &src);
+
+ // Empties the list; the allocation is kept.
+ void clear(void) {
+ _isSorted = true;
+ _isMerged = true;
+ _listLen = 0;
+ }
+
+ void add(iNum position, iNum length, iVal value=0);
+ void sort(void);
+ void merge(uint32 minOverlap=0); // Merge overlapping regions
+ void merge(intervalList<iNum, iVal> *IL); // Insert IL into this list
+
+ void intersect(intervalList<iNum, iVal> &A,
+ intervalList<iNum, iVal> &B);
+
+ uint32 overlapping(iNum lo,
+ iNum hi,
+ uint32 *&intervals,
+ uint32 &intervalsLen,
+ uint32 &intervalsMax);
+
+ // Populates this intervalList with regions in A that are completely
+ // contained in a region in B.
+ //
+ // Both A and B call merge().
+ //
+ void contained(intervalList<iNum, iVal> &A,
+ intervalList<iNum, iVal> &B);
+
+ void invert(iNum lo, iNum hi);
+
+ void depth(intervalList<iNum, iVal> &A);
+
+ uint32 numberOfIntervals(void) { return(_listLen); };
+
+ // Sum of (hi - lo) over every interval currently in the list.
+ iNum sumOfLengths(void) {
+ iNum len = 0;
+ uint32 i = numberOfIntervals();
+
+ if (i > 0)
+ while (i--)
+ len += _list[i].hi - _list[i].lo;
+
+ return(len);
+ };
+
+ // Accessors return references into the list; 'i' is not range checked.
+ iNum &lo(uint32 i) { return(_list[i].lo); };
+ iNum &hi(uint32 i) { return(_list[i].hi); };
+
+ // count() and depth() are two names for the same field, ct.
+ uint32 &count(uint32 i) { return(_list[i].ct); }; // Number of source intervals.
+ uint32 &depth(uint32 i) { return(_list[i].ct); }; // Depth, if converted.
+ iVal &value(uint32 i) { return(_list[i].va); }; // Value or sum of values.
+
+private:
+ void computeDepth(intervalDepthRegions<iNum, iVal> *id, uint32 idlen);
+
+
+ bool _isSorted; // cleared by add(); set by sort()
+ bool _isMerged; // cleared by add(); set by merge()
+
+ uint32 _listMax; // entries allocated in _list
+ uint32 _listLen; // entries in use
+ _intervalPair<iNum, iVal> *_list;
+};
+
+
+
+
+
+
+// Makes this list a copy of 'src': same flags, same intervals.  The
+// array is reallocated only when src's capacity exceeds ours.
+template <class iNum, class iVal>
+intervalList<iNum, iVal> &
+intervalList<iNum, iVal>::operator=(intervalList &src) {
+  const bool  tooSmall = (_listMax < src._listMax);
+
+  _isSorted = src._isSorted;
+  _isMerged = src._isMerged;
+
+  if (tooSmall) {
+    delete [] _list;
+    _listMax = src._listMax;
+    _list    = new _intervalPair<iNum, iVal> [_listMax];
+  }
+
+  _listLen = src._listLen;
+
+  //  Entries are copied as raw bytes.
+  memcpy(_list, src._list, _listLen * sizeof(_intervalPair<iNum, iVal>));
+
+  return(*this);
+}
+
+
+// Appends the interval [position, position + length) with a source
+// count of one and the given value.  The list is marked unsorted and
+// unmerged; no attempt is made to detect that it is still in order.
+//
+// The array doubles when full.  A capacity of zero (for instance after
+// intervalList(0)) is first raised to 32, the default initial size;
+// doubling zero left the capacity at zero and the store below wrote
+// past the end of the array.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::add(iNum position, iNum length, iVal val) {
+
+  if (_listLen >= _listMax) {
+    _listMax = (_listMax == 0) ? 32 : _listMax * 2;
+
+    _intervalPair<iNum, iVal> *l = new _intervalPair<iNum, iVal> [_listMax];
+
+    if (_listLen > 0)
+      memcpy(l, _list, sizeof(_intervalPair<iNum, iVal>) * _listLen);
+
+    delete [] _list;
+    _list = l;
+  }
+
+  _list[_listLen].lo = position;
+  _list[_listLen].hi = position + length;
+  _list[_listLen].ct = 1;
+  _list[_listLen].va = val;
+
+  //  Could optimize, and search the list to see if these are false,
+  //  but that's rather expensive.
+  _isSorted = false;
+  _isMerged = false;
+
+  _listLen++;
+}
+
+
+
+// Orders the intervals by lo, then hi.  Does nothing when the list is
+// already flagged as sorted.
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::sort(void) {
+
+  if (_isSorted == false) {
+    if (_listLen > 1) {
+#ifdef _GLIBCXX_PARALLEL
+      //  Don't use the parallel sort, not with the expense of starting threads.
+      __gnu_sequential::sort(_list, _list + _listLen);
+#else
+      std::sort(_list, _list + _listLen);
+#endif
+    }
+
+    _isSorted = true;
+  }
+}
+
+
+// Sorts the list, then collapses overlapping intervals in place.
+//
+// Two neighbouring intervals are combined when the later one lies
+// entirely inside the earlier one, or when they overlap by at least
+// minOverlap (with the default of zero, intervals that merely touch
+// are combined).  The combined interval covers both, and its count and
+// value are the sums of the counts and values.  Does nothing when the
+// list is already flagged as merged.
+//
+// Fixes relative to the earlier version:
+//  - when refilling an empty (0,0) slot, the value was stored into
+//    'ct', clobbering the count and never setting 'va';
+//  - the overlap test was 'hi - minOverlap >= lo', which wraps for
+//    unsigned coordinates when hi < minOverlap and so merged intervals
+//    that do not overlap.  It is now 'lo + minOverlap <= hi'.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::merge(uint32 minOverlap) {
+  uint32  thisInterval = 0;   //  interval being grown
+  uint32  nextInterval = 1;   //  interval being examined
+
+  if (_isMerged)
+    return;
+
+  sort();
+
+  while (nextInterval < _listLen) {
+
+    if ((_list[thisInterval].lo == 0) &&
+        (_list[thisInterval].hi == 0)) {
+
+      //  Our interval is empty.  Copy in the interval we are
+      //  examining and move to the next.
+      //
+      //  XXX This is probably useless, thisInterval should always be
+      //  valid.
+
+      _list[thisInterval].lo = _list[nextInterval].lo;
+      _list[thisInterval].hi = _list[nextInterval].hi;
+      _list[thisInterval].ct = _list[nextInterval].ct;
+      _list[thisInterval].va = _list[nextInterval].va;
+
+      _list[nextInterval].lo = 0;
+      _list[nextInterval].hi = 0;
+
+      nextInterval++;
+    } else {
+
+      //  This interval is valid.  See if it overlaps with the next
+      //  interval.
+
+      bool  intersects = false;
+
+      //  next is contained in this
+      if ((_list[thisInterval].lo <= _list[nextInterval].lo) &&
+          (_list[nextInterval].hi <= _list[thisInterval].hi))
+        intersects = true;
+
+      //  next has thick overlap to this
+      if (_list[nextInterval].lo + (iNum)minOverlap <= _list[thisInterval].hi)
+        intersects = true;
+
+      if (intersects) {
+
+        //  Merge nextInterval into thisInterval -- the hi range
+        //  is extended if the nextInterval range is larger.
+        //
+        if (_list[thisInterval].hi < _list[nextInterval].hi)
+          _list[thisInterval].hi = _list[nextInterval].hi;
+
+        _list[thisInterval].ct += _list[nextInterval].ct;
+        _list[thisInterval].va += _list[nextInterval].va;
+
+        //  Clear the just merged nextInterval and move to the next one.
+        //
+        _list[nextInterval].lo = 0;
+        _list[nextInterval].hi = 0;
+        _list[nextInterval].ct = 0;
+        _list[nextInterval].va = 0;
+
+        nextInterval++;
+      } else {
+
+        //  No intersection.  The examined interval becomes the next
+        //  one to grow.  If earlier merges opened a gap between the
+        //  two indices, slide it down to close the gap.
+
+        thisInterval++;
+
+        if (thisInterval != nextInterval) {
+          _list[thisInterval].lo = _list[nextInterval].lo;
+          _list[thisInterval].hi = _list[nextInterval].hi;
+          _list[thisInterval].ct = _list[nextInterval].ct;
+          _list[thisInterval].va = _list[nextInterval].va;
+        }
+
+        nextInterval++;
+      }
+    }
+  }
+
+  //  Everything after thisInterval has been merged away.
+
+  if (thisInterval+1 < _listLen)
+    _listLen = thisInterval + 1;
+
+  _isMerged = true;
+}
+
+
+
+// Appends every interval of IL to this list through add().  Only the
+// coordinates are carried over: each appended interval gets add()'s
+// default value and a count of one.
+//
+// NOTE(review): IL's stored values and counts are not copied --
+// confirm that is intended.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::merge(intervalList<iNum, iVal> *IL) {
+  const _intervalPair<iNum, iVal>  *src = IL->_list;
+
+  for (uint32 i=0; i<IL->_listLen; i++, src++)
+    add(src->lo, src->hi - src->lo);
+}
+
+
+
+// Replaces this list with its complement inside [invlo, invhi): after
+// merging, the new list holds the gap before the first interval, the
+// gaps between consecutive intervals, and the gap after the last one.
+// Every new interval has a count of one and a value of zero.
+//
+// An empty list becomes the single interval (invlo, invhi).  Existing
+// intervals are not clipped to the range; a gap is emitted only when
+// its low end is strictly below its high end.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::invert(iNum invlo, iNum invhi) {
+
+  merge();
+
+  //  At most one gap more than there are intervals.
+  //
+  uint32                      invMax = _listLen + 2;
+  uint32                      invLen = 0;
+  _intervalPair<iNum, iVal>  *inv    = new _intervalPair<iNum, iVal> [invMax];
+
+  if (_listLen == 0) {
+    inv[invLen].lo = invlo;
+    inv[invLen].hi = invhi;
+    inv[invLen].ct = 1;
+    inv[invLen].va = 0;
+    invLen++;
+  } else {
+
+    //  Gap g runs from the end of interval g-1 (or invlo) to the
+    //  start of interval g (or invhi).
+    //
+    for (uint32 g=0; g<=_listLen; g++) {
+      iNum  gapLo = (g == 0)        ? invlo : _list[g-1].hi;
+      iNum  gapHi = (g == _listLen) ? invhi : _list[g].lo;
+
+      if (gapLo < gapHi) {
+        inv[invLen].lo = gapLo;
+        inv[invLen].hi = gapHi;
+        inv[invLen].ct = 1;
+        inv[invLen].va = 0;
+        invLen++;
+      }
+    }
+  }
+
+  assert(invLen <= invMax);
+
+  //  Nuke the old list, swap in the new one
+  delete [] _list;
+
+  _list    = inv;
+  _listLen = invLen;
+  _listMax = invMax;
+}
+
+
+
+// Adds to this list every region that is covered by both A and B.
+//
+// A and B are merged first (so both are modified), then walked in
+// step: the overlap of the current pair, if any, is added, and
+// whichever interval ends first is advanced (both, if they end
+// together).  Regions are appended with add(), so anything already in
+// this list is kept.
+//
+// Coordinates are held as iNum.  They used to be copied into uint32
+// locals, which truncated 64-bit coordinates and mangled signed or
+// floating-point ones.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::intersect(intervalList<iNum, iVal> &A,
+                                    intervalList<iNum, iVal> &B) {
+  A.merge();
+  B.merge();
+
+  uint32  ai = 0;
+  uint32  bi = 0;
+
+  while ((ai < A.numberOfIntervals()) &&
+         (bi < B.numberOfIntervals())) {
+    iNum  al = A.lo(ai);
+    iNum  ah = A.hi(ai);
+    iNum  bl = B.lo(bi);
+    iNum  bh = B.hi(bi);
+    iNum  nl = 0;
+    iNum  nh = 0;
+
+    //  If they intersect, make a new region
+    //
+    if ((al <= bl) && (bl < ah)) {
+      nl = bl;
+      nh = (ah < bh) ? ah : bh;
+    }
+
+    if ((bl <= al) && (al < bh)) {
+      nl = al;
+      nh = (ah < bh) ? ah : bh;
+    }
+
+    if (nl < nh)
+      add(nl, nh - nl);
+
+    //  Advance the list with the earlier region.
+    //
+    if (ah < bh) {
+      //  A ends before B
+      ai++;
+    } else if (ah > bh) {
+      //  B ends before A
+      bi++;
+    } else {
+      //  Exactly the same ending!
+      ai++;
+      bi++;
+    }
+  }
+}
+
+
+
+//  Populates an array with the indices of the intervals that overlap the
+//  supplied range.  The comparison is inclusive at both ends, so an
+//  interval that merely touches rangelo or rangehi is reported.
+//
+//  Naive implementation that is easy to verify (and that works on an
+//  unsorted list).
+//
+//  intervals     - output array; allocated here (256 entries) when null,
+//                  and replaced by a larger array when it fills up.
+//  intervalsLen  - set to the number of indices stored.
+//  intervalsMax  - capacity of 'intervals'; updated on (re)allocation.
+//
+//  Returns intervalsLen.
+//
+template <class iNum, class iVal>
+uint32
+intervalList<iNum, iVal>::overlapping(iNum      rangelo,
+                                      iNum      rangehi,
+                                      uint32  *&intervals,
+                                      uint32   &intervalsLen,
+                                      uint32   &intervalsMax) {
+
+  if (intervals == 0L) {
+    intervalsMax = 256;
+    intervals    = new uint32 [intervalsMax];
+  }
+
+  intervalsLen = 0;
+
+  for (uint32 i=0; i<_listLen; i++) {
+    if ((rangelo <= _list[i].hi) &&
+        (rangehi >= _list[i].lo)) {
+      if (intervalsLen >= intervalsMax) {
+        //  Grow geometrically.  A caller-supplied buffer whose capacity is
+        //  zero cannot be doubled (0 * 2 == 0), which would leave no room
+        //  for the store below; start such a buffer at the default size.
+        intervalsMax = (intervalsMax == 0) ? 256 : intervalsMax * 2;
+
+        uint32 *X = new uint32 [intervalsMax];
+        memcpy(X, intervals, sizeof(uint32) * intervalsLen);
+        delete [] intervals;
+        intervals = X;
+      }
+
+      intervals[intervalsLen++] = i;
+    }
+  }
+
+  return(intervalsLen);
+}
+
+
+
+//  Adds to this list every interval of B that fully contains the interval
+//  of A it is being compared against.
+//
+//  Both A and B are merged first (this modifies the arguments).  The two
+//  lists are walked in step, advancing whichever current interval ends
+//  first.
+//
+//  Coordinates are held in iNum, the list's own coordinate type, so that
+//  lists instantiated with a type wider than 32 bits (or a signed or
+//  floating point type) are not truncated.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::contained(intervalList<iNum, iVal> &A,
+                                    intervalList<iNum, iVal> &B) {
+  A.merge();
+  B.merge();
+
+  uint32  ai = 0;
+  uint32  bi = 0;
+
+  while ((ai < A.numberOfIntervals()) &&
+         (bi < B.numberOfIntervals())) {
+    iNum   al = A.lo(ai);
+    iNum   ah = A.hi(ai);
+    iNum   bl = B.lo(bi);
+    iNum   bh = B.hi(bi);
+
+    //  If A is contained in B, make a new region (covering all of B's
+    //  interval, not just A's).
+    //
+    if ((bl <= al) && (ah <= bh))
+      add(bl, bh - bl);
+
+#if 0
+    if ((al <= bl) && (bh <= ah))
+      add(al, ah - al);
+#endif
+
+    //  Advance the list with the earlier region.
+    //
+    if        (ah < bh) {
+      //  A ends before B
+      ai++;
+    } else if (ah > bh) {
+      //  B ends before A
+      bi++;
+    } else {
+      //  Exactly the same ending!
+      ai++;
+      bi++;
+    }
+  }
+}
+
+
+
+
+
+
+
+//  Builds the depth-of-coverage description of IL in this list.
+//
+//  Every interval of IL contributes two events -- an 'open' event at its low
+//  coordinate and a 'close' event at its high coordinate, each carrying the
+//  interval's value -- and the event array is then handed to computeDepth().
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::depth(intervalList<iNum, iVal> &IL) {
+  uint32                             nIntervals = IL.numberOfIntervals();
+  uint32                             nEvents    = nIntervals * 2;
+  intervalDepthRegions<iNum, iVal>  *events     = new intervalDepthRegions<iNum, iVal> [nEvents];
+
+  for (uint32 src=0, dst=0; src<nIntervals; src++, dst+=2) {
+    intervalDepthRegions<iNum, iVal>  &opening = events[dst];
+    intervalDepthRegions<iNum, iVal>  &closing = events[dst + 1];
+
+    opening.pos    = IL.lo(src);
+    opening.change = IL.value(src);
+    opening.open   = true;
+
+    closing.pos    = IL.hi(src);
+    closing.change = IL.value(src);
+    closing.open   = false;
+  }
+
+  computeDepth(events, nEvents);
+
+  delete [] events;
+}
+
+
+
+// Rebuilds this list from an array of open/close events.
+//
+// The events are sorted in place, then swept in order. For each stretch
+// between events the list records ct (number of opens minus number of closes
+// seen so far) and va (sum of 'change' over opens minus sum over closes).
+// The previous contents of the list are discarded.
+//
+template <class iNum, class iVal>
+void
+intervalList<iNum, iVal>::computeDepth(intervalDepthRegions<iNum, iVal> *id, uint32 idlen) {
+
+ // No intervals input? No intervals output.
+
+ _listLen = 0;
+
+ if (idlen == 0)
+ return;
+
+ // Sort by coordinate (using whatever ordering intervalDepthRegions defines).
+
+#ifdef _GLIBCXX_PARALLEL
+ // Don't use the parallel sort, not with the expense of starting threads.
+ __gnu_sequential::sort(id, id + idlen);
+#else
+ std::sort(id, id + idlen);
+#endif
+
+ // Scan the list, counting how many times we change depth.
+
+#if 0
+ uint32 lm = 1;
+
+ for (uint32 i=1; i<idlen; i++)
+ if (id[i-1].pos != id[i].pos)
+ lm++;
+#endif
+
+ // But then admit we don't really know how many, and reset to the maximum possible.
+
+ // Allocate the real depth of coverage intervals; the old contents are not kept.
+
+ if (_listMax < idlen) {
+ delete [] _list;
+
+ _listMax = idlen;
+ _list = new _intervalPair<iNum, iVal> [_listMax];
+ }
+
+ // Init first interval. After sorting, the first event must be an open.
+
+ assert(id[0].open == true);
+
+ _list[_listLen].lo = id[0].pos;
+ _list[_listLen].hi = id[0].pos;
+ _list[_listLen].ct = 1;
+ _list[_listLen].va = id[0].change;
+
+ uint32 nct;
+ iVal nva;
+
+ for (uint32 i=1; i<idlen; i++) {
+ // Update the end of the current interval.
+ _list[_listLen].hi = id[i].pos;
+
+ // Compute the count and value of the next interval.
+ if (id[i].open == true) {
+ nct = _list[_listLen].ct + 1;
+ nva = _list[_listLen].va + id[i].change;
+ } else {
+ nct = _list[_listLen].ct - 1;
+ nva = _list[_listLen].va - id[i].change;
+ }
+
+ // If the position or value is different, make a new interval,
+ // But only if this interval is not null length.
+ if (((id[i-1].pos != id[i].pos) ||
+ (_list[_listLen].va != nva)) &&
+ (_list[_listLen].lo != _list[_listLen].hi)) {
+ _listLen++;
+
+ _list[_listLen].lo = id[i].pos;
+ _list[_listLen].ct = _list[_listLen-1].ct;
+ _list[_listLen].va = _list[_listLen-1].va;
+ }
+
+ // Finally, update whatever interval is current.
+ _list[_listLen].hi = id[i].pos;
+ _list[_listLen].ct = nct;
+ _list[_listLen].va = nva;
+
+ // Now, if this interval's begin is the same as the last interval's end,
+ // we need to merge.
+ // NOTE(review): the guard is _listLen > 1, so intervals 0 and 1 are never
+ // merged with each other -- confirm that is intended.
+ if ((_listLen > 1) &&
+ (_list[_listLen-1].hi == _list[_listLen].lo) &&
+ (_list[_listLen-1].ct == _list[_listLen].ct) &&
+ (_list[_listLen-1].va == _list[_listLen].va)) {
+ _list[_listLen-1].hi = _list[_listLen].hi;
+ _listLen--;
+ }
+
+#if 0
+ fprintf(stderr, "id[%2d] - list[%u] = lo=%u hi=%u ct=%u va=%f\n",
+ i,
+ _listLen,
+ _list[_listLen].lo,
+ _list[_listLen].hi,
+ _list[_listLen].ct,
+ _list[_listLen].va);
+#endif
+ }
+
+ // NOTE(review): _listLen is used above as the index of the interval being
+ // built and is left at that index here -- presumably the final slot is the
+ // zero-depth remainder after the last close; confirm against callers.
+ assert(_listLen > 0);
+ assert(_listLen <= _listMax);
+}
+
+
+
+#endif // INTERVALLIST_H
diff --git a/libutil/kazlib/Make.include b/libutil/kazlib/Make.include
new file mode 100644
index 0000000..7ecb369
--- /dev/null
+++ b/libutil/kazlib/Make.include
@@ -0,0 +1,27 @@
+# -*- makefile -*-
+
+# Sources and headers of the kazlib containers built in this directory.
+# ($/ is a variable supplied by the including makefile -- presumably the
+# path of this directory; confirm against Make.rules.)
+src := $/dict.c \
+ $/dict.h \
+ $/except.c \
+ $/except.h \
+ $/hash.c \
+ $/hash.h \
+ $/list.c \
+ $/list.h \
+ $/sfx.c \
+ $/sfx.h
+
+# Test drivers; this variable is not referenced again in this fragment.
+tst := $/dict-main.c \
+ $/except-main.c \
+ $/hash-main.c \
+ $/list-main.c \
+ $/sfx-main.c
+
+# Split ${src} by suffix: %.c entries and %.C entries (none of the files
+# listed above ends in .C, so the second filter yields nothing).
+$/.C_SRCS :=$(filter %.c,${src})
+$/.CXX_SRCS :=$(filter %.C,${src})
+$/.CXX_LIBS :=$/libkaz.a
+
+$/.CLEAN := $/*.o
+
+# The archive depends on one .o per source file selected above.
+$/libkaz.a: ${$/.C_SRCS:.c=.o} ${$/.CXX_SRCS:.C=.o}
+
diff --git a/libutil/kazlib/blast.pl b/libutil/kazlib/blast.pl
new file mode 100755
index 0000000..63351c9
--- /dev/null
+++ b/libutil/kazlib/blast.pl
@@ -0,0 +1,33 @@
+#!/usr/bin/perl
+
+#
+# This is a program whose output can be piped to the test drivers for
+# hash.c and dict.c. It inserts a bunch of data and then deletes it all.
+#
+# The $modulus should be a prime number. This ensures that the $modulus - 1
+# generated keys are all distinct. The $factor_i and $factor_d values need not
+# be prime, but neither may be a multiple of $modulus (including zero),
+# otherwise a sequence of duplicate keys will be generated: choose numbers
+# in the range [1, $modulus - 1]. Choosing 1 means that
+# insertions (or deletions) will take place in order.
+# The purpose of using the prime modulus number is to generate a repeatable
+# sequence of unique keys that is (possibly) not in sorted order.
+#
+
+# Alternative, larger parameter set:
+# $modulus = 200003;
+# $factor_i = 100;
+# $factor_d = 301;
+
+$modulus = 6113;
+$factor_i = 1669;
+$factor_d = 2036;
+
+# One "a <key> <value>" line for every i in [1, $modulus - 1]; the key is
+# i scaled by $factor_i modulo $modulus, the value is i itself.
+foreach $i (1 .. $modulus - 1) {
+    printf("a %d %d\n", ($i * $factor_i) % $modulus, $i);
+}
+
+# One "d <key>" line for every i in the same range, with the keys visited
+# in the order given by $factor_d.
+foreach $i (1 .. $modulus - 1) {
+    printf("d %d\n", ($i * $factor_d) % $modulus);
+}
+
+# Trailing "t" and "q" commands for the test driver.
+print "t\nq\n"
diff --git a/libutil/kazlib/dict.c b/libutil/kazlib/dict.c
new file mode 100644
index 0000000..cd98498
--- /dev/null
+++ b/libutil/kazlib/dict.c
@@ -0,0 +1,1238 @@
+/*
+ * Dictionary Abstract Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#define NDEBUG
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <assert.h>
+#define DICT_IMPLEMENTATION
+#include "dict.h"
+
+// bpw 20050309 define this to use a qsort(3) compatible sort function,
+// requiring two dereferences to get the data instead of one.
+//
+#define BE_QSORT_COMPATIBLE
+
+/*
+ * These macros provide short convenient names for structure members,
+ * which are embellished with dict_ prefixes so that they are
+ * properly confined to the documented namespace. It's legal for a
+ * program which uses dict to define, for instance, a macro called ``parent''.
+ * Such a macro would interfere with the dnode_t struct definition.
+ * In general, highly portable and reusable C modules which expose their
+ * structures need to confine structure member names to well-defined spaces.
+ * The resulting identifiers aren't necessarily convenient to use, nor
+ * readable, in the implementation, however!
+ */
+
+#define left dict_left
+#define right dict_right
+#define parent dict_parent
+#define color dict_color
+#define key dict_key
+#define data dict_data
+
+#define nilnode dict_nilnode
+#define nodecount dict_nodecount
+#define maxcount dict_maxcount
+#define compare dict_compare
+#define allocnode dict_allocnode
+#define freenode dict_freenode
+#define context dict_context
+#define dupes dict_dupes
+
+#define dictptr dict_dictptr
+
+#define dict_root(D) ((D)->nilnode.left)
+#define dict_nil(D) (&(D)->nilnode)
+#define DICT_DEPTH_MAX 64
+
+static dnode_t *dnode_alloc(void *context);
+static void dnode_free(dnode_t *node, void *context);
+
+/*
+ * Left rotation about the node ``upper''. Its right child is lifted into
+ * upper's position and upper becomes that child's left child; the child's
+ * former left subtree is re-attached as upper's right subtree, so the
+ * left-to-right key order of the tree is unchanged.
+ */
+
+static void rotate_left(dnode_t *upper)
+{
+    dnode_t *pivot = upper->right;
+    dnode_t *inner = pivot->left;
+    dnode_t *above;
+
+    /* hand the pivot's left subtree over to upper */
+    upper->right = inner;
+    inner->parent = upper;
+
+    /* the pivot takes upper's place beneath upper's old parent */
+    above = upper->parent;
+    pivot->parent = above;
+
+    /* the root needs no special case: its parent is the nil sentinel,
+       and the sentinel's left link is the one that points at the root */
+
+    if (above->left == upper) {
+        above->left = pivot;
+    } else {
+        assert (above->right == upper);
+        above->right = pivot;
+    }
+
+    /* finally hang upper beneath the pivot */
+    pivot->left = upper;
+    upper->parent = pivot;
+}
+
+/*
+ * Right rotation about the node ``upper'': the mirror image of
+ * rotate_left. The left child is lifted into upper's position, upper
+ * becomes its right child, and the child's former right subtree becomes
+ * upper's left subtree.
+ */
+
+static void rotate_right(dnode_t *upper)
+{
+    dnode_t *pivot = upper->left;
+    dnode_t *inner = pivot->right;
+    dnode_t *above;
+
+    /* hand the pivot's right subtree over to upper */
+    upper->left = inner;
+    inner->parent = upper;
+
+    /* the pivot takes upper's place beneath upper's old parent */
+    above = upper->parent;
+    pivot->parent = above;
+
+    if (above->right == upper) {
+        above->right = pivot;
+    } else {
+        assert (above->left == upper);
+        above->left = pivot;
+    }
+
+    /* finally hang upper beneath the pivot */
+    pivot->right = upper;
+    upper->parent = pivot;
+}
+
+/*
+ * Release every node of the subtree rooted at ``node'' through the
+ * dictionary's freenode routine. Children are released before their
+ * parent (postorder). The sentinel ``nil'' marks the empty subtree.
+ * Called from dict_free_nodes().
+ */
+
+static void free_nodes(dict_t *dict, dnode_t *node, dnode_t *nil)
+{
+    if (node != nil) {
+        free_nodes(dict, node->left, nil);
+        free_nodes(dict, node->right, nil);
+        dict->freenode(node, dict->context);
+    }
+}
+
+/*
+ * Debugging aid: check that the dictionary is ordered as a binary search
+ * tree. The nodes are visited in order with dict_first()/dict_next() and
+ * each key is compared with its successor's. A key greater than its
+ * successor always fails the check; equal keys fail it only when the
+ * dictionary does not allow duplicates. Returns 1 if the order holds,
+ * otherwise 0.
+ */
+
+static int verify_bintree(dict_t *dict)
+{
+    dnode_t *cur = dict_first(dict);
+    dnode_t *succ;
+    const int strict = !dict->dupes;
+    int order;
+
+    while (cur && (succ = dict_next(dict, cur))) {
+#ifdef BE_QSORT_COMPATIBLE
+        order = dict->compare(&cur->key, &succ->key);
+#else
+        order = dict->compare(cur->key, succ->key);
+#endif
+        if (order > 0)
+            return 0;
+        if (strict && order == 0)
+            return 0;
+        cur = succ;
+    }
+
+    return 1;
+}
+
+
+/*
+ * Recursively check three red-black properties of the subtree rooted at
+ * ``root'': every node is either red or black, a red node has only black
+ * children, and the left and right subtrees of every node have the same
+ * black height. The sentinel itself is not colour-checked, since there is
+ * only one. The return value is the black height of the subtree (an empty
+ * subtree counts as 1), or zero if any property is violated.
+ */
+
+static unsigned int verify_redblack(dnode_t *nil, dnode_t *root)
+{
+    unsigned below_left, below_right;
+
+    if (root == nil)
+        return 1;
+
+    below_left = verify_redblack(nil, root->left);
+    below_right = verify_redblack(nil, root->right);
+
+    /* a failure below, or mismatched black heights, fails this subtree */
+    if (below_left == 0 || below_right == 0 || below_left != below_right)
+        return 0;
+
+    if (root->color == dnode_red) {
+        if (root->left->color != dnode_black
+                || root->right->color != dnode_black)
+            return 0;
+        /* a red node adds nothing to the black height */
+        return below_left;
+    }
+
+    if (root->color != dnode_black)
+        return 0;
+
+    return below_left + 1;
+}
+
+/*
+ * Count the nodes of the subtree rooted at ``root'' by visiting each one.
+ * The result can be compared with the stored node count to detect a
+ * mismatch.
+ */
+
+static dictcount_t verify_node_count(dnode_t *nil, dnode_t *root)
+{
+    dictcount_t total;
+
+    if (root == nil)
+        return 0;
+
+    total = verify_node_count(nil, root->left);
+    total += verify_node_count(nil, root->right);
+
+    return total + 1;
+}
+
+/*
+ * Debugging aid: search the subtree rooted at ``root'' for the node whose
+ * address is ``node''. Pointers are compared, not keys. Returns 1 if the
+ * node is present, otherwise 0.
+ */
+
+static int verify_dict_has_node(dnode_t *nil, dnode_t *root, dnode_t *node)
+{
+    if (root == nil)
+        return 0;
+    if (root == node)
+        return 1;
+    if (verify_dict_has_node(nil, root->left, node))
+        return 1;
+    return verify_dict_has_node(nil, root->right, node);
+}
+
+
+/*
+ * Allocate a dictionary with malloc() and initialize it as empty, with
+ * the default node allocator, no allocator context, the given capacity
+ * and comparison function, and duplicates disallowed. Returns a null
+ * pointer if the allocation fails.
+ */
+
+dict_t *dict_create(dictcount_t maxcount, dict_comp_t comp)
+{
+    dict_t *fresh = malloc(sizeof *fresh);
+    dnode_t *nil;
+
+    if (fresh == NULL)
+        return NULL;
+
+    fresh->compare = comp;
+    fresh->allocnode = dnode_alloc;
+    fresh->freenode = dnode_free;
+    fresh->context = NULL;
+    fresh->nodecount = 0;
+    fresh->maxcount = maxcount;
+
+    /* an empty tree: every link of the sentinel points at the sentinel */
+    nil = &fresh->nilnode;
+    nil->left = nil;
+    nil->right = nil;
+    nil->parent = nil;
+    nil->color = dnode_black;
+
+    fresh->dupes = 0;
+
+    return fresh;
+}
+
+/*
+ * Install a pair of node allocation/release routines together with the
+ * context pointer that is passed to them. A null routine selects the
+ * built-in default. The asserts require an empty dictionary and require
+ * that the two routines are either both given or both null.
+ */
+
+void dict_set_allocator(dict_t *dict, dnode_alloc_t al,
+        dnode_free_t fr, void *context)
+{
+    assert (dict_count(dict) == 0);
+    assert ((al == NULL && fr == NULL) || (al != NULL && fr != NULL));
+
+    if (al != NULL)
+        dict->allocnode = al;
+    else
+        dict->allocnode = dnode_alloc;
+
+    if (fr != NULL)
+        dict->freenode = fr;
+    else
+        dict->freenode = dnode_free;
+
+    dict->context = context;
+}
+
+/*
+ * Release a dictionary object with free(). The nodes are not touched, so
+ * the caller must have emptied the dictionary beforehand; this is checked
+ * only by an assert.
+ */
+
+void dict_destroy(dict_t *dict)
+{
+    assert (dict_isempty(dict));
+
+    free(dict);
+}
+
+/*
+ * Hand every node in the dictionary to the installed free routine and
+ * leave the dictionary empty: the node count is zeroed and the sentinel's
+ * left and right links point back at the sentinel.
+ */
+
+void dict_free_nodes(dict_t *dict)
+{
+    dnode_t *nil = dict_nil(dict);
+
+    free_nodes(dict, dict_root(dict), nil);
+
+    dict->nodecount = 0;
+    dict->nilnode.left = nil;
+    dict->nilnode.right = nil;
+}
+
+/*
+ * Obsolescent name for dict_free_nodes(). When KAZLIB_OBSOLESCENT_DEBUG
+ * is defined, an always-false assert flags the call.
+ */
+
+void dict_free(dict_t *dict)
+{
+#ifdef KAZLIB_OBSOLESCENT_DEBUG
+    assert ("call to obsolescent function dict_free()" && 0);
+#endif
+
+    dict_free_nodes(dict);
+}
+
+/*
+ * Initialize a dictionary object whose storage is supplied by the caller:
+ * empty, default node allocator, no allocator context, the given capacity
+ * and comparison function, duplicates disallowed. Returns ``dict''.
+ */
+
+dict_t *dict_init(dict_t *dict, dictcount_t maxcount, dict_comp_t comp)
+{
+    dnode_t *nil = &dict->nilnode;
+
+    dict->compare = comp;
+    dict->allocnode = dnode_alloc;
+    dict->freenode = dnode_free;
+    dict->context = NULL;
+    dict->nodecount = 0;
+    dict->maxcount = maxcount;
+
+    /* an empty tree: every link of the sentinel points at the sentinel */
+    nil->left = nil;
+    nil->right = nil;
+    nil->parent = nil;
+    nil->color = dnode_black;
+
+    dict->dupes = 0;
+
+    return dict;
+}
+
+/*
+ * Initialize ``dict'' as an empty dictionary that copies the comparison
+ * function, allocator routines, allocator context, capacity and
+ * duplicate-key setting of ``template''. No nodes are copied.
+ */
+
+void dict_init_like(dict_t *dict, const dict_t *template)
+{
+    dnode_t *nil = &dict->nilnode;
+
+    dict->compare = template->compare;
+    dict->allocnode = template->allocnode;
+    dict->freenode = template->freenode;
+    dict->context = template->context;
+    dict->nodecount = 0;
+    dict->maxcount = template->maxcount;
+
+    /* an empty tree: every link of the sentinel points at the sentinel */
+    nil->left = nil;
+    nil->right = nil;
+    nil->parent = nil;
+    nil->color = dnode_black;
+
+    dict->dupes = template->dupes;
+
+    assert (dict_similar(dict, template));
+}
+
+/*
+ * Forget all nodes without releasing them: the node count is zeroed and
+ * the sentinel's three links are pointed back at the sentinel. The nodes
+ * themselves are left untouched.
+ */
+
+static void dict_clear(dict_t *dict)
+{
+    dnode_t *nil = &dict->nilnode;
+
+    dict->nodecount = 0;
+
+    nil->left = nil;
+    nil->right = nil;
+    nil->parent = nil;
+
+    assert (nil->color == dnode_black);
+}
+
+
+/*
+ * Check the integrity of the dictionary structure. Intended for debugging
+ * and for use inside assert statements. Each failed check prints a
+ * one-line diagnostic on stderr and makes the function return 0; if every
+ * check passes the return value is 1. Passing does not prove the tree is
+ * free of corruption, and some corruptions may simply cause undefined
+ * behavior while checking.
+ */
+
+int dict_verify(dict_t *dict)
+{
+    dnode_t *nil = dict_nil(dict), *root = dict_root(dict);
+
+    /* the root and the sentinel must both be black */
+    if (root->color != dnode_black) {
+        fprintf(stderr, "dict_verify()-- Root node not black!\n");
+        return 0;
+    }
+    if (nil->color != dnode_black) {
+        fprintf(stderr, "dict_verify()-- Nil node not black!\n");
+        return 0;
+    }
+
+    /* the sentinel's right link must point back at the sentinel */
+    if (nil->right != nil) {
+        fprintf(stderr, "dict_verify()-- Nul->right not Nil!\n");
+        return 0;
+    }
+
+    /* nil->left is the root node; its parent pointer must be nil */
+    if (nil->left->parent != nil) {
+        fprintf(stderr, "dict_verify()-- Nul->left->parent is not Nil!\n");
+        return 0;
+    }
+
+    /* weak test that the tree is a binary search tree */
+    if (!verify_bintree(dict)) {
+        fprintf(stderr, "dict_verify()-- Not a binary search tree!\n");
+        return 0;
+    }
+
+    /* red-black properties */
+    if (!verify_redblack(nil, root)) {
+        fprintf(stderr, "dict_verify()-- Not a red-black tree!\n");
+        return 0;
+    }
+
+    /* the stored count must match the number of reachable nodes */
+    if (verify_node_count(nil, root) != dict_count(dict)) {
+        fprintf(stderr, "dict_verify()-- Node count is wrong!\n");
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Report whether two dictionaries are similar: same comparison function,
+ * same allocator and free routines, same allocator context, and the same
+ * setting for duplicate keys. Returns 1 if all five agree, otherwise 0.
+ */
+
+int dict_similar(const dict_t *left, const dict_t *right)
+{
+    return left->compare == right->compare
+        && left->allocnode == right->allocnode
+        && left->freenode == right->freenode
+        && left->context == right->context
+        && left->dupes == right->dupes;
+}
+
+/*
+ * Locate a node in the dictionary having the given key. If no node
+ * matches, a null pointer is returned (rather than a pointer to the
+ * dictionary's nil sentinel node). When duplicate keys are allowed, the
+ * leftmost matching node is returned.
+ */
+
+dnode_t *dict_lookup(dict_t *dict, const void *key)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *cur = dict_root(dict);
+    dnode_t *match;
+    int order;
+
+    /* ordinary binary search, extended for trees with duplicate keys */
+
+    while (cur != nil) {
+#ifdef BE_QSORT_COMPATIBLE
+        order = dict->compare(&key, &cur->key);
+#else
+        order = dict->compare(key, cur->key);
+#endif
+        if (order < 0) {
+            cur = cur->left;
+            continue;
+        }
+        if (order > 0) {
+            cur = cur->right;
+            continue;
+        }
+
+        /* exact match; without duplicates it is the only one */
+        if (!dict->dupes)
+            return cur;
+
+        /*
+         * There may be further matches in the left subtree. Step left,
+         * then slide right past every non-matching node; if another
+         * match turns up, remember it and repeat.
+         */
+        for (;;) {
+            match = cur;
+            cur = cur->left;
+#ifdef BE_QSORT_COMPATIBLE
+            while (cur != nil && dict->compare(&key, &cur->key) != 0)
+                cur = cur->right;
+#else
+            while (cur != nil && dict->compare(key, cur->key) != 0)
+                cur = cur->right;
+#endif
+            if (cur == nil)
+                return match;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Find the node with the lowest key that is equal to or greater than the
+ * given key. Returns a null pointer if no such node exists.
+ */
+
+dnode_t *dict_lower_bound(dict_t *dict, const void *key)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *cur = dict_root(dict);
+    dnode_t *candidate = 0;
+
+    while (cur != nil) {
+#ifdef BE_QSORT_COMPATIBLE
+        int order = dict->compare(&key, &cur->key);
+#else
+        int order = dict->compare(key, cur->key);
+#endif
+
+        /* this node's key is too small: only the right subtree can help */
+        if (order > 0) {
+            cur = cur->right;
+            continue;
+        }
+
+        /* an exact match is final unless duplicates may lie further left */
+        if (order == 0 && !dict->dupes)
+            return cur;
+
+        /* this node qualifies; look for a smaller one to its left */
+        candidate = cur;
+        cur = cur->left;
+    }
+
+    return candidate;
+}
+
+/*
+ * Find the node with the greatest key that is equal to or lower than the
+ * given key. Returns a null pointer if no such node exists.
+ */
+
+dnode_t *dict_upper_bound(dict_t *dict, const void *key)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *cur = dict_root(dict);
+    dnode_t *candidate = 0;
+
+    while (cur != nil) {
+#ifdef BE_QSORT_COMPATIBLE
+        int order = dict->compare(&key, &cur->key);
+#else
+        int order = dict->compare(key, cur->key);
+#endif
+
+        /* this node's key is too large: only the left subtree can help */
+        if (order < 0) {
+            cur = cur->left;
+            continue;
+        }
+
+        /* an exact match is final unless duplicates may lie further right */
+        if (order == 0 && !dict->dupes)
+            return cur;
+
+        /* this node qualifies; look for a larger one to its right */
+        candidate = cur;
+        cur = cur->right;
+    }
+
+    return candidate;
+}
+
+/*
+ * Insert a node into the dictionary under the given key. The node should
+ * have been initialized with a data field; its key, links and color are
+ * overwritten here. The node is first attached as a leaf by an ordinary
+ * binary-tree descent (equal keys descend to the right), then colored red,
+ * and the tree is recolored/rotated until no red node has a red parent.
+ * The behavior is undefined if the user attempts to insert into
+ * a dictionary that is already full (for which the dict_isfull()
+ * function returns true). Note that the asserts below are compiled out
+ * while NDEBUG is defined at the top of this file, so a duplicate key is
+ * not trapped in that configuration.
+ */
+
+void dict_insert(dict_t *dict, dnode_t *node, const void *key)
+{
+ dnode_t *where = dict_root(dict), *nil = dict_nil(dict);
+ dnode_t *parent = nil, *uncle, *grandpa;
+ int result = -1; /* negative, so an empty tree stores the node in nil->left, the root link */
+
+ node->key = key;
+
+ assert (!dict_isfull(dict));
+ assert (!dict_contains(dict, node));
+ assert (!dnode_is_in_a_dict(node));
+
+ /* basic binary tree insert */
+
+ while (where != nil) {
+ parent = where;
+#ifdef BE_QSORT_COMPATIBLE
+ result = dict->compare(&key, &where->key);
+#else
+ result = dict->compare(key, where->key);
+#endif
+ /* trap attempts at duplicate key insertion unless it's explicitly allowed */
+ assert (dict->dupes || result != 0);
+ if (result < 0)
+ where = where->left;
+ else
+ where = where->right;
+ }
+
+ assert (where == nil);
+
+ if (result < 0)
+ parent->left = node;
+ else
+ parent->right = node;
+
+ node->parent = parent;
+ node->left = nil;
+ node->right = nil;
+
+ dict->nodecount++;
+
+ /* red black adjustments */
+
+ node->color = dnode_red;
+
+ while (parent->color == dnode_red) {
+ grandpa = parent->parent;
+ if (parent == grandpa->left) {
+ uncle = grandpa->right;
+ if (uncle->color == dnode_red) { /* red parent, red uncle */
+ parent->color = dnode_black;
+ uncle->color = dnode_black;
+ grandpa->color = dnode_red;
+ node = grandpa; /* continue the check two levels up */
+ parent = grandpa->parent;
+ } else { /* red parent, black uncle */
+ if (node == parent->right) {
+ rotate_left(parent);
+ parent = node;
+ assert (grandpa == parent->parent);
+ /* rotation between parent and child preserves grandpa */
+ }
+ parent->color = dnode_black;
+ grandpa->color = dnode_red;
+ rotate_right(grandpa);
+ break;
+ }
+ } else { /* symmetric cases: parent == parent->parent->right */
+ uncle = grandpa->left;
+ if (uncle->color == dnode_red) {
+ parent->color = dnode_black;
+ uncle->color = dnode_black;
+ grandpa->color = dnode_red;
+ node = grandpa;
+ parent = grandpa->parent;
+ } else {
+ if (node == parent->left) {
+ rotate_right(parent);
+ parent = node;
+ assert (grandpa == parent->parent);
+ }
+ parent->color = dnode_black;
+ grandpa->color = dnode_red;
+ rotate_left(grandpa);
+ break;
+ }
+ }
+ }
+
+ dict_root(dict)->color = dnode_black; /* the recoloring above may have turned the root red */
+
+ assert (dict_verify(dict));
+}
+
+/*
+ * Delete the given node from the dictionary. If the given node does not belong
+ * to the given dictionary, undefined behavior results. The node itself is
+ * unlinked (its parent, left and right pointers are set to null) but not
+ * freed, the node count is decremented, and the red-black properties are
+ * restored if a black node was removed from the tree shape. A pointer to the
+ * deleted node is returned.
+ */
+
+dnode_t *dict_delete(dict_t *dict, dnode_t *delete)
+{
+ dnode_t *nil = dict_nil(dict), *child, *delparent = delete->parent;
+
+ /* basic deletion */
+
+ assert (!dict_isempty(dict));
+ assert (dict_contains(dict, delete));
+
+ /*
+ * If the node being deleted has two children, then we replace it with its
+ * successor (i.e. the leftmost node in the right subtree.) By doing this,
+ * we avoid the traditional algorithm under which the successor's key and
+ * value *only* move to the deleted node and the successor is spliced out
+ * from the tree. We cannot use this approach because the user may hold
+ * pointers to the successor, or nodes may be inextricably tied to some
+ * other structures by way of embedding, etc. So we must splice out the
+ * node we are given, not some other node, and must not move contents from
+ * one node to another behind the user's back.
+ */
+
+ if (delete->left != nil && delete->right != nil) {
+ dnode_t *next = dict_next(dict, delete);
+ dnode_t *nextparent = next->parent;
+ dnode_color_t nextcolor = next->color;
+
+ assert (next != nil);
+ assert (next->parent != nil);
+ assert (next->left == nil);
+
+ /*
+ * First, splice out the successor from the tree completely, by
+ * moving up its right child into its place.
+ */
+
+ child = next->right;
+ child->parent = nextparent;
+
+ if (nextparent->left == next) {
+ nextparent->left = child;
+ } else {
+ assert (nextparent->right == next);
+ nextparent->right = child;
+ }
+
+ /*
+ * Now that the successor has been extricated from the tree, install it
+ * in place of the node that we want deleted. The colors are swapped, so
+ * that ``delete'' ends up carrying the color of the position that was
+ * actually removed from the tree shape.
+ */
+
+ next->parent = delparent;
+ next->left = delete->left;
+ next->right = delete->right;
+ next->left->parent = next;
+ next->right->parent = next;
+ next->color = delete->color;
+ delete->color = nextcolor;
+
+ if (delparent->left == delete) {
+ delparent->left = next;
+ } else {
+ assert (delparent->right == delete);
+ delparent->right = next;
+ }
+
+ } else {
+ assert (delete != nil);
+ assert (delete->left == nil || delete->right == nil);
+
+ child = (delete->left != nil) ? delete->left : delete->right;
+
+ child->parent = delparent = delete->parent;
+
+ if (delete == delparent->left) {
+ delparent->left = child;
+ } else {
+ assert (delete == delparent->right);
+ delparent->right = child;
+ }
+ }
+
+ delete->parent = NULL;
+ delete->right = NULL;
+ delete->left = NULL;
+
+ dict->nodecount--;
+
+ assert (verify_bintree(dict));
+
+ /* red-black adjustments */
+
+ if (delete->color == dnode_black) {
+ dnode_t *parent, *sister;
+
+ dict_root(dict)->color = dnode_red; /* temporarily red, so the loop below stops if it reaches the root */
+
+ while (child->color == dnode_black) {
+ parent = child->parent;
+ if (child == parent->left) {
+ sister = parent->right;
+ assert (sister != nil);
+ if (sister->color == dnode_red) {
+ sister->color = dnode_black;
+ parent->color = dnode_red;
+ rotate_left(parent);
+ sister = parent->right;
+ assert (sister != nil);
+ }
+ if (sister->left->color == dnode_black
+ && sister->right->color == dnode_black) {
+ sister->color = dnode_red;
+ child = parent;
+ } else {
+ if (sister->right->color == dnode_black) {
+ assert (sister->left->color == dnode_red);
+ sister->left->color = dnode_black;
+ sister->color = dnode_red;
+ rotate_right(sister);
+ sister = parent->right;
+ assert (sister != nil);
+ }
+ sister->color = parent->color;
+ sister->right->color = dnode_black;
+ parent->color = dnode_black;
+ rotate_left(parent);
+ break;
+ }
+ } else { /* symmetric case: child == child->parent->right */
+ assert (child == parent->right);
+ sister = parent->left;
+ assert (sister != nil);
+ if (sister->color == dnode_red) {
+ sister->color = dnode_black;
+ parent->color = dnode_red;
+ rotate_right(parent);
+ sister = parent->left;
+ assert (sister != nil);
+ }
+ if (sister->right->color == dnode_black
+ && sister->left->color == dnode_black) {
+ sister->color = dnode_red;
+ child = parent;
+ } else {
+ if (sister->left->color == dnode_black) {
+ assert (sister->right->color == dnode_red);
+ sister->right->color = dnode_black;
+ sister->color = dnode_red;
+ rotate_left(sister);
+ sister = parent->left;
+ assert (sister != nil);
+ }
+ sister->color = parent->color;
+ sister->left->color = dnode_black;
+ parent->color = dnode_black;
+ rotate_right(parent);
+ break;
+ }
+ }
+ }
+
+ child->color = dnode_black;
+ dict_root(dict)->color = dnode_black; /* undo the temporary red root */
+ }
+
+ assert (dict_verify(dict));
+
+ return delete;
+}
+
+/*
+ * Obtain a node from the dictionary's allocator routine, attach the data
+ * item to it and insert it under the given key. Returns 1 on success, or
+ * 0 if the allocator returned a null pointer (nothing is inserted then).
+ */
+
+int dict_alloc_insert(dict_t *dict, const void *key, void *data)
+{
+    dnode_t *fresh = dict->allocnode(dict->context);
+
+    if (fresh == NULL)
+        return 0;
+
+    dnode_init(fresh, data);
+    dict_insert(dict, fresh, key);
+
+    return 1;
+}
+
+/*
+ * Remove the node from the dictionary with dict_delete(), then hand it to
+ * the dictionary's free routine.
+ */
+
+void dict_delete_free(dict_t *dict, dnode_t *node)
+{
+    dnode_t *removed = dict_delete(dict, node);
+
+    (void) removed;    /* dict_delete() returns the node it was given */
+
+    dict->freenode(node, dict->context);
+}
+
+/*
+ * Return the leftmost node, i.e. the one with the lowest key. A null
+ * pointer is returned if the dictionary has no nodes.
+ */
+
+dnode_t *dict_first(dict_t *dict)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *cur = dict_root(dict);
+
+    if (cur == nil)
+        return NULL;
+
+    while (cur->left != nil)
+        cur = cur->left;
+
+    return cur;
+}
+
+/*
+ * Return the rightmost node, i.e. the one with the highest key. A null
+ * pointer is returned if the dictionary has no nodes.
+ */
+
+dnode_t *dict_last(dict_t *dict)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *cur = dict_root(dict);
+
+    if (cur == nil)
+        return NULL;
+
+    while (cur->right != nil)
+        cur = cur->right;
+
+    return cur;
+}
+
+/*
+ * Return the given node's successor node---the node which has the
+ * next key in the left to right ordering. If the node has no successor,
+ * a null pointer is returned rather than a pointer to the nil node.
+ */
+
+dnode_t *dict_next(dict_t *dict, dnode_t *curr)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *up;
+
+    /* with a right subtree, the successor is that subtree's leftmost node */
+    if (curr->right != nil) {
+        curr = curr->right;
+        while (curr->left != nil)
+            curr = curr->left;
+        return curr;
+    }
+
+    /* otherwise climb until we arrive at an ancestor from its left side */
+    for (up = curr->parent; up != nil && curr == up->right; up = curr->parent)
+        curr = up;
+
+    if (up == nil)
+        return NULL;
+
+    return up;
+}
+
+/*
+ * Return the given node's predecessor, in the key order. If the node has
+ * no predecessor, a null pointer is returned rather than a pointer to the
+ * nil node.
+ */
+
+dnode_t *dict_prev(dict_t *dict, dnode_t *curr)
+{
+    dnode_t *nil = dict_nil(dict);
+    dnode_t *up;
+
+    /* with a left subtree, the predecessor is that subtree's rightmost node */
+    if (curr->left != nil) {
+        curr = curr->left;
+        while (curr->right != nil)
+            curr = curr->right;
+        return curr;
+    }
+
+    /* otherwise climb until we arrive at an ancestor from its right side */
+    for (up = curr->parent; up != nil && curr == up->left; up = curr->parent)
+        curr = up;
+
+    if (up == nil)
+        return NULL;
+
+    return up;
+}
+
+/*
+ * Permit duplicate keys in the dictionary by setting its dupes flag.
+ */
+
+void dict_allow_dupes(dict_t *dict)
+{
+    dict->dupes = 1;
+}
+/* Remove the macro forms so that functions with the same names can be defined below. */
+#undef dict_count
+#undef dict_isempty
+#undef dict_isfull
+#undef dnode_get
+#undef dnode_put
+#undef dnode_getkey
+/* Return the number of nodes currently recorded in the dictionary. */
+dictcount_t dict_count(dict_t *dict)
+{
+ return dict->nodecount;
+}
+/* Return 1 when the dictionary's node count is zero, 0 otherwise. */
+int dict_isempty(dict_t *dict)
+{
+ return dict->nodecount == 0;
+}
+/* Return 1 when the node count equals the dictionary's maxcount, 0 otherwise. */
+int dict_isfull(dict_t *dict)
+{
+ return dict->nodecount == dict->maxcount;
+}
+/* Return the result of verify_dict_has_node() for node, given this dictionary's nil sentinel and root. */
+int dict_contains(dict_t *dict, dnode_t *node)
+{
+ return verify_dict_has_node(dict_nil(dict), dict_root(dict), node);
+}
+/* malloc-based node allocator; the context argument is not used. */
+static dnode_t *dnode_alloc(void *context)
+{
+ return malloc(sizeof *dnode_alloc(NULL)); /* sizeof does not evaluate the call; it only yields the size of one node */
+}
+/* free()-based counterpart of dnode_alloc; the context argument is not used. */
+static void dnode_free(dnode_t *node, void *context)
+{
+ free(node);
+}
+/* Allocate a node with malloc and store data in it; returns NULL if malloc fails. */
+dnode_t *dnode_create(void *data)
+{
+ dnode_t *new = malloc(sizeof *new);
+ if (new) {
+ new->data = data;
+ new->parent = NULL; /* null links are what dnode_is_in_a_dict() reports as "not in a dictionary" */
+ new->left = NULL;
+ new->right = NULL;
+ }
+ return new;
+}
+/* Initialize caller-provided node storage: store data, null the three tree links, return the node. */
+dnode_t *dnode_init(dnode_t *dnode, void *data)
+{
+ dnode->data = data;
+ dnode->parent = NULL;
+ dnode->left = NULL;
+ dnode->right = NULL;
+ return dnode;
+}
+/* free() a node; asserts first that the node is not currently in a dictionary. */
+void dnode_destroy(dnode_t *dnode)
+{
+ assert (!dnode_is_in_a_dict(dnode));
+ free(dnode);
+}
+/* Return the data pointer stored in the node. */
+void *dnode_get(dnode_t *dnode)
+{
+ return dnode->data;
+}
+/* Return the key pointer stored in the node. */
+const void *dnode_getkey(dnode_t *dnode)
+{
+ return dnode->key;
+}
+/* Replace the data pointer stored in the node. */
+void dnode_put(dnode_t *dnode, void *data)
+{
+ dnode->data = data;
+}
+/* Return 1 only if the node's parent, left and right links are all non-null, else 0. */
+int dnode_is_in_a_dict(dnode_t *dnode)
+{
+ return (dnode->parent && dnode->left && dnode->right);
+}
+/* Call function(dict, node, context) on every node, in dict_first/dict_next order. */
+void dict_process(dict_t *dict, void *context, dnode_process_t function)
+{
+ dnode_t *node = dict_first(dict), *next;
+
+ while (node != NULL) {
+ /* check for callback function deleting */
+ /* the next node from under us */
+ assert (dict_contains(dict, node));
+ next = dict_next(dict, node); /* successor is fetched before the callback runs on node */
+ function(dict, node, context);
+ node = next;
+ }
+}
+/* Bind the load context to dict and point the load's nil node at itself (an empty pending list). */
+static void load_begin_internal(dict_load_t *load, dict_t *dict)
+{
+ load->dictptr = dict;
+ load->nilnode.left = &load->nilnode;
+ load->nilnode.right = &load->nilnode;
+}
+/* Start a bulk load into dict; asserts that dict is empty. */
+void dict_load_begin(dict_load_t *load, dict_t *dict)
+{
+ assert (dict_isempty(dict));
+ load_begin_internal(load, dict);
+}
+/* Append newnode, carrying key, to the load's pending list and count it in the dictionary. */
+void dict_load_next(dict_load_t *load, dnode_t *newnode, const void *key)
+{
+ dict_t *dict = load->dictptr;
+ dnode_t *nil = &load->nilnode;
+
+ assert (!dnode_is_in_a_dict(newnode));
+ assert (dict->nodecount < dict->maxcount);
+
+#ifndef NDEBUG
+ if (dict->nodecount > 0) { /* NOTE(review): nil->left is the FIRST node loaded, not the most recent (nil->right); confirm the ordering check is meant to be this loose */
+#ifdef BE_QSORT_COMPATIBLE
+ if (dict->dupes)
+ assert (dict->compare(&nil->left->key, &key) <= 0);
+ else
+ assert (dict->compare(&nil->left->key, &key) < 0);
+#else
+ if (dict->dupes)
+ assert (dict->compare(nil->left->key, key) <= 0);
+ else
+ assert (dict->compare(nil->left->key, key) < 0);
+#endif
+ }
+#endif
+
+ newnode->key = key;
+ nil->right->left = newnode; /* pending nodes are chained through left: previous tail (or nil itself) -> newnode */
+ nil->right = newnode; /* nil->right always names the tail */
+ newnode->left = nil; /* the chain ends at the load's nil node */
+ dict->nodecount++;
+}
+/* Finish a bulk load: link the nodes queued by dict_load_next into a tree and install it as the dictionary's root. */
+void dict_load_end(dict_load_t *load)
+{
+ dict_t *dict = load->dictptr;
+ dnode_t *tree[DICT_DEPTH_MAX] = { 0 };
+ dnode_t *curr, *dictnil = dict_nil(dict), *loadnil = &load->nilnode, *next;
+ dnode_t *complete = 0;
+ dictcount_t fullcount = DICTCOUNT_T_MAX, nodecount = dict->nodecount;
+ dictcount_t botrowcount;
+ unsigned baselevel = 0, level = 0, i;
+
+ assert (dnode_red == 0 && dnode_black == 1); /* the color assignments below store level % 2 directly as a color */
+
+ while (fullcount >= nodecount && fullcount) /* shift the all-ones value right until it is below nodecount (or zero) */
+ fullcount >>= 1;
+
+ botrowcount = nodecount - fullcount; /* surplus of nodecount over fullcount */
+
+ for (curr = loadnil->left; curr != loadnil; curr = next) { /* visit the queued nodes in load order */
+ next = curr->left; /* saved first: curr->left is overwritten below */
+
+ if (complete == NULL && botrowcount-- == 0) {
+ assert (baselevel == 0);
+ assert (level == 0);
+ baselevel = level = 1;
+ complete = tree[0];
+
+ if (complete != 0) {
+ tree[0] = 0;
+ complete->right = dictnil;
+ while (tree[level] != 0) {
+ tree[level]->right = complete;
+ complete->parent = tree[level];
+ complete = tree[level];
+ tree[level++] = 0;
+ }
+ }
+ }
+
+ if (complete == NULL) {
+ curr->left = dictnil;
+ curr->right = dictnil;
+ curr->color = level % 2;
+ complete = curr;
+
+ assert (level == baselevel);
+ while (tree[level] != 0) {
+ tree[level]->right = complete;
+ complete->parent = tree[level];
+ complete = tree[level];
+ tree[level++] = 0;
+ }
+ } else {
+ curr->left = complete;
+ curr->color = (level + 1) % 2;
+ complete->parent = curr;
+ tree[level] = curr;
+ complete = 0;
+ level = baselevel;
+ }
+ }
+
+ if (complete == NULL)
+ complete = dictnil;
+
+ for (i = 0; i < DICT_DEPTH_MAX; i++) { /* attach whatever is still parked in tree[], lowest index first */
+ if (tree[i] != 0) {
+ tree[i]->right = complete;
+ complete->parent = tree[i];
+ complete = tree[i];
+ }
+ }
+
+ dictnil->color = dnode_black;
+ dictnil->right = dictnil;
+ complete->parent = dictnil; /* the root's parent is the dictionary's nil sentinel */
+ complete->color = dnode_black;
+ dict_root(dict) = complete;
+
+ assert (dict_verify(dict));
+}
+/* Re-load dest from the nodes of dest and source taken in key order, then call dict_clear(source); returns at once if source == dest. */
+void dict_merge(dict_t *dest, dict_t *source)
+{
+ dict_load_t load;
+ dnode_t *leftnode = dict_first(dest), *rightnode = dict_first(source);
+
+ assert (dict_similar(dest, source));
+
+ if (source == dest)
+ return;
+
+ dest->nodecount = 0; /* dict_load_next counts every node again as it is appended */
+ load_begin_internal(&load, dest);
+
+ for (;;) {
+ if (leftnode != NULL && rightnode != NULL) { /* both remain: dest's node goes first only if its key compares lower */
+#ifdef BE_QSORT_COMPATIBLE
+ if (dest->compare(&leftnode->key, &rightnode->key) < 0)
+ goto copyleft;
+ else
+ goto copyright;
+#else
+ if (dest->compare(leftnode->key, rightnode->key) < 0)
+ goto copyleft;
+ else
+ goto copyright;
+#endif
+ } else if (leftnode != NULL) {
+ goto copyleft;
+ } else if (rightnode != NULL) {
+ goto copyright;
+ } else {
+ assert (leftnode == NULL && rightnode == NULL);
+ break;
+ }
+
+ copyleft:
+ {
+ dnode_t *next = dict_next(dest, leftnode); /* successor is fetched before the node's links are rewritten */
+ #ifndef NDEBUG
+ leftnode->left = NULL; /* suppress assertion in dict_load_next */
+ #endif
+ dict_load_next(&load, leftnode, leftnode->key);
+ leftnode = next;
+ continue;
+ }
+
+ copyright:
+ {
+ dnode_t *next = dict_next(source, rightnode); /* successor is fetched before the node's links are rewritten */
+#ifndef NDEBUG
+ rightnode->left = NULL;
+#endif
+ dict_load_next(&load, rightnode, rightnode->key);
+ rightnode = next;
+ continue;
+ }
+ }
+
+ dict_clear(source);
+ dict_load_end(&load);
+}
diff --git a/libutil/kazlib/dict.h b/libutil/kazlib/dict.h
new file mode 100644
index 0000000..2bab634
--- /dev/null
+++ b/libutil/kazlib/dict.h
@@ -0,0 +1,142 @@
+/*
+ * Dictionary Abstract Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#ifndef DICT_H
+#define DICT_H
+
+#include <limits.h>
+#ifdef KAZLIB_SIDEEFFECT_DEBUG
+#include "sfx.h"
+#endif
+
+/*
+ * Blurb for inclusion into C++ translation units
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Node-count type and its largest value. */
+typedef unsigned long dictcount_t;
+#define DICTCOUNT_T_MAX ULONG_MAX
+
+/*
+ * The dictionary is implemented as a red-black tree
+ */
+
+typedef enum { dnode_red, dnode_black } dnode_color_t; /* red is 0 and black is 1; dict_load_end asserts exactly that */
+/* Tree node. With KAZLIB_OPAQUE_DEBUG defined and DICT_IMPLEMENTATION not, only a dummy member is declared. */
+typedef struct dnode_t {
+#if defined(DICT_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ struct dnode_t *dict_left;
+ struct dnode_t *dict_right;
+ struct dnode_t *dict_parent;
+ dnode_color_t dict_color;
+ const void *dict_key; /* what the dnode_getkey macro yields */
+ void *dict_data; /* read by the dnode_get macro, assigned by dnode_put */
+#else
+ int dict_dummy;
+#endif
+} dnode_t;
+/* Callback types: key comparison, node allocation, node release. */
+typedef int (*dict_comp_t)(const void *, const void *);
+typedef dnode_t *(*dnode_alloc_t)(void *);
+typedef void (*dnode_free_t)(dnode_t *, void *);
+/* Dictionary object. With KAZLIB_OPAQUE_DEBUG defined and DICT_IMPLEMENTATION not, only a dummy member is declared. */
+typedef struct dict_t {
+#if defined(DICT_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ dnode_t dict_nilnode;
+ dictcount_t dict_nodecount; /* what the dict_count macro yields */
+ dictcount_t dict_maxcount; /* dict_isfull compares dict_nodecount against this */
+ dict_comp_t dict_compare;
+ dnode_alloc_t dict_allocnode;
+ dnode_free_t dict_freenode;
+ void *dict_context;
+ int dict_dupes;
+#else
+ int dict_dummmy;
+#endif
+} dict_t;
+/* Per-node callback type taken by dict_process. */
+typedef void (*dnode_process_t)(dict_t *, dnode_t *, void *);
+/* State carried through the dict_load_begin / dict_load_next / dict_load_end sequence. */
+typedef struct dict_load_t {
+#if defined(DICT_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ dict_t *dict_dictptr;
+ dnode_t dict_nilnode;
+#else
+ int dict_dummmy;
+#endif
+} dict_load_t;
+/* Function interface. Parameter names are intentionally left out of these prototypes. */
+extern dict_t *dict_create(dictcount_t, dict_comp_t);
+extern void dict_set_allocator(dict_t *, dnode_alloc_t, dnode_free_t, void *);
+extern void dict_destroy(dict_t *);
+extern void dict_free_nodes(dict_t *);
+extern void dict_free(dict_t *);
+extern dict_t *dict_init(dict_t *, dictcount_t, dict_comp_t);
+extern void dict_init_like(dict_t *, const dict_t *);
+extern int dict_verify(dict_t *);
+extern int dict_similar(const dict_t *, const dict_t *);
+extern dnode_t *dict_lookup(dict_t *, const void *);
+extern dnode_t *dict_lower_bound(dict_t *, const void *);
+extern dnode_t *dict_upper_bound(dict_t *, const void *);
+extern void dict_insert(dict_t *, dnode_t *, const void *);
+extern dnode_t *dict_delete(dict_t *, dnode_t *);
+extern int dict_alloc_insert(dict_t *, const void *, void *);
+extern void dict_delete_free(dict_t *, dnode_t *);
+extern dnode_t *dict_first(dict_t *);
+extern dnode_t *dict_last(dict_t *);
+extern dnode_t *dict_next(dict_t *, dnode_t *);
+extern dnode_t *dict_prev(dict_t *, dnode_t *);
+extern dictcount_t dict_count(dict_t *);
+extern int dict_isempty(dict_t *);
+extern int dict_isfull(dict_t *);
+extern int dict_contains(dict_t *, dnode_t *);
+extern void dict_allow_dupes(dict_t *);
+extern int dnode_is_in_a_dict(dnode_t *);
+extern dnode_t *dnode_create(void *);
+extern dnode_t *dnode_init(dnode_t *, void *);
+extern void dnode_destroy(dnode_t *);
+extern void *dnode_get(dnode_t *);
+extern const void *dnode_getkey(dnode_t *);
+extern void dnode_put(dnode_t *, void *);
+extern void dict_process(dict_t *, void *, dnode_process_t);
+extern void dict_load_begin(dict_load_t *, dict_t *);
+extern void dict_load_next(dict_load_t *, dnode_t *, const void *);
+extern void dict_load_end(dict_load_t *);
+extern void dict_merge(dict_t *, dict_t *);
+/* Macro forms of the simple accessors. dict_isfull expands its argument twice, so pass it no side effects. */
+#if defined(DICT_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+#ifdef KAZLIB_SIDEEFFECT_DEBUG
+#define dict_isfull(D) (SFX_CHECK(D)->dict_nodecount == (D)->dict_maxcount)
+#else
+#define dict_isfull(D) ((D)->dict_nodecount == (D)->dict_maxcount)
+#endif
+#define dict_count(D) ((D)->dict_nodecount)
+#define dict_isempty(D) ((D)->dict_nodecount == 0)
+#define dnode_get(N) ((N)->dict_data)
+#define dnode_getkey(N) ((N)->dict_key)
+#define dnode_put(N, X) ((N)->dict_data = (X))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libutil/kazlib/docs/CHANGES b/libutil/kazlib/docs/CHANGES
new file mode 100644
index 0000000..3c949eb
--- /dev/null
+++ b/libutil/kazlib/docs/CHANGES
@@ -0,0 +1,290 @@
+New in 1.20
+
+ 1. Bugfix in except.h. Modified non-volatile auto variables were
+ being accessed after longjmp.
+
+New in 1.19
+
+ 1. Rewrite of broken dict_free.
+ 2. Fixed embarrassing build breakages that accidentally went into 1.18
+ 3. Function hash_scan_delete_free renamed to hash_scan_delfree to be
+ distinct from hash_scan_delete in the first 14 characters.
+ 4. To resolve inconsistencies between hash_free and dict_free,
+ and a difference between the actual behavior of hash_free and
+ the documented behavior, these two functions are marked obsolescent.
+ The functions dict_free_nodes and hash_free_nodes are provided.
+ The obsolescent functions continue to work as before, for now.
+ 5. Documentation of hash_free is fixed to say that it also subjects
+ the hash to hash_destroy, which is what the implementation does.
+ 6. Documentation states what release it is for.
+
+New in 1.18
+
+ 1. Error in assert expression in list_merge fixed.
+ 2. Semantics of list_merge extended to allow list to be merged
+ onto itself (which is a noop).
+ 3. Clarified interface specification of list_transfer and list_extract;
+ the source and destination list may be the same object.
+ 4. New functions:
+ dict_init_like: create a dictionary similar to another one;
+ dict_similar: determine whether two dictionaries are similar;
+ dict_merge: merge contents of one dictionary to another.
+ 5. Dictionary test main can juggle multiple dictionaries, and test
+ dict_merge.
+ 6. If a hash node is inserted into some hash, it is now a constraint
+ violation to insert it again into some hash.
+ 7. The hash_scan_delete_free function has been implemented; it is to
+ hash_scan_delete what hash_delete_free is to hash_delete.
+
+New in 1.17
+
+ Carl van Tast <vanTast at netway.at>:
+ 1. Removed references to ``safe malloc'' from some comments.
+ 2. Swapped ``allowed'' and ``not allowed'' in comment to
+ verify_bintree.
+ 3. Fixed comment to list_next: this function never returns the
+ sentinel.
+ 4. lnode_pool_init: nodes[i].prev = nodes instead of nodes + 1. This
+ saves one or two CPU cycles :-) and it gives a valid address even
+ if we have a (somewhat pathological) pool with just one element.
+
+ Kaz:
+ 5. Dropped extra parameter from tree rotation functions in dict.c. Should
+ shave a few cycles.
+ 6. Fixed error in the duplicate key iteration idiom example in the
+ documentation (see the section on dict_upper_bound).
+ 7. Forgotten #include <string.h> added to hash.c
+
+New in 1.16
+
+ 1. Added an interface for loading the contents of a dictionary from an
+ ordered sequence. This is done in O(n) time by a direct bottom-up
+ construction of the red-black tree, making it much faster than
+ the O(n log n) process of inserting each element.
+ 2. Miscellaneous cleanup: missing const qualifiers were added
+ to key pointer parameters, some incorrect comments fixed;
+ spelling errors corrected in documentation.
+
+New in 1.15
+
+ 1. Another potential exception handling memory leak fixed. This one
+ has to do with throwing an exception from within a try-catch region
+ in which an exception was just caught. The new exception replaces
+ the old without the old's dynamic memory being disposed of.
+ 2. Restrictions added on except_rethrow.
+ 3. Exception module must now be explicitly initialized with except_init.
+ 4. Structure members in exception header renamed to adhere to documented
+ namespace.
+ 5. The exwrap.[ch] source files are gone. There is support for memory
+ allocation with exception handling in except.c, which supports user
+ defined allocators.
+ 6. Three bugfixes to sfx parser. First, unary operators take a cast
+ expression, not a unary expression. Secondly, sizeof doesn't throw a syntax
+ error anymore on things that look like casts, but maybe are not.
+ Thirdly, empty parentheses weren't handled right in treatment of
+ ambiguous expressions, e.g. (a)() was declared a syntax error.
+ 7. Changed the representation of hash table chains. They are now
+ singly linked lists, which means that the overhead of managing
+ back pointers is gone. Only deletion is slightly more complicated
+ now because it has to search from the beginning of the chain.
+ [Rationale: this is okay, since chains are supposed to be short
+ in a hash table!]
+ 8. Rewritten test main() in list.c. It's now more like the others
+ with a menu. Previously it was essentially a file sorting program.
+ 9. New function: list_find. Exhaustively searches the list for a
+ matching entry, returns pointer to node if found.
+
+New in 1.14
+
+ 1. Got rid of some overbearing copyright restrictions. There is no need for
+ executables to contain copyright notices. In fact, there are no
+ restrictions on the use, or distribution in executable form.
+ 2. Tiny tweak in red-black fixup code of dict_insert.
+ 3. Keys in hash and dict are declared const void * now in all functions
+ rather than plain void *. This means that casts are no longer
+ necessary when calling insert or lookup functions with const
+ data as the key. But casts of the return value of hnode_getkey
+ or dnode_getkey may be required.
+ 4. Fixed compile breakage of except.c when posix thread support enabled.
+ 5. Side effect assertion interface now performs caching, to avoid
+ parsing the same expressions over and over again. Thus debugging with
+ KAZLIB_SIDEEFFECT_DEBUG incurs a smaller performance hit.
+ 6. Major bugfix to sfx expression parser. The function dealing with
+ disambiguating casts had to be rewritten to do more sophisticated
+ lookahead and backtracking. It all started when Mark Brady discovered
+ that (a++)+b was being incorrectly diagnosed as a syntax error.
+ 7. Added documentation. more examples for uses of dictionaries, and
+ exception handling. Some documentation about the internals
+ of exception handling added. Changed document format for narrower
+ margins, reducing page count and increasing readability.
+ 8. Bugfix in except_rethrow. It was freeing the dynamic data of the
+ exception even though it's not handled yet.
+
+New in 1.13
+
+ 1. Fixed some potential memory leaks in except.c.
+ 2. Finished all interface documentation. All that is left now
+ is to flesh out the implementation notes.
+ 3. Fixed a bug in POSIX threaded variant of except.c. Null
+ function pointer dereference in unhandled exception case.
+ 4. Macros beginning with E[A-Z] have been renamed to stay out
+ of space reserved for <errno.h>.
+ 5. Identifiers in exwrap.[ch] have been renamed from having
+ ex_ prefixed to having exwrap_ prefixes.
+
+New in 1.12
+
+ 1. COOL! New module for detecting side effects in C expressions.
+ 2. Serious bugfix in hash_init(). The computation of the initial hash
+ mask was completely botched up. Historically this code has seen little
+ testing because hashing over a user supplied table is not extendible.
+ Users of hash_create() are not affected.
+ 3. Tried to make computation of hash_val_t_bit more threadsafe. It should
+ be okay if writes to int objects are atomic, and concurrent writes of
+ the same int value to a given object are safe.
+ 4. Makefile renamed to Makefile.gcc. Makefile.vc added. The rename
+ is retroactive to all prior releases.
+ 5. OPAQUE_DEBUG becomes KAZLIB_OPAQUE_DEBUG and TEST_MAIN becomes
+ KAZLIB_TEST_MAIN. In general, macros that affect how the modules
+ build should be confined to a special namespace.
+ 6. New KAZLIB_SIDEEFFECT_DEBUG feature to enable diagnosis of side
+ effect expressions being passed to macros that evaluate their arguments
+ more than once.
+
+New in 1.11
+
+ 1. Improvements in experimental exception handling module:
+ except_throwf has been added which takes printf-like arguments;
+ except_checked_cleanup_pop has been added to provide a measure
+ of safety; there is now a way to pass arbitrary data from the throw site
+ to the catch.
+ 2. Improvements in dict_insert. A redundant call to the comparison function
+ has been eliminated, resulting in one fewer comparisons per insert
+ operation! Also a redundant test has been removed from the controlling
+ expression of the fixup loop, taking advantage of the fact that nil
+ is always black, and hence the root node always has a black parent.
+ 3. Small change in dict_delete. A test in the fixup loop has been eliminated
+ by temporarily coloring the root node red. See comment and diff between
+ dict.c revision 1.25 and 1.26.
+ 4. Test program blast.pl deletes keys out of order; to get in order
+ delete, initialize $factor_d to 1.
+
+New in 1.10
+
+ 1. The dict_init function now correctly initializes allocator-related
+ members of the dict structure.
+ 2. Tiny optimization in dict_lookup---less frequent cases tested last.
+ 3. Added list_extract, for extracting list slices (more general than
+ list_transfer).
+ 4. Incorporated changes from Loic Dachary: hash_free() has been
+ added for deleting all nodes; hash and compare functions
+ from the hash.c test code are now available to the user as
+ defaults if null pointers are given to hash_init() or
+ hash_create(); and hash_set_allocator restores the default
+ allocator routines if null pointers are given to it.
+ 5. Changes to dict analogous to hash: dict_free() added, etc.
+ 6. New exception handling module added (experimental).
+ 7. Much new documentation.
+
+New in 1.9
+
+ 1. Third argument of list_transfer may be null, in which case no nodes
+ are transferred. [Rationale: allows empty source list to be treated
+ without special case testing when all nodes are being transferred.]
+ 2. Two new functions added to dict: dict_upper_bound and dict_lower_bound.
+ These allow for inexact and range searches.
+
+New in 1.8
+
+ 1. New improved hashing function in the hash.c test code. It turns out that
+ when I changed the hash table algorithm, the blast.pl testcase was
+ hashing all to a single chain due to the pathologically bad hashing
+ function. The new hashing function should be good enough for general use.
+ It uses each nybble of the key to index a table of 16 random 32 bit integers.
+ These integers are XOR-ed into the hash value which is rotated after each
+ XOR.
+ 2. Spurious semicolon removed from the #define of HASH_VAL_T_BIT.
+ 3. I fixed some incorrect comments in hash.c which still talked about the
+ old algorithm from release 1.5 and older.
+ 4. The smalloc.c module is no longer supported. It's still in RCS but it's not
+ tagged as being part of release 1.8, and is not used by any of the other
+ sources. The standard library memory allocation functions are now used
+ directly. [Rationale: smalloc.c is overkill and interferes with
+ integration of the other source files into projects. Conscientious programmers
+ already have their own tools for debugging allocator corruption, anyway.]
+
+New in 1.7
+
+ 1. Missing #include <stdlib.h> added to smalloc.h
+ 2. The dict_delete() functions internals have been changed to make it much
+ more sane. This function no longer has the potential to return a node
+ other than the one that is passed to it.
+ 3. The changes to dict_delete() also fix a serious bug in dict_process().
+ The dict_process computes a pointer to a node's successor before
+ invoking the user callback to process a node. If the user callback calls
+ dict_delete() on the node, under the old dict_delete() semantics it was
+ possible for the successor to get deleted instead. Thus dict_process()
+ could end up with an invalid pointer.
+ 4. The changes to dict_delete() also mean that key and value information will
+ never be relocated from one node to another. User code can now rely on this
+ convenient assumption.
+
+New in 1.6
+
+ 1. The extendible hashing algorithm internals have changed. This
+ has a potential impact on the behavior with respect to hashing functions
+ which were written to work well specifically with the old hashing
+ scheme. For a silly reason, in the old hashing scheme, the top N bits
+ were always taken from the results of a hashing function, for a hash
+ table size of 2^N chains. In the new scheme, the bottom N bits are taken
+ instead. [Rationale: This change makes it easier to write portable
+ hashing functions and simplifies the functions that expand or contract
+ the table, making them more efficient.]
+ 2. Added const qualifiers to the rcsid[] and right[] char arrays,
+ which shuts up the GCC compiler from complaining that these are
+ unused statics.
+
+New in 1.5
+
+ 1. First two arguments to list_prune_graft() are reversed. The leftmost
+ argument is now the destination list. Moreover, the function has been
+ renamed list_transfer(). [Rationale: this ordering of parameters is
+ consistent with list_merge(), and the standard C <string.h> functions
+ also pass destination pointers on the left. Renaming the function
+ protects against incorrect use.]
+
+ 2. Red-Black tree dictionaries now support duplicate keys. [Rationale:
+ duplicate keys could be useful in some applications.] When a dictionary
+ is created or initialized, it does not allow duplicate keys. The
+ function dict_allow_dupes() is used to set a flag in a dictionary to
+ henceforth allow duplicates. Once made, the decision to allow
+ duplicates cannot be reversed. [Rationale: toggling between allowing
+ and disallowing duplicates does not seem useful. Once duplicates are
+ admitted, there is no point in disallowing duplicates.] When a key is
+ sought in tree that currently allows duplicates, the leftmost node
+ containing that key is chosen from among the nodes that contain
+ duplicates of the key. Then dict_next() can be used to fetch the
+ remaining duplicates one by one. No particular order among the
+ duplicates may be assumed. However, for what it may be worth, the order
+ between any two duplicates is preserved for as long as they both remain
+ in the dictionary.
+
+ 3. The function prototypes in the header files have been modified to eliminate
+ parameter names. [Rationale: parameter names in prototypes have only
+ documentary value, and may clash with macro identifiers defined in other
+ headers.]
+
+ 4. Dictionary and hash table now have support for automatic allocation of
+ nodes in the insert and delete operations, which means that the user
+ can add items in one operation instead of the two operations of
+ allocating a node and inserting it. [Rationale: ease of use.] There is
+ support for user-defined allocators; the default allocators use the
+ smalloc.c routines. For any instance of a dict_t or hash_t object, the
+ user can override the allocator functions by supplying his or her
+ own pointers to suitable functions, and a context pointer that
+ will be passed to these functions when they are called through that
+ particular dict_t or hash_t instance. [Rationale: flexibility, ease of
+ use, promotes good design.] The function pointers can only be set when
+ the data structure is empty. [Rationale: it is undesirable to switch to
+ a different allocator when there are nodes in the dictionary; it might
+ lead to the error of freeing a node with an incorrect allocator.]
diff --git a/libutil/kazlib/docs/MUST_READ b/libutil/kazlib/docs/MUST_READ
new file mode 100644
index 0000000..20ca12e
--- /dev/null
+++ b/libutil/kazlib/docs/MUST_READ
@@ -0,0 +1,25 @@
+Greetings, Programmer!
+
+I gather that because you are reading this, you are probably considering using
+the C language translation units included here in your own software. If that
+is the case, I would like to know who you are and urge you to contact me.
+
+Here is why: I rove over this code periodically looking for defects. In fact,
+I use it in my own programming projects. If I discover a defect, I will
+notify everyone who I know is a user of this software. If there is a serious
+defect in some code that you are using in your software project, wouldn't you
+want to be informed? In fact, there is no question that you _need_ to be
+informed!
+
+Here is what you do: simply send an e-mail message to kaz at ashi.footprints.net
+with the subject "kazlib" and the body "I am a user". Be sure that your message
+has a good return address. I will manually add your e-mail address to a list
+which I will use only for the purpose of notifications regarding Kazlib. You
+will receive a reply to the effect that you are added.
+
+If ever you should wish to be removed from this list, simply ask and it shall
+be done.
+
+Yours in earnest,
+
+ Kaz Kylheku
diff --git a/libutil/kazlib/docs/README b/libutil/kazlib/docs/README
new file mode 100644
index 0000000..08f14a1
--- /dev/null
+++ b/libutil/kazlib/docs/README
@@ -0,0 +1,66 @@
+This collection of data structures is maintained by
+Kaz Kylheku <kaz at ashi.footprints.net>
+
+INSTRUCTIONS
+
+Simply add the necessary .c and .h files to your project. Include the
+appropriate .h file in any translation unit that interfaces with one or more of
+the kazlib modules. Then compile and link the modules together with your program.
+
+To use kazlib in a C++ project, don't compile the modules with a C++ compiler.
+Compile with a C compiler, and include the header files in
+your C++ translation units. Then link together the translated C and C++.
+As of release 1.2, the header files should work with C++.
+
+IMPORTANT NOTES
+
+1. Self checks
+
+The modules in this collection perform extensive self-checks, some of
+which make the performance really poor (by actually raising the overall
+asymptotic complexity of an operation, for example from O(log N) to O(N)). The
+instrumentation assertions can be disabled by compiling with the NDEBUG macro
+defined.
+
+You can check that your project does not violate the principles of
+implementation hiding in connection with its use of the kazlib modules. This
+is accomplished by defining the macro KAZLIB_OPAQUE_DEBUG at the beginning of
+any translation unit which includes the kazlib header files. Note that
+whereas this will detect violations, it will not result in a translation
+that can be linked against the kazlib. When you are done checking, turn
+off KAZLIB_OPAQUE_DEBUG and recompile. If your compiler has a special ``check only''
+mode which enables it to perform syntax and type checking without doing
+an actual translation (similar to lint), it may be a time-saving idea to
+use it in conjunction with KAZLIB_OPAQUE_DEBUG.
+
+2. Macros with side effects
+
+Some of the kazlib header files define macros that evaluate their arguments
+more than once. This means that if expressions with side effects are passed
+to these macros, undesirable and undefined behavior will happen. There is
+support in Kazlib for catching these kinds of bugs: compile with
+KAZLIB_SIDEEFFECT_DEBUG, and add the except.c and sfx.c modules to your
+object. The macros will now parse their expressions at run time to diagnose
+the presence of side effects and function calls. It's easy to add this support
+to your own code!
+
+3. Thread support
+
+POSIX thread support is enabled by predefining KAZLIB_POSIX_THREADS. Currently
+only the exception-handling module has any need for this. When compiled that
+way, it provides thread-safe exception handling. Threads can independently
+throw exceptions and each thread can install its own specific catcher
+for unhandled exceptions. Moreover, each thread can register its own
+memory allocator functions.
+
+Note: this variant of the code also depends on the ability to cast between void
+* and function pointers, which is a common language extension.
+
+4. CVS identification
+
+The source files contain declarations of a static char array variable called
+rcsid. This contains an expansion of the CVS identification of each module,
+making it possible to determine the ``bill of materials'' that went into an
+executable build. I have now wrapped the declarations of these rcsid[] arrays
+so they are conditional on KAZLIB_RCSID being defined. For many users, these
+are just a waste of space.
diff --git a/libutil/kazlib/docs/docs.ist b/libutil/kazlib/docs/docs.ist
new file mode 100644
index 0000000..808c029
--- /dev/null
+++ b/libutil/kazlib/docs/docs.ist
@@ -0,0 +1,4 @@
+preamble
+"\\begin{theindex}\n\\addcontentsline{toc}{section}{Index}\n"
+postamble
+"\n\\end{theindex}\n"
diff --git a/libutil/kazlib/docs/docs.ltx b/libutil/kazlib/docs/docs.ltx
new file mode 100644
index 0000000..139f212
--- /dev/null
+++ b/libutil/kazlib/docs/docs.ltx
@@ -0,0 +1,4155 @@
+\documentclass{article}
+\usepackage{makeidx}
+\usepackage[margin=1.0in]{geometry}
+\makeatletter
+\newcommand{\defsubsection}{\@startsection
+ {subsection}
+ {2}
+ {0pt}
+ {2.0ex plus 0.1ex minus 0.05ex}
+ {-0pt}
+ {\normalfont\normalsize\bfseries}}
+\newcommand{\defsubsubsection}{\@startsection
+ {subsection}
+ {3}
+ {0ex}
+ {2.0ex plus 0.1ex minus 0.05ex}
+ {1.0ex}
+ {\normalfont\normalsize\bfseries}}
+\renewcommand{\paragraph}{\@startsection
+ {paragraph}
+ {4}
+ {0ex}
+ {2.0ex plus 0.1ex minus 0.05ex}
+ {1.0ex}
+ {\normalsize\bfseries}}
+\makeatother
+\title{Kazlib---Reusable Components\\for C Programming}
+\author{Kaz Kylheku}
+\date{Release 1.20\\July 24, 2001}
+\makeindex
+\setcounter{tocdepth}{1}
+\setcounter{secnumdepth}{4}
+\begin{document}
+\catcode`\_=11
+\def\indextype#1{\index{#1@{\tt #1} type}}
+\def\indexmacro#1{\index{#1@{\tt #1} macro}}
+\def\indexobject#1{\index{#1@{\tt #1} object}}
+\def\indexfunc#1{\index{#1@{\tt #1} function}}
+\def\indexenum#1{\index{#1@{\tt #1} enum constant}}
+\def\synopsis{\paragraph*{Synopsis}}
+\def\constraints{\paragraph*{Constraints}}
+\def\description{\paragraph*{Description}}
+\def\example{\paragraph*{Example}}
+\maketitle
+\abstract{The aim of the Kazlib project is to provide a well-documented
+programming interface featuring commonly needed programming abstractions,
+accompanied by a high quality, portable reference implementation.
+Kazlib consists of four independent components: a list module, a hash table
+module, a dictionary module and an exception handling module. The reference
+implementations of the first three of these are based on, respectively, the
+following algorithms: doubly linked circular list with sentinel node,
+extendible hashing, and red-black tree.}
+\tableofcontents
+\section{Introduction}
+This document establishes the provisions required of an implementation of the
+Kazlib library, and describes a reference implementation thereof.
+This document specifies
+\begin{itemize}
+\item the names and types of identifiers and preprocessor symbols made
+ available by each component;
+\item identifier name spaces reserved for future use by each component;
+\item the interface syntax and semantics of each component operation;
+\item the conditions required for the well-defined execution of each operation;
+\item the externally visible behavior of each component, including global
+ side effects and the effects on the subject data structures;
+
+\item and the implementation language of Kazlib.
+\end{itemize}
+Furthermore, this document describes, but does not specify
+\begin{itemize}
+\item the implementation details of structure objects manipulated by the
+ operations of each component;
+\item objects and functions that are defined by the implementation of
+ each component but are not externally visible;
+\item the algorithms and implementation details of the operations.
+\end{itemize}
+Finally, this document does {\em not\/} specify or describe
+\begin{itemize}
+\item the specific choices for parameters which may be adjusted by an
+ installation or implementation of Kazlib.
+\item the size of any data structure which will exceed the capacity of
+ a particular installation.
+\item the mechanisms or procedures for the translation of Kazlib and
+ their integration with other translation units.
+\end{itemize}
+
+\section{References}
+\label{sec:references}
+
+\begin{trivlist}
+\item ISO 9899:1990, {\it Programming Languages---C.}
+\item {\it Introduction to Algorithms}, Thomas H. Cormen, Charles E.
+Leiserson, Ronald L. Rivest, eighth printing, 1992.
+\end{trivlist}
+
+\section{Definitions and conventions}
+The following terms shall be interpreted in accordance with the definitions
+below. Other terms appearing in this document shall be defined upon their
+first mention, indicated by {\it italic\/} type. Any terms not explicitly
+defined in this document should be interpreted according to ISO 9899:1990,
+clause 3. Failing that, they should be interpreted according to other works
+listed in section \ref{sec:references}.
+\nobreak
+\defsubsection{implementation}: A library and set of C language headers
+which conforms to the specifications of this Document.
+\index{production mode}
+\indexmacro{NDEBUG}
+\defsubsection{production mode}: A mode of operating the implementation
+in such a way that maximum efficiency of execution is achieved at the expense
+of the verification of constraints. An implementation shall provide
+a production mode, which is enabled in an implementation-defined
+manner.\footnote{An implementation may have to supply a separate set of
+libraries for production and for verification use, for instance. The
+manner of selecting libraries varies with each programming environment.} Each
+translation unit of the program which includes a Kazlib header shall ensure that the macro {\tt
+NDEBUG} is defined prior to the inclusion of that header, otherwise the
+implementation is not said to be operated in production mode.
+\index{verification mode}
+\defsubsection{verification mode}: A mode of operating the implementation in
+such a way that maximum error checking is obtained at the cost of
+execution efficiency. An implementation shall provide a verification mode, which
+is enabled in an implementation-defined manner. If any translation unit which
+includes a Kazlib header defines the macro name {\tt NDEBUG}\footnote{The
+intent is that the standard {\tt assert} macro may be exploited
+by the implementation's headers for the purpose of provisioning verification
+mode.} prior to including that header, the implementation is not said to be in
+verification mode. The minimum requirement of a Kazlib implementation operated
+in verification mode is that it shall stop translation or execution of any
+program which violates a constraint.
+\index{undefined behavior}
+\defsubsection{undefined behavior}: Behavior of a program, upon violation of a
+requirement with respect to the use of Kazlib, or upon use of corrupt or
+incorrect data, for which this document does not impose any requirements.
+Additional undefined behaviors are:
+\begin{itemize}
+\item any behavior that is undefined by the C language standard;
+\item evaluation of an object whose contents are indeterminate;
+\item a violation of any explicit constraint stated in
+this document, if that program was built using Kazlib in production
+mode;\footnote{The intent is that violations of constraints are diagnosed by
+the implementation in verification mode, and hence do not lead to undefined
+behavior.}
+\item a violation of any requirement stated in this document that
+is not designated as a constraint, and is introduced using the word
+{\it shall}; and
+\item any other construct for which no definition of behavior can be deduced
+from this document.
+\end{itemize}
+If a program invokes undefined behavior of any kind, the Kazlib implementation
+is absolved from any requirements as to what events should ensue. The
+implementation may respond by invoking undefined behavior in the C language
+sense, or it may detect the behavior and terminate with a diagnostic message.
+\defsubsection{implementation-defined}: An adjective which, when appearing
+in the description of a feature, represents a requirement that the
+implementor must supply a definition, and document that
+definition. This adjective is applied to both behavior and to results.
+Implementation-defined behavior is behavior which depends on the
+characteristics of an implementation.\footnote{It is not considered adequate
+for the implementor to allow implementation-defined behavior to produce
+unpredictable effects or to terminate the program when such behavior is
+invoked.} When said of a result,
+implementation-defined means that a value is successfully computed, but depends
+on the characteristics of the implementation. It is possible for the presence of a
+requirement on a program to be described as implementation-defined, giving the
+implementor a choice whether to make that requirement or not. If a program
+violates a requirement whose presence is implementation-defined, that program's
+behavior is undefined in any implementation which elects to in fact impose that
+requirement.
+\index{implementation-defined}
+\defsubsection{unpredictable result}: A successfully computed value which is
+unreliable because some procedure or data failed to satisfy a property required
+by the computation.
+\defsubsection{constraint}: A semantic restriction with which a program must
+comply. Some sections of this Document contain paragraphs under the heading
+{\it Constraints\/} which list all constraints pertaining to the described
+feature. When operated in production mode, the Kazlib implementation
+is not required to diagnose constraint violations. When operated in
+verification mode, the Kazlib implementation must halt translation or
+execution of a program which violates a constraint.
+\index{constraint}
+\defsubsection{comparison function}: A function which accepts two arguments
+\index{comparison function}
+of type \verb|const void *| and returns a value of type int based on
+a ranking comparison of these arguments, and which satisfies the following
+additional semantic properties. If the two arguments are deemed to be equal, the
+function must return zero. If the first argument is determined to have a
+greater rank than the second, a positive value is returned. Otherwise if the
+first argument is determined to have a lesser rank than the second, a negative
+value is returned. The rank is computed as if each value has associated with it
+an integer, not necessarily unique, and as if these integers are compared for ordinary equality or
+inequality when values are said to be compared. The assignment of integers is
+up to the designer of the comparison function, and does not change between
+successive invocations of the function.\footnote{Of course, an actual
+comparison function need not assign actual integer ranks to data items, but it
+must behave as if such ranks were assigned.}
+If a comparison function is invoked in the context of an operation on some data
+structure, it shall not invoke any operation on any component of that same
+structure.\footnote{Thus, if a comparison function is invoked from, for
+instance, {\tt list_sort}, it must not call any list operations that
+inspect or modify the list being sorted, or any of its constituent nodes.}
+\defsubsection{opaque data type}: A data type whose precise definition is
+not documented, and which is intended to be manipulated only using the
+documented interface, which consists of a set of functions. Many data types in
+Kazlib are described as opaque. A program which bypasses the documented
+interfaces in inspecting or manipulating these data types invokes undefined
+behavior, and is not portable among Kazlib implementations.
+\defsubsection{user}: \index{user} The program which uses Kazlib.
+\defsubsection{user data}: \index{user data} Data provided by the program
+to which Kazlib stores a pointer, but otherwise does not inspect or modify.
+
+\section{Environment}
+\label{sec:environment}
+
+The translation and use of Kazlib requires a conforming, hosted implementation
+of the C language which meets the following additional minimal requirements:
+\begin{enumerate}
+\item The C implementation distinguishes external names by at least their
+initial 15 characters\footnote{The ISO 9899:1990 standard demands only that
+external names be distinguished by their initial six characters.}. External
+names that are distinct in their first 15 characters are treated by the
+implementation as distinct names. Upper and lower case letters in external
+identifiers need not be treated as distinct.
+\item The C implementation does not claim the identifier \verb|__cplusplus|
+for its internal use as a preprocessor symbol or keyword.
+\end{enumerate}
+If Kazlib headers are used by a C++ program, the C++ implementation
+meets these additional requirements:
+\begin{enumerate}
+\item the C++ implementation identifies itself by predefining the preprocessor
+symbol \verb|__cplusplus|;
+\item the C++ implementation is capable of linkage against
+the C implementation with which the Kazlib source files were translated.
+\end{enumerate}
+The Kazlib headers shall not make use of any names that are claimed
+by the C++ programming language, and shall ensure that the \verb|extern "C"|
+mechanism is used for all declarations when they are included into a C++
+translation unit, or otherwise provide compatibility with C++.\footnote{The
+intent is that the Kazlib implementation could, in principle, provide
+a separate set of headers for use with each language.}
+
+In programming environments that support the programming mechanism of multiple
+threads of execution, an implementation of Kazlib may be designated as {\it
+thread safe}. To be called thread safe, it must guarantee that the use of an
+object by one thread cannot visibly interact or interfere with the concurrent
+or interleaved use of another object by another thread. If a Kazlib
+implementation that is not thread safe is provided for an environment which
+supports threads, it shall be accompanied by documentation which describes
+the extent of this limitation.
+
+A Kazlib implementation can also be designated as being {\it async safe}.
+The minimum requirement for this designation is that an operation on an object
+can be interrupted by delivery of an asynchronous signal and from within the
+catching function for that signal, it is safe to perform an operation on
+another object. An implementation shall document that it is async safe,
+or the extent to which it fails to be async safe.
+
+\section{General restrictions}
+
+\subsection{Headers}
+
+The Kazlib headers may be included in any order, and may be included more than
+once. Prior to the inclusion of a Kazlib header, the translation unit shall not
+define any macro name that has the same spelling as a C language keyword. The
+Kazlib headers may behave as though they include arbitrary standard C headers,
+so any requirements related to the inclusion of standard headers apply to
+Kazlib headers. A header shall be included before the first reference to any
+of the functions, types or macros that it defines.
+
+If one or more preprocessor symbols whose names begin with the sequence
+\verb|KAZLIB_| are defined prior to the inclusion of a Kazlib header,
+the behavior is implementation-defined.
+
+\subsection{Reserved macros}
+
+A Kazlib header defines all of the macros explicitly listed in the section of
+this document that defines the contents of that header. It may also define
+additional macros that belong to the macro namespace reserved by that header.
+The translation unit that includes the header shall not \verb|#define| or
+\verb|#undef| any of these macros.
+
+A header may define function-like macros that supplement existing functions,
+provided that such macros do not cause multiple evaluation of arguments except
+as explicitly permitted, and are safe to use wherever the corresponding
+function call would be. These function-like macros may be subject to
+\verb|#undef|.\footnote{In principle, an implementation may provide, within the
+reserved namespaces, additional functions not specified in this document, and
+function-like macro equivalents of these functions. A program that uses such
+identifiers in a block or function scope should use {\tt \#undef} on these
+identifiers prior to their use.}
+
+\subsection{Reserved symbols}
+
+Each Kazlib header provides file scope declarations for the typedef names,
+struct tags, enum constants and function names listed in its corresponding
+section in this document. Moreover, each header may define additional such
+names that fall into the documented reserved namespaces.
+
+The behavior is undefined if a translation unit that includes a Kazlib header
+defines any identifier that is the same as an identifier reserved by the header
+in the same scope and namespace.\footnote{Therefore, it is permitted to redeclare
+or redefine the identifiers reserved by a previously included Kazlib header,
+provided that the declarations or definitions are in a different namespace or
+scope. Reserved names may be redeclared in a block scope, or used as
+statement labels which have function scope and are in their own namespace.}
+
+The behavior is also undefined if the program contains a definition of an
+object or function with external linkage whose name matches an external object
+or function defined by a Kazlib component that is used as part of the
+program, or whose name is in a namespace reserved by that
+component.\footnote{This restriction exists whether or not the corresponding Kazlib
+header is included.}
+
+Lastly, the behavior is undefined if a translation unit defines a macro whose
+name is in the space of reserved symbols of a Kazlib header that is included in
+that translation unit.
+
+\subsection{Argument aliasing}
+
+Kazlib provides functions that operate on objects of various types. Pointers
+to objects are passed to these functions, thereby giving rise to the
+possibility of {\it aliasing}---passing of objects that wholly or partially
+overlap. The program shall not present aliased objects to any Kazlib function.
+Objects of distinct types shall not be aliased in a function call under any
+circumstances.
+The aliasing of two or more objects of compatible type is permitted only as
+explicitly documented in the description of a function; in all such
+circumstances, only exact overlapping is permitted.\footnote{That is to say,
+where explicitly allowed, a pointer to the same object may be specified for two
+(or more) parameters of like type.}
+
+\subsection{Object initialization}
+
+The Kazlib opaque data types can only be initialized with the initialization
+functions provided by the Kazlib library, or by implementation-defined
+initialization functions.\footnote{Of course, the use of implementation-defined
+functions results in programs that are not portable among library
+implementations.} An opaque object that is initialized by a method other than
+by being passed to an appropriate initialization function, or that is not
+initialized at all, has indeterminate contents. A pointer to an object having
+indeterminate contents may be passed to an initialization function; the object
+then has well-determined contents.
+
+An object whose initialization function is capable of indicating failure is
+considered indeterminate if the attempt to initialize that object using that
+function does in fact fail. The program shall not attempt to deinitialize such
+an object. The implementation shall reclaim any resources that were allocated
+for an object whose initialization failed. This reclamation need
+not be immediate, but may be delayed; however, the delay shall not
+give rise to the possibility of resource leaks in any correct program.
+
+Those objects for which deinitialization operations are defined should be
+subject to these operations when these objects are no longer needed. Failure
+to apply the deinitialization functions may result in the leakage of resources.
+
+\subsection{Object copying}
+
+Certain data types may be sensitive to their own location in memory. This
+means that copying their values by assignment or \verb|memcpy| results in the
+copy having an indeterminate value which cannot be used. All opaque types in
+Kazlib are assumed to have this property; copying the value of an opaquely
+typed object to another suitably typed object causes the destination
+object to have indeterminate contents.
+
+\section{List component}
+
+The List component provides a set of functions, macros and type declarations
+which together provide a library for maintaining a possibly empty ordered set
+of elements, called a {\it list}. This list has the following properties:
+\index{List}\begin{enumerate}
+\item If the list is not empty, a first and last element can be identified.
+ In a list having only one element, that one element is both the first and
+ last element.
+\item Each element that is not the last element has another element as its
+ {\it successor}.
+ \index{successor!of a list element}
+ \index{List!successor of an element}
+\item Each element that is not the first element has a {\it
+ predecessor}.
+ \index{predecessor!of a list element}
+ \index{List!predecessor of an element}
+\item No element is the predecessor or successor of more than one element.
+\item If one element is the successor of another, the other is necessarily the
+ predecessor of the first.
+\item Each element is associated with arbitrary {\it satellite\/} data.
+\end{enumerate}
+The {\it size} of a list, also known as the {\it list count}, is simply the
+number of elements contained in it.\index{size!of a list}\index{List!count}
+
+A list imposes a maximum value on the number of nodes that may be in it
+simultaneously. This is known as the list's {\it capacity}. A list that
+has the maximum number of nodes is said to be full.
+
+\subsection{Interface}
+
+\subsubsection{The {\tt list.h} header}
+
+Each C or C++ translation unit that is to use the functionality of
+the List component shall include the header \verb|list.h|. This header
+shall contain declarations of types and external functions, and definitions of
+macros.
+The following typedef names shall be defined:\index{List!typedef names}
+\index{typedefs!defined by List}
+\begin{verbatim}
+ list_t listcount_t
+ lnode_t lnodepool_t
+\end{verbatim}
+In addition, the following structure tags may be defined:\index{List!tag names}
+\index{tags!defined by List}
+\begin{verbatim}
+ struct list_t
+ struct lnode_t
+ struct lnodepool_t
+\end{verbatim}
+The following external function names shall be declared:
+\index{List!function names}\index{functions!defined by List}
+\begin{verbatim}
+ list_append list_prev
+ list_contains list_process
+ list_count list_return_nodes
+ list_create list_sort
+ list_del_first list_find
+ list_del_last list_transfer
+ list_delete list_verify
+ list_destroy lnode_borrow
+ list_destroy_nodes lnode_create
+ list_extract lnode_destroy
+ list_first lnode_get
+ list_init lnode_init
+ list_ins_after lnode_is_in_a_list
+ list_ins_before lnode_pool_create
+ list_is_sorted lnode_pool_destroy
+ list_isempty lnode_pool_init
+ list_isfull lnode_pool_isempty
+ list_last lnode_pool_isfrom
+ list_merge lnode_put
+ list_next lnode_return
+ list_prepend
+\end{verbatim}
+The following preprocessor symbols (macros) shall be defined:
+\index{List!macro names}\index{macros!defined by List}
+\indexmacro{LISTCOUNT_T_MAX}
+\indexmacro{LIST_H}
+\begin{verbatim}
+ LISTCOUNT_T_MAX
+ LIST_H\end{verbatim}
+\index{symbols!reserved by List}\index{List!reserved symbols}
+Macro identifiers which begin with the upper-case prefix \verb|LIST| are
+reserved for future extensions to the \verb|list.h| header, as are
+names in the ordinary and tag namespaces which begin with
+\verb|list_| or \verb|lnode_|. External names which begin with \verb|list_| or
+\verb|lnode_| are reserved by the Kazlib library regardless of what header
+files are included.
+
+\subsubsection{The {\tt list_t} type}
+
+\indextype{list_t}
+The type \verb|list_t| is an opaque data type which maintains information about the
+current state of a single list. A list consists of an instance of the
+\verb|list_t| type, plus zero or more instances of the type \verb|lnode_t|. An
+instance of the \verb|list_t| type can be dynamically created using the
+\verb|list_create| function, and destroyed by the \verb|list_destroy| function.
+Alternately, the program can declare an object of type \verb|list_t| and have
+it initialized via the \verb|list_init| function.
+
+\subsubsection{The {\tt listcount_t} type}
+
+\indextype{listcount_t}
+\indexmacro{LISTCOUNT_T_MAX}
+The type \verb|listcount_t| is an unsigned integral type which represents
+the number of nodes in a list. The specific choice of unsigned integral type
+is implementation-defined. The \verb|LISTCOUNT_T_MAX| macro expands to a
+constant expression of type \verb|listcount_t| which specifies the maximum
+value of that type.\footnote{For example, if the implementation defines
+{\tt listcount_t} as an alias for the type unsigned long, then
+{\tt LISTCOUNT_T_MAX} must have the same value as {\tt ULONG_MAX}.}
+
+\subsubsection{The {\tt lnode_t} type}
+
+\indextype{lnode_t}
+The type \verb|lnode_t| is an opaque type that represents a single node of a
+list. A node contains a reference to satellite data provided by the user,
+and also stores the key that is associated with the node when it is inserted.
+Nodes may be dynamically created by the \verb|lnode_create| function.
+Alternately, the program may supply an \verb|lnode_t| object that can be
+initialized by the \verb|lnode_init| function.
+
+\subsubsection{The {\tt lnodepool_t} type}
+
+\indextype{lnodepool_t}
+The \verb|lnodepool_t| type provides an alternate method for supplying list
+nodes to the application. A user-supplied or dynamically allocated fixed size
+array of nodes is converted into a {\it pool\/} of nodes from which free
+nodes may be obtained and to which they may be returned. A user-supplied node
+pool is created by the function \verb|lnode_pool_init| which requires a pointer
+to an object of type \verb|lnodepool_t|, a pointer to the first element of an
+array of \verb|lnode_t| objects, as well as an integer representing the size of
+the array. Alternately, the function \verb|lnode_pool_create| will dynamically
+allocate an object of type \verb|lnodepool_t| containing the specified number
+of list nodes.
+
+\subsubsection{The {\tt list_append} function}
+
+ \indexfunc{list_append}
+ \index{List!appending a node}
+ \index{append node to list}
+ \synopsis
+ \begin{verbatim}
+ void list_append(list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The second argument shall not refer to a node that is already in a list
+ or in a list node pool. The first argument shall not refer to a list
+ that is full.
+
+ \description
+ The append operation causes the node pointed at by the second
+ argument to become the last node in the list pointed at by the first
+ argument.\footnote{That is to say, after the operation, the
+ {\tt list_last} function, when applied to the list, shall return a pointer
+ to that node.}
+
+ If the first argument is an expression with side effects, the behavior
+ is undefined.\footnote{Thus, the implementation may provide a macro
+ version of {\tt list_append} which evaluates the first argument
+ more than once.}
+ \index{macros!and side effects}
+
+\subsubsection{The {\tt list_contains} function}
+
+ \indexfunc{list_contains}
+ \index{List!testing for presence of node}
+ \nobreak
+ \synopsis
+ \begin{verbatim}
+ int list_contains(list_t *, lnode_t *node);\end{verbatim}
+ \nobreak
+ \description
+ \nobreak
+ The \verb|list_contains| function shall return 1 if the node
+ pointed at by the second argument is in the list pointed at by the first
+ argument. Otherwise, it shall return 0.
+
+\subsubsection{The {\tt list_count} function}
+
+ \indexfunc{list_count}
+ \index{List!count}
+ \index{List!size}
+ \synopsis
+ \begin{verbatim}
+ listcount_t list_count(list_t *);\end{verbatim}
+
+ \description
+
+ The \verb|list_count| function returns a value which represents the number
+ of nodes currently stored in the list pointed at by the argument.
+
+\subsubsection{The {\tt list_create} function}
+
+ \indexfunc{list_create}
+ \index{List!creation of}
+ \index{create!list object}
+ \synopsis
+ \begin{verbatim}
+ list_t *list_create(listcount_t);\end{verbatim}
+
+ \description
+ The \verb|list_create| function instantiates and initializes an object of
+ type \verb|list_t|, and returns a pointer to it unless insufficient
+ resources exist for the creation of the object, in which case a null
+ pointer is returned.
+
+ The value of the function's argument establishes, for the entire duration
+ of the list object, its capacity.
+
+ The newly created list object is empty.
+
+\subsubsection{The {\tt list_del_first} function}
+
+ \index{List!first node}
+ \indexfunc{list_del_first}
+ \index{List!deletion}
+ \index{delete!first node of a list}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_del_first(list_t *);\end{verbatim}
+
+ \constraints
+ The argument shall not point to an empty list.
+
+ \description
+ The \verb|list_del_first| function removes the first node from the
+ list pointed at by the argument and returns a pointer to that
+ node.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_del_last} function}
+
+ \index{List!last node}
+ \indexfunc{list_del_last}
+ \index{List!deletion}
+ \index{delete!last node of a list}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_del_last(list_t *);\end{verbatim}
+
+ \constraints
+ The argument shall not point to an empty list.
+
+ \description
+ The \verb|list_del_last| function removes the last node from the list
+ specified by the argument, and returns a pointer to that node. If,
+ prior to the operation, that node had a predecessor, that predecessor
+ shall become the new last node of the list. Otherwise, the list
+ shall become empty.
+
+ The new value of the list count shall be one less than its value
+ prior to the call to this function.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_delete} function}
+
+ \indexfunc{list_delete}
+ \index{List!deletion}
+ \index{delete!arbitrary node of a list}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_delete(list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The second argument shall point to a node that is inside the list
+ pointed at by the first argument.
+
+ \description
+ The \verb|list_delete| function removes the node pointed at by its
+ second argument from the list pointed at by its first argument.
+ A pointer to the deleted node is returned.
+
+\subsubsection{The {\tt list_destroy} function}
+
+ \indexfunc{list_destroy}
+ \index{List!destruction of}
+ \synopsis
+ \begin{verbatim}
+ void list_destroy(list_t *);\end{verbatim}
+
+ \constraints
+ The argument shall point to an empty list.
+
+ \description
+ The empty list pointed at by the argument is destroyed. If the list has
+ not been created by a call to the \verb|list_create| function, the
+ behavior is undefined.
+
+ A pointer that previously referred to a list that has been disposed of by
+ \verb|list_destroy| has an indeterminate value.
+
+\subsubsection{The {\tt list_destroy_nodes} function}
+
+ \indexfunc{list_destroy_nodes}
+ \synopsis
+ \begin{verbatim}
+ void list_destroy_nodes(list_t *);\end{verbatim}
+
+ \description
+ The nodes, if any, contained in the list pointed at by the argument are
+ disposed of as if by a call to the \verb|lnode_destroy| function. If any
+ node contained in the list was created by means other than the
+ \verb|lnode_create| function, the behavior is undefined.
+
+ After the operation, the list is empty.
+
+ Any pointer that referred to any of the destroyed nodes takes on an
+ indeterminate value.
+
+\subsubsection{The {\tt list_extract} function}
+
+ \index{List!node range extraction}
+ \indexfunc{list_extract}
+ \synopsis
+ \begin{verbatim}
+ void list_extract(list_t *, list_t *, lnode_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The second argument points to the {\it source list}. The third
+ argument is either null, or points to a node that is an occupant
+ of the source list. This node is called the {\it starting node}.
+ The fourth argument is either null, or points to a node that is
+ an occupant of the source list. This node is called the {\it ending
+ node}. If the starting node and ending node are both specified, and are
+ distinct nodes, then the starting node shall appear earlier in the source
+ list than the ending node.
+
+ The transfer request shall not call for the capacity of the destination
+ list to be exceeded.
+
+ \description
+ The \verb|list_extract| function moves nodes from the source
+ list to the {\it destination list\/} pointed at by the first
+ argument.\footnote{This right-to-left direction of transfer is consistent
+ with the semantics of standard C library functions such as {\tt memmove} or
+ {\tt strcpy}.}
+
+ If the third and fourth arguments are not null, the entire range of nodes
+ from the starting node to the ending node, inclusive, is transferred
+ from the source list to the end of the destination list, where they appear
+ in their original order. Other nodes in the source list, if any, are
+ unaffected.
+
+ If the third and fourth arguments both point to the same node, that
+ node alone is transferred to the end of the destination list.
+
+ If either the third argument or the fourth argument is null, or both are null,
+ no transfer of nodes takes place.
+
+ The source and destination list may be the same object.
+
+\subsubsection{The {\tt list_first} function}
+
+ \index{List!first node}
+ \indexfunc{list_first}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_first(list_t *);\end{verbatim}
+
+ \description
+ If the list pointed at by the argument is an empty list, a null pointer
+ is returned. Otherwise, a pointer to the first node in that list is
+ returned.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_init} function}
+
+ \indexfunc{list_init}
+ \synopsis
+ \begin{verbatim}
+ list_t *list_init(list_t *, listcount_t);\end{verbatim}
+
+ \constraints
+ The second argument shall not have a zero value.
+
+ \description
+ The \verb|list_init| function initializes the list object pointed at by the
+ first argument, turning it into a valid, empty list. If the object is an
+ already initialized list, the behavior is undefined. A list returned by
+ \verb|list_create| is considered initialized. The second argument
+ specifies the maximum number of nodes that may simultaneously occupy the
+ list.
+
+ The value returned is that of the first argument.
+
+\subsubsection{The {\tt list_ins_after} function}
+
+ \indexfunc{list_ins_after}
+ \index{insert!node into list}
+ \index{List!insertion}
+ \synopsis
+ \begin{verbatim}
+ void list_ins_after(list_t *, lnode_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The first argument shall point to a list that is not already full. The
+ second argument shall point to a node, called the {\it new node}, that is not
+ already an occupant of the list pointed at by the first argument, nor
+ of any other list or node pool object. The third
+ argument shall point to a node, called the {\it reference node}, that is an
+ occupant of the list.
+
+ \description
+ The new node becomes an occupant of the list, such that its predecessor
+ is the reference node. If the reference node has a successor, the
+ new node is inserted between the reference node and that successor.
+ Otherwise, the new node becomes the last node of the list.
+
+\subsubsection{The {\tt list_ins_before} function}
+
+ \indexfunc{list_ins_before}
+ \index{insert!node into list}
+ \index{List!insertion}
+ \synopsis
+ \begin{verbatim}
+ void list_ins_before(list_t *, lnode_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The first argument shall point to a list that is not already full. The
+ second argument shall point to a node, called the {\it new node}, that is not
+ already an occupant of the list pointed at by the first argument, nor
+ of any other list or node pool object. The third
+ argument shall point to a node, called the {\it reference node}, that is an
+ occupant of the list.
+
+ \description
+ The new node becomes an occupant of the list, such that its successor
+ is the reference node. If the reference node has a predecessor, the
+ new node is inserted between the reference node and that predecessor.
+ Otherwise, the new node becomes the first node of the list.
+
+\subsubsection{The {\tt list_is_sorted} function}
+\label{list:is:sorted}
+ \indexfunc{list_is_sorted}
+
+ \synopsis
+ \begin{verbatim}
+ int list_is_sorted(list_t *,
+ int (const void *, const void *));\end{verbatim}
+
+ \description
+ The first argument points to a list object. The second is assumed to
+ point to a comparison function.
+
+ If the list has exactly one node or is empty, $1$ is returned
+ unconditionally. Otherwise, nodes of the list are examined to
+ determine whether they are in a sorted order according to the comparison
+ function. This is true if the integer ranks of their data items,
+ examined from the first node of the list through to the last node, form a
+ monotonically increasing sequence. If the nodes are in order, the value $1$
+ is returned. Otherwise $0$ is returned.
+
+ If the list has two or more nodes, and the second argument is a pointer to
+ a function that has the correct type, but does not satisfy the semantic
+ properties of a comparison function, the result is unpredictable, but is
+ guaranteed to be one of the values~$0$~or~$1$.
+
+\subsubsection{The {\tt list_isempty} function}
+
+ \indexfunc{list_isempty}
+ \synopsis
+ \begin{verbatim}
+ int list_isempty(list_t *);\end{verbatim}
+
+ \description
+ The \verb|list_isempty| function returns $1$ if the list pointed at by
+ the first argument is empty. Otherwise it returns $0$.
+
+\subsubsection{The {\tt list_isfull} function}
+
+ \indexfunc{list_isfull}
+ \synopsis
+ \begin{verbatim}
+ int list_isfull(list_t *);\end{verbatim}
+
+ \description
+ The \verb|list_isfull| function returns $1$ if the list pointed at by
+ the first argument is full. Otherwise it returns $0$.
+ A list is considered full when it contains the maximum number of nodes
+ that was specified upon its initialization.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_last} function}
+
+ \index{List!last node}
+ \indexfunc{list_last}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_last(list_t *);\end{verbatim}
+
+ \description
+ If the list pointed at by its first argument is empty, the \verb|list_last|
+ function returns a null pointer. Otherwise it returns a pointer to the
+ last node.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_merge} function}
+
+ \index{List!merge operation}
+ \indexfunc{list_merge}
+ \synopsis
+ \begin{verbatim}
+ void list_merge(list_t *, list_t *,
+ int (const void *, const void *));\end{verbatim}
+
+ \constraints
+ The list pointed at by the first argument is called the {\it destination
+ list}. The second argument points to the {\it source list}. The third
+ argument points to a comparison function. The sum of the number of nodes
+ occupying the source list and the destination list shall not exceed the
+ maximum number of nodes that are permitted to occupy the destination list.
+ Furthermore, both the source and destination list shall be sorted such that
+ a call to \verb|list_is_sorted| given a pointer to either list as a first
+ argument, and the pointer to the comparison function as its second
+ argument, shall yield the value $1$.
+
+ \description
+ Nodes from the sorted source list are merged into the sorted destination
+ list. After the operation, the source list is empty and the destination
+ list contains all of the nodes it contained prior to the operation, as well
+ as all of the nodes that the source list contained. The nodes are in sorted
+ order according to the comparison function.
+
+ If the third argument is a pointer to a function that has the correct type,
+ but does not fulfill the semantic properties of a comparison function, the
+ order of the nodes in the destination list is unpredictable.
+
+ If the source and destination list are the same object, the
+ \verb|list_merge| operation has no effect.
+
+\subsubsection{The {\tt list_next} function}
+
+ \indexfunc{list_next}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_next(list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The node pointed at by the second argument is an occupant of the list pointed
+ at by the first argument.
+
+ \description
+ If the node pointed at by the second argument has a successor, a pointer to
+ that successor is returned. Otherwise, a null pointer is returned.
+
+ If the second argument is an expression which has side effects, the behavior
+ is undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_prepend} function}
+
+ \indexfunc{list_prepend}
+ \index{List!prepending a node}
+ \index{prepend node to list}
+ \synopsis
+ \begin{verbatim}
+ void list_prepend(list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The second argument shall not refer to a node that is already in a list
+ or in a list node pool. The first argument shall not refer to a list
+ that is full.
+
+ \description
+ The prepend operation causes the node pointed at by the second
+ argument to become the first node in the list pointed at by the first
+ argument. After the operation, the \verb|list_first| function, when
+ applied to the list, shall return a pointer to that node.
+ If, prior to the operation, the list is empty, then the prepended node
+ shall become the first node in that list, otherwise, the prepended node
+ becomes the predecessor of what was previously the first node.
+
+ If the first argument is an expression with side effects, the behavior
+ is undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt list_prev} function}
+
+ \indexfunc{list_prev}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_prev(list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The node pointed at by the second argument is an occupant of the list pointed
+ at by the first argument.
+
+ \description
+ If the node pointed at by the second argument has a predecessor, a pointer to
+ that predecessor is returned. Otherwise, a null pointer is returned.
+
+ If the second argument is an expression which has side effects, the behavior
+ \index{macros!and side effects}
+ is undefined.
+
+\subsubsection{The {\tt list_process} function}
+
+ \indexfunc{list_process}
+ \synopsis
+ \begin{verbatim}
+ void list_process(list_t *, void *,
+ void (*)(list_t *, lnode_t *, void *));\end{verbatim}
+ \nobreak
+ \description
+ The \verb|list_process| function iterates over the nodes of a list,
+ and for each node invokes a callback function.\footnote{In most cases,
+ it is more convenient and preferable to
+ iterate over the list using explicit calls to {\tt list_first}
+ and {\tt list_next}.}
+ The second argument is a {\it context pointer\/} which can have any value.
+ The third argument of
+ \verb|list_process| shall be a pointer to a function which is compatible
+ with the specified type. If the list contains one or more nodes,
+ then the function is invoked once for each node, in order from first
+ to last. On each invocation, the first argument of the callback is a
+ pointer to the list; the second argument is a pointer to a node, called
+ the {\it subject node}; and the third argument repeats the context pointer
+ value that was originally passed to \verb|list_process|.
+
+ The callback function may delete the subject node by, for instance, calling
+ \verb|list_delete|. It may insert new nodes to any place in the list;
+ however, if such an insertion causes the subject node to acquire
+ a new successor, it is implementation-defined whether upon returning
+ from the callback function, the traversal shall continue with the
+ new successor, or with the original successor.
+
+ The callback function, and any function invoked from the callback
+ function, shall not destroy the list or make any modifications
+ other than the insertion of new nodes, or the deletion of the
+ subject node.
+
+ The callback function may recursively invoke \verb|list_process| for the
+ same list or for a different list; the callback invocations arising out of
+ the nested call inherit all of the restrictions of the outer callback in
+ addition to being subject to the usual restrictions.\footnote{This means,
+ for instance, that if two callbacks are in progress for different
+ subject nodes from the same list, the inner callback may not delete
+ its subject node, because it inherits the restriction that the only
+ permitted deletion is the outer callback's subject node.}
+
+ The callback function may freely operate on a different list,
+ subject to any inherited restrictions.
+
+\subsubsection{The {\tt list_return_nodes} function}
+
+ \indexfunc{list_return_nodes}
+ \synopsis
+ \begin{verbatim}
+ void list_return_nodes(list_t *, lnodepool_t *);\end{verbatim}
+
+ \description
+
+ Every node in the list specified by the first argument
+ is returned to the node pool specified by the second argument.
+ If the list contains a node that has not been allocated
+ from that node pool, the behavior is undefined.
+
+\subsubsection{The {\tt list_sort} function}
+
+ \index{List!sort operation}
+ \indexfunc{list_sort}
+ \synopsis
+ \begin{verbatim}
+ void list_sort(list_t *, int (const void *, const void *));\end{verbatim}
+
+ \description
+
+ The \verb|list_sort| function changes the order of the nodes of the list
+ specified by the first argument according to the comparison function
+ pointed at by the second argument.
+
+ If the list is empty, or contains only one node, the comparison function is
+ not called.
+
+ Whenever the comparison function is invoked, its arguments are the data
+ pointers stored in two distinct nodes of the list.
+
+\subsubsection{The {\tt list_find} function}
+
+ \index{List!find operation}
+ \indexfunc{list_find}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *list_find(list_t *,
+ const void *, int (const void *, const void *));\end{verbatim}
+
+ \description
+
+ The \verb|list_find| function exhaustively searches the list for a node
+ whose satellite data matches a search key according to the comparison
+ function. The first argument is the list to be searched, the second
+ argument specifies the search key and the third argument is a pointer
+ to the comparison function.
+
+ The comparison function is invoked to compare the key against the
+ satellite data of successive nodes of the list, starting with the first
+ node. A pointer to the first node for which the comparison function returns
+ zero is returned.
+
+ If the list is empty, or the comparison function returns non-zero for
+ each item, a null pointer is returned.
+
+\subsubsection{The {\tt list_transfer} function}
+
+ \index{List!node transfer}
+ \indexfunc{list_transfer}
+ \synopsis
+ \begin{verbatim}
+ void list_transfer(list_t *, list_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+ The third argument is either null, or it points at a node which is an
+ occupant of the list pointed at by the second argument.
+
+ The transfer request shall not call for the capacity of the destination
+ list to be exceeded.
+
+ \description
+ The \verb|list_transfer| function moves nodes from the list
+ pointed at by the second argument to the list pointed at by
+ the first argument.
+
+ If the third argument is not null, it specifies the node in the source list
+ at which the transfer begins. That node, its successor, and all
+ subsequent nodes, are transferred to the end of the destination list where
+ they appear in their original order. Other nodes in the source list are
+ unaffected.
+
+ If the third argument is null, no transfer of nodes takes place.
+
+ The source and destination list may be the same object.
+
+ If \verb|DL|, \verb|SL| and \verb|SN| are appropriately typed expressions,
+ the function call
+
+\begin{verbatim}
+ list_transfer(DL, SL, SN);
+\end{verbatim}
+ is equivalent to
+\begin{verbatim}
+ list_extract(DL, SL, SN, list_last(SL));
+\end{verbatim}
+ except that \verb|SL| is evaluated only once.
+
+\subsubsection{The {\tt list_verify} function}
+
+ \indexfunc{list_verify}
+ \synopsis
+ \begin{verbatim}
+ int list_verify(list_t *list);\end{verbatim}
+
+ \description
+ The intent of the \verb|list_verify| function is to perform a verification
+ on the list object, regardless of whether the Kazlib implementation is
+ operated in verification or production mode. If the list object
+ and its constituent nodes have been correctly manipulated, and the
+ program has not caused any undefined behaviors, the value $1$ is returned.
+ Otherwise, the function may be able to, but is not guaranteed to, detect
+ corruption, and return the value zero.
+
+\subsubsection{The {\tt lnode_borrow} function}
+
+ \indexfunc{lnode_borrow}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *lnode_borrow(lnodepool_t *, void *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_borrow| function allocates a node from
+ the pool managed by the given \verb|lnodepool_t| object.
+ If the request succeeds, a pointer to the node is returned. If the object
+ has run out of nodes, the return value is a null pointer.
+
+\subsubsection{The {\tt lnode_create} function}
+
+ \indexfunc{lnode_create}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *lnode_create(void *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_create| function dynamically allocates a list node,
+ stores in it the data value specified in the argument and
+ returns a pointer to it. The allocation is performed by a call to the
+ standard \verb|malloc| function. If the allocation fails, a null
+ pointer is returned.
+
+\subsubsection{The {\tt lnode_destroy} function}
+
+ \indexfunc{lnode_destroy}
+ \synopsis
+ \begin{verbatim}
+ void lnode_destroy(lnode_t *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_destroy| function destroys a list node that has been
+ allocated with the \verb|lnode_create| function. The value of any pointer
+ that referred to the node that was thus freed is indeterminate.
+
+ If the node is currently the occupant of a list, the behavior is undefined
+ if the list is subsequently used.
+
+\subsubsection{The {\tt lnode_get} function}
+
+ \indexfunc{lnode_get}
+ \synopsis
+ \begin{verbatim}
+ void *lnode_get(lnode_t *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_get| function retrieves the \verb|void *| data value
+ associated with a node.\footnote{This is the {\bf only} interface for
+ retrieving the data element.}
+
+\subsubsection{The {\tt lnode_init} function}
+
+ \indexfunc{lnode_init}
+ \synopsis
+ \begin{verbatim}
+ lnode_t *lnode_init(lnode_t *, void *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_init| function initializes the contents
+ of the specified list node object, assigning it the
+ data value specified as the second argument.
+ The first argument is a pointer which refers to
+ a data object that has a suitable size and alignment
+ for the representation of an \verb|lnode_t| type.
+ After initialization with \verb|lnode_init|, the object is subsequently
+ eligible as an operand to the functions of the List component.
+
+\subsubsection{The {\tt lnode_is_in_a_list} function}
+
+ \indexfunc{lnode_is_in_a_list}
+ \synopsis
+ \begin{verbatim}
+ int lnode_is_in_a_list(lnode_t *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_is_in_a_list| function determines whether the given node is
+ an occupant of some list. If the node is in a list, the function returns
+ the value $1$. If the node is not in any list, the return value is zero.
+
+\subsubsection{The {\tt lnode_pool_create} function}
+
+ \indexfunc{lnode_pool_create}
+ \synopsis
+ \begin{verbatim}
+ lnodepool_t *lnode_pool_create(listcount_t);\end{verbatim}
+
+ \constraints
+
+ The value of the argument shall not be zero.
+
+ \description
+
+ The \verb|lnode_pool_create| function dynamically allocates,
+ by means of the standard library function \verb|malloc|,
+ a node pool object containing the number of nodes specified
+ as the first argument. If not enough resources are available,
+ a null pointer is returned, otherwise a pointer to the
+ \verb|lnodepool_t| object is returned.
+
+\subsubsection{The {\tt lnode_pool_destroy} function}
+
+ \indexfunc{lnode_pool_destroy}
+ \synopsis
+ \begin{verbatim}
+ void lnode_pool_destroy(lnodepool_t *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_pool_destroy| function deallocates a
+ node pool that was allocated by \verb|lnode_pool_create|.
+ The value of any pointer which referred to the
+ node pool object becomes indeterminate.
+
+\subsubsection{The {\tt lnode_pool_init} function}
+
+ \indexfunc{lnode_pool_init}
+ \synopsis
+ \begin{verbatim}
+ lnodepool_t *lnode_pool_init(lnodepool_t *,
+ lnode_t *, listcount_t);\end{verbatim}
+
+ \constraints
+
+ The third argument, which specifies the node count, shall not be zero.
+
+ \description
+
+ The \verb|lnode_pool_init| function initializes a data object
+ that has a suitable size and alignment to represent an
+ \verb|lnodepool_t| type. A pointer to this object is passed
+ as the first argument. The node pool thus created draws nodes
+ from an array specified by the second argument, which shall be a pointer to
+ an object that can behave like an array of \verb|lnode_t| objects.
+ The third argument specifies the number of elements in this array.
+
+ After this function, the object pointed at by the \verb|lnodepool_t *|
+ argument is eligible for use with the node pool management functions
+ of the List component. Nodes may be drawn from the pool and returned to it.
+
+ As long as the pool continues to be used, the program should not directly
+ manipulate the node array. In particular, if the program modifies any
+ part of the array, then the behavior is undefined if the
+ \verb|lnodepool_t| object or any nodes drawn from it are subsequently
+ passed to a List function. The program shall not directly use the array
+ elements as independent \verb|lnode_t| objects while the array is
+ associated with the pool; in particular, it shall not pass these elements
+ to Kazlib functions that operate on \verb|lnode_t|.
+
+ The behavior is undefined if the same array is associated with more than
+ one node pool object, or if two node pool objects are given overlapping
+ arrays.
+
+ The node array is managed in a manner that is specific to the
+ implementation; the intent is that each element of the array represents a
+ distinct node object, a pointer to which can be returned in response to an
+ allocation request.
+
+ The \verb|lnode_pool_init| function returns a copy of the first argument.
+
+\subsubsection{The {\tt lnode_pool_isempty} function}
+
+ \indexfunc{lnode_pool_isempty}
+ \synopsis
+ \begin{verbatim}
+ int lnode_pool_isempty(lnodepool_t *);\end{verbatim}
+
+ \description
+
+ The \verb|lnode_pool_isempty| function tests the
+ specified \verb|lnodepool_t| object for ability to supply nodes.
+ If the object has been
+ subject to so many requests that it is no longer capable of
+ supplying additional list nodes, the value $1$ is returned.
+ Otherwise the value returned is zero.
+
+\subsubsection{The {\tt lnode_pool_isfrom} function}
+
+ \indexfunc{lnode_pool_isfrom}
+ \synopsis
+ \begin{verbatim}
+ int lnode_pool_isfrom(lnodepool_t *, lnode_t *);\end{verbatim}
+
+ \description
+
+ The function \verb|lnode_pool_isfrom|, intended to serve as a software
+ verification aid, determines whether a list node originates from
+ a particular node pool. The return value is $1$ if this relationship is
+ true, otherwise zero.
+
+\subsubsection{The {\tt lnode_put} function}
+
+ \indexfunc{lnode_put}
+ \synopsis
+ \begin{verbatim}
+ void lnode_put(lnode_t *, void *);\end{verbatim}
+
+ \description
+
+ The function \verb|lnode_put| replaces the data element
+ associated with the list node.
+
+\subsubsection{The {\tt lnode_return} function}
+
+ \indexfunc{lnode_return}
+ \synopsis
+ \begin{verbatim}
+ void lnode_return(lnodepool_t *, lnode_t *);\end{verbatim}
+
+ \constraints
+
+ The node pointed at by the second argument was derived by an allocation
+ request from the pool pointed at by the first argument.\footnote{In
+ other words, the {\tt lnode_pool_isfrom} function, were it called with
+ the same two arguments, would return $1$ if this constraint is met.}
+
+ Furthermore, the node must not be the occupant of a list.
+
+ \description
+
+ The \verb|lnode_return| function returns a node back to the node pool from
+ which it came. The node must not be subsequently used as an argument to any
+ List functions, until it happens to be allocated again. The pointer to
+ the node object remains valid, and may be returned by a subsequent
+ allocation request from the same node pool.
+
+\subsection{Implementation}
+\index{List!reference implementation}
+
+This section describes the elements of the reference implementation of the
+List component. No requirement is imposed that an implementation should
+follow the reference implementation. The same is true of the
+implementation notes for the other components.
+
+\subsubsection{Types}
+\index{implementation!List types}
+\index{typedefs!implementation of List}
+
+The reference List implementation is a doubly-linked circular list
+\index{sentinel node!of linked list}
+with a {\it sentinel node}. The node structure type is defined like this:
+\begin{verbatim}
+ typedef struct lnode_t {
+ struct lnode_t *list_next;
+ struct lnode_t *list_prev;
+ void *list_data;
+ } lnode_t;
+\end{verbatim}
+and the list structure is defined like this:
+\begin{verbatim}
+ typedef struct list_t {
+ lnode_t list_nilnode;
+ listcount_t list_nodecount;
+ listcount_t list_maxcount;
+ } list_t;
+\end{verbatim}
+The \verb|list_nilnode| member of the list object is the sentinel. It is
+always present in the list, never deleted. When the list is empty, the sentinel
+node's \verb|list_next| and \verb|list_prev| pointers simply point back at the sentinel
+node. The \verb|list_maxcount| member of the list tells how many nodes may be
+inserted and \verb|list_nodecount| keeps track of the actual count.
+
+The reason the sentinel node is called \verb|list_nilnode| is that it
+acts as the successor of a list's tail node, if there is one,
+and as the predecessor of the first node. In a linked list implementation
+that does not use a sentinel node, the \verb|list_next| pointer of
+the tail node and the \verb|list_prev| pointer of the first node would
+be null.
+
+Note that prefixed names are used for all of the structure members. This is so
+that the header file conforms to the documented namespace. If, for example, the
+\verb|list_nilnode| member were simply called \verb|nilnode|, then
+if the program contained somewhere a macro called \verb|nilnode|, there would
+be a potential clash. If the program defined \verb|nilnode| prior to including
+the \verb|list.h| header, the declaration of \verb|struct list_t| would
+be confounded. If the program defined \verb|nilnode| after
+including \verb|list.h|, the definition would interfere with \verb|list.h|
+macros whose replacement text refers to the \verb|nilnode| member.
+
+For programming convenience, the list implementation source file defines short
+macro names for the structure members:
+\begin{verbatim}
+ #define next list_next
+ #define prev list_prev
+ #define data list_data
+\end{verbatim}
+... and so forth. These names are private to the translation unit, which
+includes only standard ANSI C headers. Some of the examples in this section
+make use of the short names; it is assumed that these macros are in effect.
+
+\subsubsection{Selected operations}
+\index{implementation!List operations}
+
+\paragraph{Retrieving the first node}
+\index{List!first node}
+
+Given a pointer \verb|P| to a \verb|list_t| type, the \verb|list_first|
+function examines the value of \verb|P->nilnode.next| which points
+at the head node if the list is not empty. If the list is empty,
+then this expression points back at the sentinel node. In
+other words, the comparison
+\begin{verbatim}
+ P->nilnode.next == &P->nilnode
+\end{verbatim}
+yields true when the list is empty. In this case, the interface requires that
+a null pointer be returned by \verb|list_first|. The implementation actually
+uses the above test, though a test for \verb|P->nodecount| being equal to
+zero is also possible.
+
+In general, any operation which produces a pointer to the nilnode that must be
+returned back to the calling program must test for that case and return a null
+pointer instead to satisfy the interface requirements.
+
+\paragraph{Node deletion}
+\index{List!deletion}
+
+Thanks to the use of the sentinel node, the list deletion operation doesn't
+have to test for special cases. A node in the middle of the list is
+deleted in exactly the same way as the first or the last node:
+\begin{verbatim}
+ lnode_t *list_delete(list_t *list, lnode_t *del)
+ {
+ lnode_t *next = del->next;
+ lnode_t *prev = del->prev;
+
+ assert (list_contains(list, del));
+
+ prev->next = next;
+ next->prev = prev;
+ list->nodecount--;
+
+ del->next = del->prev = NULL;
+
+ return del;
+ }
+\end{verbatim}
+Quite simply, the successor and predecessor of the deleted node are connected
+together so that the deleted node is spliced out from the list. If the node is
+the last remaining one, then the sentinel node serves as both the successor and
+the predecessor. The effect of the deletion then is to set the sentinel's next
+and previous links to point to itself, as they did initially when the list was
+previously empty.
+
+The next and prev pointers are set to null not only for enhanced error checking
+in language implementations that trap dereferences of null pointers,
+but also to indicate that the node is not on any list. The interface
+function \verb|lnode_is_in_a_list| makes use of this.
+
+It's worth discussing in some detail why the values of expressions
+\verb|del->next| and \verb|del->prev| are cached in local variables. The
+actual statements that splice the node out of the list could instead have been
+written:
+\begin{verbatim}
+ del->prev->next = del->next;
+ del->next->prev = del->prev;
+\end{verbatim}
+However, this causes some compilers to generate less than optimal code because
+they fail to apply common subexpression elimination to the double
+occurrence of \verb|del->next|. Caching this expression in a local variable
+helps to get better code by making the semantics more obvious. In any case,
+modern compilers tend to do a good job of caching locals in high speed storage,
+particularly on architectures generously endowed with registers, so using a few
+extra locals is unlikely to lead to worse target code. The principle of using
+local variables to perform ``manual CSE'' is applied throughout the Kazlib
+reference implementation.
+
+\paragraph{Node insertion}
+Node insertion is also simple, thanks to the sentinel node which makes
+the doubly linked list circular. All insertions are done using
+the functions \verb|list_ins_before| and \verb|list_ins_after|.
+These are very similar, so it suffices to show \verb|list_ins_before|:
+\begin{verbatim}
+ void list_ins_before(list_t *list, lnode_t *new, lnode_t *this)
+ {
+ lnode_t *that = this->prev;
+
+ assert (new != NULL);
+ assert (!list_contains(list, new));
+ assert (!lnode_is_in_a_list(new));
+ assert (this == list_nil(list) || list_contains(list, this));
+ assert (list->nodecount + 1 > list->nodecount);
+
+ new->next = this;
+ new->prev = that;
+ that->next = new;
+ this->prev = new;
+ list->nodecount++;
+
+ assert (list->nodecount <= list->maxcount);
+ }
+\end{verbatim}
+The node \verb|this| is the one before which the new node is being
+inserted. Internally, the pointer \verb|that| points to the
+node after which the insertion takes place. In other words, the function
+inserts the node \verb|new| in between \verb|this| and \verb|that|.
+
+Note the copious assertions which verify all of the documented constraints:
+that the node is not already on the list, or any other list, that the reference
+node \verb|this| is in the list, that the list capacity won't be exceeded,
+and that the node count doesn't overflow its type.
+
+\index{List!insertion}
+
+\section{Hash component}
+
+The Hash component provides a means to manage collections of elements, called
+hashes, that are not ordered. Each element in the collection has a unique key,
+which is used for searching and inserting. The intent is that the
+implementation is based on extendible hashing, and the interface allows for
+user-defined hashing functions. The number of elements that can be stored
+in a hash is limited; the maximum number of entries in a hash is known as its
+{\it capacity}.
+
+\subsection{Interface}
+
+\subsubsection{The {\tt hash.h} header}
+
+Each C or C++ translation unit that is to use the functionality of the Hash
+component shall include the header \verb|hash.h|. This header shall
+contain declarations of types and external functions, and definitions of
+macros. The following typedef names shall be
+defined:\index{Hash!typedef names}
+\index{typedefs!defined by Hash}
+\begin{verbatim}
+ hash_t hashcount_t
+ hnode_t hash_val_t
+ hash_comp_t hnode_alloc_t
+ hscan_t hnode_free_t
+ hash_fun_t
+\end{verbatim}
+In addition, the following structure tags may be defined:\index{Hash!tag names}
+\index{tags!defined by Hash}
+\begin{verbatim}
+ struct hash_t
+ struct hnode_t
+ struct hscan_t
+\end{verbatim}
+The following external function names shall be declared:
+\index{Hash!function names}\index{functions!defined by Hash}
+\begin{verbatim}
+ hash_create hash_count
+ hash_set_allocator hash_size
+ hash_destroy hash_isfull
+ hash_free_nodes hash_isempty
+ hash_init hash_scan_begin
+ hash_insert hash_scan_next
+ hash_lookup hash_scan_delete
+ hash_delete hash_scan_delfree
+ hash_alloc_insert hash_verify
+ hash_delete_free hnode_create
+ hnode_put hnode_init
+ hnode_get hnode_destroy
+ hnode_getkey hash_free
+\end{verbatim}
+\index{Hash!external objects}
+In addition, the external object name
+\begin{verbatim}
+ hash_val_t_bit
+\end{verbatim}
+shall be declared. The following preprocessor symbols (macros) shall be
+defined: \index{Hash!macro names}\index{macros!defined by Hash}
+\indexmacro{HASHCOUNT_T_MAX}
+\indexmacro{HASH_VAL_T_BIT}
+\indexmacro{HASH_VAL_T_MAX}
+\indexmacro{HASH_H}
+\begin{verbatim}
+ HASHCOUNT_T_MAX
+ HASH_VAL_T_BIT
+ HASH_VAL_T_MAX
+ HASH_H\end{verbatim}
+\index{symbols!reserved by Hash}\index{Hash!reserved symbols}
+Macro identifiers which begin with the upper-case prefix \verb|HASH| are
+reserved for future extensions to the \verb|hash.h| header, as are
+names in the ordinary and tag namespaces which begin with \verb|hash_|,
+\verb|hnode_| or \verb|hscan_|. External names which begin with \verb|hash_|,
+\verb|hnode_| or \verb|hscan_| are reserved by the Kazlib library regardless of
+what headers are included.
+
+\subsubsection{The {\tt hash_t} type}
+
+\indextype{hash_t}
+The type \verb|hash_t| is an opaque data type which maintains information about
+the current state of a single hash. From the programmer's viewpoint, a hash
+consists of an instance of the \verb|hash_t| type, plus zero or more instances
+of the type \verb|hnode_t|. An instance of the \verb|hash_t| type can be
+dynamically created using the \verb|hash_create| function, and destroyed by the
+\verb|hash_destroy| function. Alternately, the program can declare an object
+of type \verb|hash_t| and have it initialized via the \verb|hash_init|
+function. When initializing a hash this way, the user must also provide
+a fixed-size array of \verb|hnode_t *| objects which serves as the hash table.
+\footnote{A hash initialized this way does not support extendible hashing,
+because there is no mechanism for growing the user-supplied array.}
+
+\subsubsection{The {\tt hnode_t} type}
+
+\indextype{hnode_t}
+The \verb|hnode_t| type is an opaque type that represents a single element
+that can be inserted into a hash. A hash node contains a reference to
+satellite data provided by the user. Nodes may be dynamically created by the
+\verb|hnode_create| function. Alternately, the program may supply an
+\verb|hnode_t| object that can be initialized by the \verb|hnode_init|
+function.
+
+\subsubsection{The {\tt hash_comp_t} type}
+
+\indextype{hash_comp_t}
+The \verb|hash_comp_t| type is a typedef name for the pointer-to-function type
+\begin{verbatim}
+ int (*)(const void *, const void *);
+\end{verbatim}
+In the context of the Hash component, this type denotes pointers to
+comparison functions.
+
+\subsubsection{The {\tt hscan_t} type}
+
+\indextype{hscan_t}
+The \verb|hscan_t| typedef stands for an opaque type which represents
+context information for traversing a hash. It is initialized by the
+\verb|hash_scan_begin| function, which specifies a hash to be
+traversed. Successive elements are retrieved using the \verb|hash_scan_next|
+function, which eventually indicates that no more elements
+remain. Inserting into, or deleting from, a hash other than by using
+the functions \verb|hash_scan_delete| or \verb|hash_scan_delfree| causes
+any \verb|hscan_t| objects that refer to it to become indeterminate.
+
+\subsubsection{The {\tt hashcount_t} type}
+
+\indextype{hashcount_t}
+\indexmacro{HASHCOUNT_T_MAX}
+This is an unsigned integral type which is capable of representing the number
+of nodes in a hash.
+The \verb|HASHCOUNT_T_MAX| macro expands to a
+constant expression of type \verb|hashcount_t| which specifies the maximum
+value of that type.
+
+\subsubsection{The {\tt hash_val_t} type}
+
+\indextype{hash_val_t}
+\indexmacro{HASH_VAL_T_MAX}
+The \verb|hash_val_t| type is an unsigned integral type capable of
+holding at least 32 bits. The purpose of this type is to represent the
+output values of hashing functions.
+The \verb|HASH_VAL_T_MAX| macro expands to a
+constant expression of type \verb|hash_val_t| which specifies the maximum
+value of that type.
+
+\subsubsection{The {\tt hnode_alloc_t} type}
+
+\index{Hash!allocator function}
+The \verb|hnode_alloc_t| identifier is a typedef name for the pointer-to-function
+type
+\begin{verbatim}
+ hnode_t *(*)(void *);
+\end{verbatim}
+In other words, a pointer to a function that takes a \verb|void *|
+argument and returns a pointer to \verb|hnode_t|.
+A function of this type which meets certain behavior criteria may be
+registered with a \verb|hash_t| object as node allocator, together
+with a compatible deallocator function. The \verb|void *| argument
+passes user-specified context information through to the
+allocator routines (see section \ref{section:hash_set_allocator}).
+
+\subsubsection{The {\tt hnode_free_t} type}
+
+\index{Hash!deallocator function}
+The \verb|hnode_free_t| identifier is a typedef name for the
+pointer-to-function type
+\begin{verbatim}
+ void (*)(hnode_t *, void *);
+\end{verbatim}
+A function of this type which meets certain behavior criteria may be
+registered with a \verb|hash_t| object as node deallocator
+together with a compatible allocator function.
+
+\subsubsection{The {\tt hash_fun_t} type}
+
+\index{hashing function}
+The \verb|hash_fun_t| identifier is a typedef name for the
+pointer-to-function type
+\begin{verbatim}
+ hash_val_t (*)(const void *);
+\end{verbatim}
+A function of this type which behaves a certain way is called
+a {\it hashing function}. To be a viable hashing function, such
+a function must take a pointer to a key object, and produce
+an integer value that depends only on the contents of the key,
+and possibly on information that does not change over the lifetime of any hash
+for which that hashing function is used. Additional requirements for hashing
+functions are introduced later.
+
+\subsubsection{The {\tt hash_val_t_bit} object}
+
+ \indexobject{hash_val_t_bit}
+ \synopsis
+ \begin{verbatim}
+ extern int hash_val_t_bit;\end{verbatim}
+
+ \description
+
+ The \verb|hash_val_t_bit| object of type int has a fixed value
+ which counts the number of bits in the \verb|hash_val_t| type.
+ The program shall not store a value into this object.
+
+ The value of \verb|hash_val_t_bit| need not be correct until the
+ first successful call to \verb|hash_create| or to \verb|hash_init|
+ completes.
+
+ The implementation shall provide the macro \verb|HASH_VAL_T_BIT| which
+ expands to a non-lvalue expression that has the same value and type as the
+ object, but which may be a constant expression.\footnote{The intent of
+ providing these values is to ease the implementation of portable hashing
+ functions that take advantage of all of the available bits of a given
+ Kazlib implementation. Alternately, hashing functions may be constructed to
+ only use the lower 32 bits of the type.}
+
+\subsubsection{The {\tt hash_create} function}
+
+ \indexfunc{hash_create}
+ \index{Hash!creation of}
+ \index{create!hash object}
+ \synopsis
+ \begin{verbatim}
+ hash_t *hash_create(hashcount_t, hash_comp_t, hash_fun_t);\end{verbatim}
+
+ \description
+
+ If sufficient resources exist, the \verb|hash_create| function instantiates
+ and initializes an object of type \verb|hash_t| and returns a pointer to
+ it. Otherwise it returns a null pointer.
+
+ The first argument establishes the capacity of the hash, which is
+ initially empty.
+
+ The second argument is a pointer to a comparison function that will be
+ associated with the \verb|hash_t| object for its entire duration.
+
+ \index{hashing function}
+ The third argument is either null or a pointer to a hashing function
+ that is permanently associated with the object. If it is null, a {\it default
+ hashing function\/} is assigned by the implementation.
+
+ The hashing function shall be invoked with an argument that is one
+ of the keys that are being inserted into, or sought after, in the
+ hash. The hashing function must produce the same value each time it
+ is called for a given key. It is up to the hash user to define the
+ representation of keys, to manage their storage, and to provide a matching
+ hashing function. The hash stores only generic \verb|void *| pointers to
+ keys.
+
+ The default hashing function assumes that keys are null terminated
+ strings. That is to say, it behaves as though its \verb|void *|
+ argument points to the first element of an array of \verb|unsigned|
+ \verb|char|, the last of which is a null character. The use of
+ the default hashing function with keys that do not have this representation
+ results in undefined behavior.
+
+\subsubsection{The {\tt hash_set_allocator} function}
+
+ \indexfunc{hash_set_allocator}
+ \label{section:hash_set_allocator}
+
+ \synopsis
+ \begin{verbatim}
+ void hash_set_allocator(hash_t *, hnode_alloc_t,
+ hnode_free_t, void *);\end{verbatim}
+
+ \constraints
+
+ The second and third arguments---the function pointers---shall either
+ both be null, or both be non-null. The hash pointed at by the first
+ argument shall be empty.
+
+ \description
+
+ When a hash is initialized, it is outfitted with a pair of default
+ node allocation functions. These functions may be replaced with functions
+ supplied by the program by calling the \verb|hash_set_allocator| function
+ and specifying two suitable pointers. If these pointers are null, the
+ default functions are restored.
+
+ These functions are called to allocate and free \verb|hnode_t|
+ objects by the functions \verb|hash_alloc_insert|
+ and \verb|hash_delete_free| (see sections
+ \ref{section:hash_delete_free} and \ref{section:hash_alloc_insert}).
+
+ If sufficient resources exist, the allocation function shall
+ return a pointer to a unique storage object that is large enough
+ and suitably aligned to represent an object of type \verb|hnode_t|.
+ Otherwise, the function shall return a null pointer.
+
+ The deallocation function shall be capable of disposing of the
+ objects created by the matching allocator function.
+
+
+\subsubsection{The {\tt hash_destroy} function}
+
+ \indexfunc{hash_destroy}
+ \synopsis
+ \begin{verbatim}
+ void hash_destroy(hash_t *);\end{verbatim}
+
+ \constraints
+
+ The hash pointed at by the first argument shall be empty.
+
+ \description
+
+ The \verb|hash_destroy| function deinitializes and deallocates a hash
+ that was created with \verb|hash_create|.
+ All pointers and \verb|hscan_t| objects that referred to the hash become
+ indeterminate.
+
+\subsubsection{The {\tt hash_free_nodes} function}
+
+ \indexfunc{hash_free_nodes}
+ \synopsis
+ \begin{verbatim}
+ void hash_free_nodes(hash_t *);\end{verbatim}
+
+ \description
+
+ The \verb|hash_free_nodes| function removes each node from
+ the hash and destroys it as if by calling \verb|hash_delete_free|
+ (Section \ref{section:hash_delete_free}). The order in which
+ the nodes are destroyed is unspecified.
+
+\subsubsection{The {\tt hash_free} function}
+
+ \indexfunc{hash_free}
+ \synopsis
+ \begin{verbatim}
+ void hash_free(hash_t *);\end{verbatim}
+
+ \description
+
+ Every node in the hash is removed from the hash and is then subject to the
+ deallocation function. The overall effect is as if the function
+ \verb|hash_delete_free| (Section \ref{section:hash_delete_free}) were
+ invoked on each node, and then \verb|hash_destroy| invoked on the
+ hash itself.
+
+ This function is obsolescent, and will be removed from some future revision
+ of this document.
+
+\subsubsection{The {\tt hash_init} function}
+
+ \indexfunc{hash_init}
+ \synopsis
+ \begin{verbatim}
+ hash_t *hash_init(hash_t *, hashcount_t, hash_comp_t,
+ hash_fun_t, hnode_t **, hashcount_t);
+ \end{verbatim}
+
+ \constraints
+
+ The last argument, which specifies the size of the program-supplied table,
+ shall be an integral power of two that is greater than one---that is to say, an
+ integer of the form $2^k$ where $k$ is a positive integer.
+
+ \description
+
+ The \verb|hash_init| function configures the specified \verb|hash_t| object
+ to use a specified array of \verb|hnode_t *| pointer objects as a table.
+ The user is responsible for providing storage for the \verb|hash_t|
+ object and the array. As in the \verb|hash_create| interface,
+ the second parameter specifies the capacity, and the subsequent
+ arguments specify the comparison and hashing function, respectively.
+ The last two arguments specify the table of pointers. The array object
+ shall have at least as many elements as indicated by the last parameter,
+ otherwise the behavior is undefined. The call to \verb|hash_init| is said
+ to register the array with the hash.
+
+ The program shall not register the same array with more than one hash.
+ More specifically, once the program modifies a registered array, or
+ registers it with another hash, it must discontinue use of the first hash.
+ \footnote{Note that no explicit deinitialization function is provided to
+ dissociate the array. A program disposes of a hash created by
+ {\tt hash_init} by discontinuing its use.}
+
+\subsubsection{The {\tt hash_insert} function}
+
+ \indexfunc{hash_insert}
+ \label{section:hash_insert}
+ \synopsis
+ \begin{verbatim}
+ void hash_insert(hash_t *, hnode_t *, const void *);\end{verbatim}
+
+ \constraints
+ The hash is not full. The key specified by the \verb|void *| parameter
+ does not already exist in the specified hash. The node specified
+ by the second parameter is not already inserted into a hash.
+
+ \description
+ The \verb|hash_insert| function adds a new node to a hash. The user
+ must supply a node object that was initialized with \verb|hnode_init|
+ or dynamically created with \verb|hnode_create|. If the node is
+ already inserted into the same hash or any other hash, the behavior
+ is undefined.
+
+ A program may modify a key or node that has been inserted into a hash, or
+ cause the storage of the key or the node to become invalid. However, any
+ subsequent use of the hash invokes undefined behavior, with the following
+ exception: the data pointer stored within a node may be modified using the
+ \verb|hnode_put| function.
+
+ The \verb|hash_insert| function invokes the hashing function callback with
+ the key pointer as the argument.
+
+ The \verb|hash_insert| function may need to acquire additional storage in
+ order to support hash table growth. If the storage allocation fails, the
+ function shall fully recover, and insert the node without growing the
+ table.
+
+ The Hash implementation shall not modify the storage referenced by a key,
+ and shall not access it other than indirectly through the supplied hashing
+ and comparison functions.
+
+\subsubsection{The {\tt hash_lookup} function}
+
+ \indexfunc{hash_lookup}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hash_lookup(hash_t *, const void *);\end{verbatim}
+
+ \description
+
+ The \verb|hash_lookup| function searches the given hash for a node
+ matching the given key. Unless the hash is empty, the key shall be
+ compared against one or more keys that are already in the hash,
+ using the comparison function. The key pointer may
+ be identical to one that has already been inserted into the
+ hash.\footnote{In that case, the comparison function must correctly
+ cope with aliased parameters.}
+
+ If the key is found in the hash, a pointer to the corresponding node
+ is returned.\footnote{The corresponding node is the one that was specified
+ in the call to {\tt hash_insert} together with the matching key.}
+
+ If the key is not found, a null pointer is returned.
+
+\subsubsection{The {\tt hash_delete} function}
+
+ \indexfunc{hash_delete}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hash_delete(hash_t *, hnode_t *);\end{verbatim}
+
+ \constraints
+ The specified node is an occupant of the given hash.
+
+ \description
+ The \verb|hash_delete| function removes from the given hash a
+ node that has previously been inserted into it. The key under
+ which the node was inserted is also removed from the hash.\footnote{Thus
+ the program may arbitrarily manipulate the removed key without destroying
+ the integrity of the hash.}
+
+ Any existing \verb|hscan_t| iterator which is associated with the
+ hash becomes indeterminate.\footnote{To delete the current node during hash
+ table traversal, the {\tt hash_scan_delete} function must be used
+ instead.}
+
+
+\subsubsection{The {\tt hash_alloc_insert} function}
+
+ \label{section:hash_alloc_insert}
+ \indexfunc{hash_alloc_insert}
+
+ \synopsis
+ \begin{verbatim}
+ int hash_alloc_insert(hash_t *, const void *, void *);\end{verbatim}
+
+ \constraints
+
+ The second argument specifies the insertion key. The hash shall not
+ already contain this key.
+
+ \description
+
+ The \verb|hash_alloc_insert| function dynamically allocates and
+ initializes a \verb|hnode_t| object and inserts it into the
+ given hash. The second and third arguments are pointers to the key
+ and user data objects, respectively, either of which may be null.
+
+ The allocation is performed by a call to the default allocation
+ function, or to the function that was configured using
+ \verb|hash_set_allocator| (Section \ref{section:hash_set_allocator}).
+
+ If the allocation succeeds, the insertion is performed and
+ the value 1 is returned. If the allocation fails, no insertion is
+ performed and 0 is returned.
+
+\subsubsection{The {\tt hash_delete_free} function}
+
+ \label{section:hash_delete_free}
+ \indexfunc{hash_delete_free}
+
+ \synopsis
+ \begin{verbatim}
+ void hash_delete_free(hash_t *, hnode_t *);
+ \end{verbatim}
+
+ \constraints
+ The given node can be found within the given hash.
+
+ \description
+ The \verb|hash_delete_free| function is the reverse of
+ \verb|hash_alloc_insert|. It removes the given node from the
+ hash as if by a call to \verb|hash_delete| and then deletes it using the
+ default or user-defined allocator (Section
+ \ref{section:hash_set_allocator}). If the given node had not been created
+ using \verb|hash_alloc_insert|, the behavior is undefined.
+
+\subsubsection{The {\tt hnode_put} function}
+
+ \indexfunc{hnode_put}
+ \synopsis
+ \begin{verbatim}
+ void hnode_put(hnode_t *, void *);\end{verbatim}
+
+ \description
+ The function \verb|hnode_put| replaces the data element
+ associated with the hash node.
+
+\subsubsection{The {\tt hnode_get} function}
+
+ \indexfunc{hnode_get}
+ \synopsis
+ \begin{verbatim}
+ void *hnode_get(hnode_t *);\end{verbatim}
+
+ \description
+ The \verb|hnode_get| function retrieves the \verb|void *| data value
+ associated with the given hash node.
+
+\subsubsection{The {\tt hnode_getkey} function}
+
+ \indexfunc{hnode_getkey}
+ \synopsis
+ \begin{verbatim}
+ const void *hnode_getkey(hnode_t *);\end{verbatim}
+
+ \description
+
+ The \verb|hnode_getkey| function retrieves the \verb|void *| key value
+ associated with the given node. A node acquires an associated key
+ when it is inserted into a hash (see section \ref{section:hash_insert}).
+ Invoking \verb|hnode_getkey| on a node that has not been inserted
+ into a hash results in undefined behavior.
+
+\subsubsection{The {\tt hash_count} function}
+
+ \indexfunc{hash_count}
+ \synopsis
+ \begin{verbatim}
+ hashcount_t hash_count(hash_t *);\end{verbatim}
+
+ \description
+ The \verb|hash_count| function returns a value which represents the number
+ of nodes currently stored in the hash pointed at by the argument.
+
+\subsubsection{The {\tt hash_size} function}
+
+ \indexfunc{hash_size}
+ \synopsis
+ \begin{verbatim}
+ hashcount_t hash_size(hash_t *hash);\end{verbatim}
+
+ \description
+ The \verb|hash_size| function returns an implementation-defined value that
+ depends on the number of entries in the given hash. The intent is that the
+ value represent the size of the internal hash table managed by the given
+ hash.
+
+\subsubsection{The {\tt hash_isfull} function}
+
+ \indexfunc{hash_isfull}
+ \synopsis
+ \begin{verbatim}
+ int hash_isfull(hash_t *);\end{verbatim}
+
+ \description
+ The \verb|hash_isfull| function returns 1 if the hash is full,
+ otherwise it returns 0.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+
+\subsubsection{The {\tt hash_isempty} function}
+
+ \indexfunc{hash_isempty}
+ \synopsis
+ \begin{verbatim}
+ int hash_isempty(hash_t *);\end{verbatim}
+
+ \description
+ The \verb|hash_isempty| function returns 1 if the given hash is empty,
+ otherwise it returns 0.
+
+\subsubsection{The {\tt hash_scan_begin} function}
+
+ \indexfunc{hash_scan_begin}
+ \synopsis
+ \begin{verbatim}
+ void hash_scan_begin(hscan_t *, hash_t *);\end{verbatim}
+
+ \description
+ The \verb|hash_scan_begin| function initializes the \verb|hscan_t| iterator
+ object, preparing it for a traversal of the given hash.
+
+ After this initialization, if the hash is modified in any way by
+ the performance of an insertion or deletion operation, the
+ value of the \verb|hscan_t| object becomes indeterminate,
+ with one exception: the \verb|hash_scan_delete| function or the
+ \verb|hash_scan_delfree| function may be used to delete the current
+ node.
+
+\subsubsection{The {\tt hash_scan_next} function}
+
+ \indexfunc{hash_scan_next}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hash_scan_next(hscan_t *);\end{verbatim}
+
+ \description
+ If any unvisited nodes remain, the \verb|hash_scan_next| function advances
+ to the next one and returns a pointer to it. Otherwise, it returns a null
+ pointer. Repeated invocations of \verb|hash_scan_next| return a pointer to
+ every node that has been inserted into the table, in no particular order,
+ such that no node is reported twice.
+
+\subsubsection{The {\tt hash_scan_delete} function}
+
+ \indexfunc{hash_scan_delete}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hash_scan_delete(hash_t *, hnode_t *);
+ \end{verbatim}
+
+ \constraints
+ The specified node is an occupant of the given hash.
+
+ \description
+ This function is almost exactly like \verb|hash_delete| except that it may
+ be used to delete a node that has been most recently obtained from
+ \verb|hash_scan_next| without destroying the validity of the \verb|hscan_t|
+ iterator from which the node was obtained.
+
+\subsubsection{The {\tt hash_scan_delfree} function}
+
+ \label{section:hash_scan_delfree}
+ \indexfunc{hash_scan_delfree}
+
+ \synopsis
+ \begin{verbatim}
+ void hash_scan_delfree(hash_t *, hnode_t *);
+ \end{verbatim}
+
+ \constraints
+ The given node can be found within the given hash.
+
+ \description
+ The \verb|hash_scan_delfree| function is similar to
+ \verb|hash_delete_free|. It removes the given node from the
+ hash and then deletes it using the default or user-defined allocator
+ (Section \ref{section:hash_set_allocator}). If the given node
+ had not been created using \verb|hash_alloc_insert|, the behavior
+ is undefined.
+
+ The deletion from the hash is performed as if by a call to
+ \verb|hash_scan_delete|, thus it is safe to delete a node that
+ was most recently obtained from a \verb|hash_scan_next| without
+ destroying the validity of the \verb|hscan_t| iterator.
+
+\subsubsection{The {\tt hash_verify} function}
+
+ \indexfunc{hash_verify}
+ \synopsis
+ \begin{verbatim}
+ int hash_verify(hash_t *hash);\end{verbatim}
+
+ \description
+ The intent of the \verb|hash_verify| function is to perform a verification
+ on the hash object, regardless of whether the Kazlib implementation is
+ operated in verification or production mode. If the hash object
+ and its constituent nodes have been correctly manipulated, and the
+ program has not caused any undefined behaviors, the value $1$ is returned.
+ Otherwise, the function may be able to, but is not guaranteed to, detect
+ corruption, and return the value zero.
+
+\subsubsection{The {\tt hnode_create} function}
+
+ \indexfunc{hnode_create}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hnode_create(void *);\end{verbatim}
+
+ \description
+ The \verb|hnode_create| function dynamically allocates a hash node,
+ stores in it the data value specified in the argument and
+ returns a pointer to it. The allocation is performed by a call to the
+ standard \verb|malloc| function. If the allocation fails, a null
+ pointer is returned.
+
+ The node's key pointer remains indeterminate until it is the subject of a
+ \verb|hash_insert| operation.
+
+\subsubsection{The {\tt hnode_init} function}
+
+ \indexfunc{hnode_init}
+ \synopsis
+ \begin{verbatim}
+ hnode_t *hnode_init(hnode_t *, void *);\end{verbatim}
+
+ \description
+ The \verb|hnode_init| function initializes the contents
+ of the specified hash node object, assigning it the
+ data value specified as the second argument.
+ The first argument is a pointer which refers to
+ a data object that has a suitable size and alignment
+ for the representation of an \verb|hnode_t| type.
+ After initialization with \verb|hnode_init|, the object is subsequently
+ eligible as an operand to the functions of the hash component,
+ other than \verb|hnode_getkey|.
+
+ The node's key pointer remains indeterminate until it is the subject of a
+ \verb|hash_insert| operation.
+
+\subsubsection{The {\tt hnode_destroy} function}
+
+ \indexfunc{hnode_destroy}
+ \synopsis
+ \begin{verbatim}
+ void hnode_destroy(hnode_t *);\end{verbatim}
+
+ \description
+ The \verb|hnode_destroy| function destroys a hash node that has been
+ allocated with the \verb|hnode_create| function. The value of any pointer
+ that referred to the node that was thus freed is indeterminate.
+
+ If the node is currently the occupant of a hash, the behavior is undefined
+ if the hash is subsequently used.
+
+\subsection{Implementation}
+
+TODO
+
+\section{Dictionary component}
+
+\index{Dictionary}
+The Dictionary component provides a means to manage ordered sequences of
+elements, having the following properties:
+\begin{enumerate}
+\item If the dictionary is not empty, a first and last element can be identified.
+ In a dictionary having only one element, that one element is both the first and
+ last element.
+\item Each element that is not the last element has another element as its
+ {\it successor}.
+ \index{successor!of a dictionary element}
+ \index{Dictionary!successor of an element}
+\item Each element that is not the first element has a {\it predecessor}.
+ \index{predecessor!of a dictionary element}
+ \index{Dictionary!predecessor of an element}
+\item No element is the predecessor or successor of more than one element.
+\item If one element is the successor of another, the other is necessarily the
+ predecessor of the first.
+\item Each element is associated with a piece of information known as
+ the key. The sequence is ordered according to the relation imposed
+ by the comparison function: the key of an element compares
+ greater than or equal to the key of its predecessor.
+\item If duplicate keys are present, then elements
+ having the same key form a subsequence with no other keys in it, which
+ follows from the previous property. No additional ordering is imposed
+ within such subsequences.
+\item Each element is associated with arbitrary satellite data.
+\end{enumerate}
+
+The Dictionary component supports efficient operations over such ordered
+sequences: such as insertion, deletion, ordered traversal, as well as exact and
+range searches.\footnote{The implicit association of keys and satellite data,
+together with the ability to efficiently search by key to retrieve data, gives
+rise to the term {\it dictionary}. A dictionary need not be ordered; a hash can
+therefore also be considered to be a kind of dictionary; the Kazlib
+nomenclature is somewhat unfortunate in that regard.}
+
+The number of elements that can be stored in a dictionary is limited; the maximum
+number of entries in a dictionary is known as its {\it capacity}.
+
+\subsection{Interface}
+
+\subsubsection{The {\tt dict.h} header}
+
+Each C or C++ translation unit that is to use the functionality of the Dict
+component shall include the header \verb|dict.h|. This header shall
+contain declarations of types and external functions, and definitions of
+macros. The following typedef names shall be
+defined:\index{Dict!typedef names}
+\index{typedefs!defined by Dict}
+\begin{verbatim}
+ dict_t dnode_process_t
+ dnode_t dnode_alloc_t
+ dictcount_t dnode_free_t
+ dict_comp_t dict_load_t
+\end{verbatim}
+In addition, the following structure tags may be defined:\index{Dict!tag names}
+\index{tags!defined by Dict}
+\begin{verbatim}
+ struct dict_t
+ struct dnode_t
+\end{verbatim}
+The following external function names shall be declared:
+\index{Dict!function names}\index{functions!defined by Dict}
+\begin{verbatim}
+ dict_create dict_count
+ dict_set_allocator dict_isempty
+ dict_destroy dict_isfull
+ dict_free_nodes dict_contains
+ dict_init dict_allow_dupes
+ dict_verify dnode_is_in_a_dict
+ dict_lookup dnode_create
+ dict_lower_bound dnode_init
+ dict_upper_bound dnode_destroy
+ dict_insert dnode_get
+ dict_delete dnode_getkey
+ dict_alloc_insert dnode_put
+ dict_delete_free dict_process
+ dict_first dict_load_begin
+ dict_last dict_load_next
+ dict_next dict_load_end
+ dict_prev dict_free
+\end{verbatim}
+The following preprocessor symbols shall be
+defined: \index{Dict!macro names}\index{macros!defined by Dict}
+\indexmacro{DICTCOUNT_T_MAX}
+\indexmacro{DICT_H}
+\begin{verbatim}
+ DICTCOUNT_T_MAX
+ DICT_H\end{verbatim}
+\index{symbols!reserved by Dict}\index{Dict!reserved symbols}
+Macro identifiers which begin with the upper-case prefix \verb|DICT| are
+reserved for future extensions to the \verb|dict.h| header, as are
+names in the ordinary and tag namespaces which begin with \verb|dict_|
+or \verb|dnode_|. External names which begin with \verb|dict_|
+or \verb|dnode_| are reserved by the Kazlib library regardless of
+what headers are included.
+
+\subsubsection{The {\tt dict_t} type}
+
+\indextype{dict_t}
+The type \verb|dict_t| is an opaque data type which represents a single
+dictionary. A dictionary consists of an instance of the \verb|dict_t| type,
+plus zero or more instances of the type \verb|dnode_t|. An object of type
+\verb|dict_t| can be initialized by the \verb|dict_init| function. Alternately,
+the \verb|dict_create| function will dynamically allocate and initialize a
+dictionary. An empty dictionary created by \verb|dict_create| may be disposed
+of using \verb|dict_destroy|.
+
+\subsubsection{The {\tt dnode_t} type}
+
+\indextype{dnode_t}
+The \verb|dnode_t| type represents a single entry in a dictionary called a
+dictionary node. The object stores a pointer to user data, and a key pointer
+that is assigned to the dictionary node at the time when it is inserted into
+the dictionary. A \verb|dnode_t| may be dynamically created using
+\verb|dnode_create| and destroyed using \verb|dnode_destroy|. Alternately,
+the program may supply storage for a \verb|dnode_t| object and initialize
+it using the \verb|dnode_init| function.
+
+\subsubsection{The {\tt dictcount_t} type}
+
+\indextype{dictcount_t}
+\indexmacro{DICTCOUNT_T_MAX}
+This is an unsigned integral type which is capable of representing the number
+of nodes in a dictionary. The \verb|DICTCOUNT_T_MAX| macro expands to a
+constant expression of type \verb|dictcount_t| which specifies the maximum
+value of that type.
+
+\subsubsection{The {\tt dict_comp_t} type}
+
+\indextype{dict_comp_t}
+The \verb|dict_comp_t| type is a typedef name for the pointer-to-function type
+\begin{verbatim}
+ int (*)(const void *, const void *);
+\end{verbatim}
+In the context of the Dictionary component, this type denotes pointers to
+comparison functions.
+
+\subsubsection{The {\tt dnode_process_t} type}
+
+\indextype{dnode_process_t}
+The type \verb|dnode_process_t| is a typedef name for the pointer-to-function type
+\begin{verbatim}
+ void (*)(dict_t *, dnode_t *, void *);
+\end{verbatim}
+In the context of the Dictionary component, this is the type of a
+dictionary node processing function (See section \ref{section:dict_process}).
+The first two parameters identify a dictionary and the node within that
+dictionary that is being processed. The third argument is a context pointer.
+
+\subsubsection{The {\tt dnode_alloc_t} type}
+
+\indextype{dnode_alloc_t}
+The type \verb|dnode_alloc_t| is a typedef name for the pointer-to-function type
+\begin{verbatim}
+ dnode_t *(*)(void *);
+\end{verbatim}
+A function compatible with this type which meets certain other criteria may be
+registered with a \verb|dict_t| object as a node allocator function
+(See section \ref{section:dict_set_allocator}).
+
+\subsubsection{The {\tt dnode_free_t} type}
+
+\indextype{dnode_free_t}
+The type \verb|dnode_free_t| is a typedef name for the pointer-to-function type
+\begin{verbatim}
+ void (*)(dnode_t *, void *);
+\end{verbatim}
+A function compatible with this type which meets certain other criteria may be
+registered with a \verb|dict_t| object as a node deallocator function.
+(See section \ref{section:dict_set_allocator}).
+
+\subsubsection{The {\tt dict_load_t} type}
+
+\indextype{dict_load_t}
+
+The \verb|dict_load_t| type is opaque, and represents a context structure
+used during the process of constructing a dictionary from an ordered list
+of nodes. (See sections \ref{section:dict_load_begin} to
+\ref{section:dict_load_end}).
+
+\subsubsection{The {\tt dict_create} function}
+
+ \indexfunc{dict_create}
+ \index{Dictionary!creation of}
+ \index{create!dictionary object}
+
+ \synopsis
+ \begin{verbatim}
+ dict_t *dict_create(dictcount_t, dict_comp_t);\end{verbatim}
+
+ \description
+ The \verb|dict_create| function allocates a new
+ object of type \verb|dict_t| and initializes it to act as
+ a dictionary.
+
+ If insufficient resources exist for the allocation,
+ a null pointer is returned, otherwise a pointer to the dictionary
+ is returned.
+
+ The first argument specifies the capacity of the dictionary,
+ which is initially empty.
+
+ The second argument is a comparison function that is used for comparing
+ keys during insertion and searching operations, and is associated
+ with the dictionary for its entire duration.
+
+\subsubsection{The {\tt dict_set_allocator} function}
+
+ \label{section:dict_set_allocator}
+ \indexfunc{dict_set_allocator}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_set_allocator(dict_t *, dnode_alloc_t,
+ dnode_free_t, void *);\end{verbatim}
+
+ \constraints
+
+ The second and third arguments---the function pointers---shall either
+ both be null, or both be non-null. The dictionary pointed at by the first
+ argument shall be empty.
+
+ \description
+
+ When a dictionary is initialized, it is outfitted with a pair of default
+ node allocation functions. These functions may be replaced with functions
+ supplied by the program by calling the \verb|dict_set_allocator| function
+ and specifying two suitable pointers. If these pointers are null, the
+ default functions are restored.
+
+ These functions are called to allocate and free \verb|dnode_t|
+ objects by the functions \verb|dict_alloc_insert|
+ and \verb|dict_delete_free| (see sections
+ \ref{section:dict_alloc_insert} and \ref{section:dict_delete_free}).
+
+ If sufficient resources exist, the allocation function shall
+ return a pointer to a unique storage object that is large enough
+ and suitably aligned to represent an object of type \verb|dnode_t|.
+ Otherwise, the function shall return a null pointer.
+
+ The deallocation function shall be capable of disposing of the
+ objects created by the matching allocator function.
+
+\subsubsection{The {\tt dict_destroy} function}
+
+ \indexfunc{dict_destroy}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_destroy(dict_t *);\end{verbatim}
+
+ \constraints
+
+ The dictionary pointed at by the first argument shall be empty.
+
+ \description
+
+ The \verb|dict_destroy| function deinitializes and deallocates a dictionary
+ object that was created by \verb|dict_create|. All pointers that
+ referred to the dictionary become indeterminate.
+
+\subsubsection{The {\tt dict_free_nodes} function}
+
+ \indexfunc{dict_free_nodes}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_free_nodes(dict_t *);\end{verbatim}
+
+ \description
+
+ Every node in the dictionary is removed from the dictionary and is then
+ subject to the deallocation function, as if the function
+ \verb|dict_delete_free| (Section \ref{section:dict_delete_free}) were
+ invoked on each node, in some unspecified order.
+
+\subsubsection{The {\tt dict_free} function}
+
+ \indexfunc{dict_free}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_free(dict_t *);\end{verbatim}
+
+ \description
+
+ This function is obsolescent, and will be removed from some future revision
+ of this document. It is equivalent to \verb|dict_free_nodes|.
+
+\subsubsection{The {\tt dict_init} function}
+
+ \indexfunc{dict_init}
+
+ \synopsis
+ \begin{verbatim}
+ dict_t *dict_init(dict_t *, dictcount_t, dict_comp_t);\end{verbatim}
+
+ \description
+
+ The \verb|dict_init| function prepares the specified \verb|dict_t| object
+ to behave as a dictionary that may subsequently be used with the other
+ dictionary functions.
+
+ The first argument points to the \verb|dict_t| object to be initialized.
+ The second argument specifies the capacity of the dictionary. The third
+ argument is a pointer to the comparison function which shall be associated
+ with the dictionary for its entire duration.
+
+\subsubsection{The {\tt dict_verify} function}
+
+ \indexfunc{dict_verify}
+
+ \synopsis
+ \begin{verbatim}
+ int dict_verify(dict_t *);\end{verbatim}
+
+ \description
+
+ The intent of the \verb|dict_verify| function is to perform a verification
+ on the dictionary object, regardless of whether the Kazlib implementation
+ is operated in verification or production mode. If the dictionary object
+ and its constituent nodes have been correctly manipulated, and the program
+ has not caused any undefined behaviors, the value $1$ is returned.
+ Otherwise, the function may be able to, but is not guaranteed to, detect
+ corruption, and return the value zero.
+
+\subsubsection{The {\tt dict_lookup} function}
+
+ \indexfunc{dict_lookup}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_lookup(dict_t *, const void *);\end{verbatim}
+
+ \description
+ The \verb|dict_lookup| function searches the given dictionary for a node
+ matching the given key. Unless the dictionary is empty, the key shall be
+ compared against one or more keys that are already in the dictionary, using
+ the comparison function. The key pointer may be identical to one that has
+ already been inserted into the dictionary.
+
+ If the key is found in the dictionary, a pointer to the corresponding node
+ is returned.
+
+ If the key is not found, a null pointer is returned.
+
+ If the dictionary contains more than one key which matches the search
+ key, then the first key in the subsequence of duplicate keys is returned.
+
+\subsubsection{The {\tt dict_lower_bound} function}
+
+ \indexfunc{dict_lower_bound}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_lower_bound(dict_t *, const void *);\end{verbatim}
+
+ \description
+
+ The \verb|dict_lower_bound| function searches the dictionary in a manner
+ similar to \verb|dict_lookup|.
+
+ If the given key exists in the dictionary, the behavior is exactly the same
+ as \verb|dict_lookup|.
+
+ However, if the key is not found, then the node which has the smallest key
+ that is greater than the search key is returned. If no such key exists
+ (because the search key is higher than any other key in the dictionary
+ or the dictionary is empty) then a null pointer is returned.
+
+ \example
+ Suppose that pointer \verb|d| refers to a dictionary whose registered
+ comparison function performs lexicographic comparisons on ordinary
+ C strings, similar to \verb|strcmp|. To iterate over all keys that
+ begin with the letter \verb|d|, the following idiom can be used:
+ \begin{verbatim}
+ dict_t *d;
+ dnode_t *n, *start, *end;
+ /*...*/
+ start = dict_lower_bound(d, "d");
+ end = dict_lower_bound(d, "e");
+ for (n = start; n != end; n = dict_next(d, n)) {
+ /* n points to each node in turn whose
+ key starts with 'd' */
+ }
+ \end{verbatim}
+ Note that if the dictionary is empty, or has keys which are all lower
+ than \verb|"d"|, then both \verb|start| and \verb|end| shall be null
+ pointers, and the loop body will never execute since the two are equal.
+ Also note that if there are keys that begin with \verb|d| and the
+ dictionary's last node has a key that starts with \verb|d|, then \verb|end|
+ is null, otherwise \verb|end| points to the first key that doesn't begin
+ with \verb|d|. In both cases, the loop will terminate after processing the
+ last \verb|d| key, because \verb|dict_next| shall produce a pointer that is
+ equal to \verb|end|.
+
+\subsubsection{The {\tt dict_upper_bound} function}
+
+ \indexfunc{dict_upper_bound}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_upper_bound(dict_t *, const void *);\end{verbatim}
+
+ \description
+
+ The \verb|dict_upper_bound| function searches the dictionary in a manner
+ similar to \verb|dict_lookup|.
+
+ If the given key exists in the dictionary, the behavior is exactly the same
+ as \verb|dict_lookup| with one difference:
+ If the dictionary contains more than one key which matches the search
+ key, then the last key in the sequence of duplicates is returned,
+ rather than the first.
+
+ However, if the key is not found, then the node which has the greatest key
+ that is lower than the search key is returned. If no such key exists
+ (because the search key is lower than any other key in the dictionary
+ or the dictionary is empty) then a null pointer is returned.
+
+ \example
+ The following idiom can be used to iterate over a sequence of duplicate
+ keys without the overhead of performing a full comparison before each
+ iteration to detect the first non-matching key.
+ \begin{verbatim}
+ dict_t *d;
+ void *key;
+ dnode_t *n, *start, *end;
+
+ /* ... Initialize d, and key. ...*/
+ start = dict_lower_bound(d, key);
+ end = dict_upper_bound(d, key);
+
+ /* advance end to first non-matching key */
+ if (end != 0)
+ end = dict_next(d, end);
+ else
+ end = start; /* start == dict_first(d) in this case */
+
+ for (n = start; n != end; n = dict_next(d, n)) {
+ /* n points to duplicate keys in turn */
+ }
+ \end{verbatim}
+ Immediately prior to the execution of the if statement, exactly one of the
+ following conditions is true:
+ \begin{itemize}
+ \item The key was found in the dictionary; \verb|start| points to the
+ first duplicate node and \verb|end| points to the last.
+ \item The dictionary has only higher keys than the search key; \verb|start|
+ points to the first node in the dictionary and \verb|end| is null.
+ \item The dictionary has only lower keys than the search key; \verb|end|
+ points to the last node in the dictionary, and \verb|start| is null.
+ \item The dictionary has both lower and higher keys; \verb|end| and \verb|start|
+ point to two consecutive nodes, respectively, such that the node
+ pointed at by \verb|end| has a lower key than the search key and
+ the node pointed at by \verb|start| has a higher key.
+ \item The dictionary is empty; \verb|start| and \verb|end| are null.
+ \end{itemize}
+ The if statement ensures that if the dictionary contains no matching
+ keys, then \verb|start| and \verb|end| are equal, and if the dictionary
+ contains one or more matching keys, then \verb|end| points to the first
+ non-matching node, or is null if there is no such node. Thus the loop
+ performs correctly in all circumstances.
+
+\subsubsection{The {\tt dict_insert} function}
+
+ \label{section:dict_insert}
+ \indexfunc{dict_insert}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_insert(dict_t *, dnode_t *, const void *);\end{verbatim}
+
+ \constraints
+ The dictionary is not full. If the dictionary has not been configured
+ to allow duplicate keys, the key specified by the \verb|void *| parameter
+ does not already exist in the dictionary.
+
+ \description
+ The \verb|dict_insert| function adds a new node to a dictionary. The user
+ must supply a node object that was initialized with \verb|dnode_init| or
+ dynamically created with \verb|dnode_create|. If the node is already
+ inserted into the same dictionary or any other dictionary, the behavior is
+ undefined.
+
+ Duplicate keys may be inserted into a dictionary only if the dictionary
+ has been configured to permit duplicate keys (see section
+ \ref{section:dict_allow_dupes}). If this is the case, it is also
+ permissible to insert the same key more than once: the implementation shall
+ not distinguish between distinct keys that are declared equal by a
+ correctly designed comparison function, and two key pointers that refer to
+ the same key.
+
+ A program may modify a key or node that has been inserted into a
+ dictionary, or cause the storage of the key or the node to become invalid.
+ However, any subsequent use of the dictionary invokes undefined behavior, with
+ the following exception: the data pointer stored within a node may be
+ modified using the \verb|dnode_put| function.
+
+ The Dictionary implementation shall not modify the storage referenced by a
+ key, and shall not access it other than indirectly through the supplied
+ comparison function.
+
+\subsubsection{The {\tt dict_delete} function}
+
+ \indexfunc{dict_delete}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_delete(dict_t *, dnode_t *);\end{verbatim}
+
+ \constraints
+ The specified node is an occupant of the given dictionary.
+
+ \description
+ The \verb|dict_delete| function removes from the given dictionary a
+ node that has previously been inserted into it. The key under
+ which the node was inserted is also removed from the dictionary.
+
+\subsubsection{The {\tt dict_alloc_insert} function}
+
+ \label{section:dict_alloc_insert}
+ \indexfunc{dict_alloc_insert}
+
+ \synopsis
+ \begin{verbatim}
+ int dict_alloc_insert(dict_t *, const void *, void *);\end{verbatim}
+
+ \constraints
+
+ The second argument specifies the insertion key. The dictionary shall not
+ already contain this key unless it has been configured as allowing
+ duplicates.
+
+ \description
+
+ The \verb|dict_alloc_insert| function dynamically allocates and
+ initializes a \verb|dnode_t| object and inserts it into the
+ given dictionary. The second and third arguments are pointers to the
+ key and user data objects, respectively, either of which may be null.
+
+ The allocation is performed by a call to the default allocation
+ function, or to the function that was configured using
+ \verb|dict_set_allocator| (Section \ref{section:dict_set_allocator}).
+
+ If the allocation succeeds, the insertion is performed and
+ the value 1 is returned. If the allocation fails, no insertion is
+ performed and 0 is returned.
+
+\subsubsection{The {\tt dict_delete_free} function}
+
+ \label{section:dict_delete_free}
+ \indexfunc{dict_delete_free}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_delete_free(dict_t *, dnode_t *);\end{verbatim}
+
+ \constraints
+ The given node can be found within the given dictionary.
+
+ \description
+ The \verb|dict_delete_free| function is the reverse of
+ \verb|dict_alloc_insert|. It removes the given node from the
+ dictionary and then deletes it using the default or user-defined allocator
+ (Section \ref{section:dict_set_allocator}). If the given node
+ had not been created using \verb|dict_alloc_insert|, the behavior
+ is undefined.
+
+\subsubsection{The {\tt dict_first} function}
+
+ \indexfunc{dict_first}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_first(dict_t *);\end{verbatim}
+
+ \description
+ If the dictionary pointed at by the argument is empty, a null pointer
+ is returned. Otherwise, a pointer to the first node in that dictionary is
+ returned.
+
+\subsubsection{The {\tt dict_last} function}
+
+ \indexfunc{dict_last}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_last(dict_t *);\end{verbatim}
+
+ \description
+ If the dictionary pointed at by the argument is empty, a null pointer
+ is returned. Otherwise, a pointer to the last node in that dictionary is
+ returned.
+
+
+\subsubsection{The {\tt dict_next} function}
+
+ \indexfunc{dict_next}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_next(dict_t *, dnode_t *);\end{verbatim}
+
+ \constraints
+ The node pointed at by the second argument is an occupant of the dictionary
+ pointed at by the first argument.
+
+ \description
+ If the node pointed at by the second argument has a successor, a pointer to
+ that successor is returned. Otherwise, a null pointer is returned.
+
+ \example
+ The \verb|dict_first| and \verb|dict_next| functions can be used together
+ to iterate over all of the elements of the dictionary, as in the following
+ idiom:
+ \begin{verbatim}
+ dict_t *d;
+ dnode_t *n;
+ /*...*/
+ for (n = dict_first(d); n != 0; n = dict_next(d, n)) {
+ /* n points to each node in turn */
+ }
+ \end{verbatim}
+
+\subsubsection{The {\tt dict_prev} function}
+
+ \indexfunc{dict_prev}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dict_prev(dict_t *, dnode_t *);\end{verbatim}
+
+ \constraints
+ The node pointed at by the second argument is an occupant of the dictionary
+ pointed at by the first argument.
+
+ \description
+ If the node pointed at by the second argument has a predecessor, a pointer
+ to that predecessor is returned. Otherwise, a null pointer is returned.
+
+\subsubsection{The {\tt dict_count} function}
+
+ \indexfunc{dict_count}
+
+ \synopsis
+ \begin{verbatim}
+ dictcount_t dict_count(dict_t *);\end{verbatim}
+
+ \description
+ The \verb|dict_count| function returns a value which represents the number
+ of nodes currently stored in the dictionary pointed at by the argument.
+
+\subsubsection{The {\tt dict_isempty} function}
+
+ \indexfunc{dict_isempty}
+
+ \synopsis
+ \begin{verbatim}
+ int dict_isempty(dict_t *);\end{verbatim}
+
+ \description
+ The \verb|dict_isempty| function returns 1 if the given dictionary is
+ empty, otherwise it returns 0.
+
+\subsubsection{The {\tt dict_isfull} function}
+
+ \indexfunc{dict_isfull}
+
+ \synopsis
+ \begin{verbatim}
+ int dict_isfull(dict_t *);\end{verbatim}
+
+ \description
+ The \verb|dict_isfull| function returns 1 if the dictionary is full,
+ otherwise it returns 0.
+
+ If the argument is an expression with side effects, the behavior is
+ undefined.\index{macros!and side effects}
+
+\subsubsection{The {\tt dict_contains} function}
+
+ \indexfunc{dict_contains}
+
+ \synopsis
+ \begin{verbatim}
+ int dict_contains(dict_t *, dnode_t *);\end{verbatim}
+
+ \description
+ The \verb|dict_contains| function searches the given dictionary to
+ determine whether the given node is an occupant. If the node is found, 1 is
+ returned, otherwise 0 is returned.\footnote{The intent is to support
+ verification. The search may be inefficient compared to {\tt
+ dict_lookup}.}
+
+\subsubsection{The {\tt dict_allow_dupes} function}
+
+ \label{section:dict_allow_dupes}
+ \indexfunc{dict_allow_dupes}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_allow_dupes(dict_t *);\end{verbatim}
+
+ \constraints
+ The dictionary specified by the first argument shall be empty.
+
+ \description
+ The \verb|dict_allow_dupes| function configures the given dictionary to
+ support duplicate keys. This can only be done when the dictionary is empty,
+ and the change cannot be reverted.
+
+\subsubsection{The {\tt dnode_is_in_a_dict} function}
+
+ \indexfunc{dnode_is_in_a_dict}
+
+ \synopsis
+ \begin{verbatim}
+ int dnode_is_in_a_dict(dnode_t *);\end{verbatim}
+
+ \description
+ The \verb|dnode_is_in_a_dict| function reports whether the given node
+ is currently the occupant of some dictionary. If so, 1 is returned.
+ Otherwise 0 is returned.
+
+\subsubsection{The {\tt dnode_create} function}
+
+ \indexfunc{dnode_create}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dnode_create(void *);\end{verbatim}
+
+ \description
+ The \verb|dnode_create| function dynamically allocates a dictionary node,
+ stores in it the data value specified in the argument and
+ returns a pointer to it. The allocation is performed by a call to the
+ standard \verb|malloc| function. If the allocation fails, a null
+ pointer is returned.
+
+ The node's key pointer remains indeterminate until it is the subject of a
+ \verb|dict_insert| operation.
+
+\subsubsection{The {\tt dnode_init} function}
+
+ \indexfunc{dnode_init}
+
+ \synopsis
+ \begin{verbatim}
+ dnode_t *dnode_init(dnode_t *, void *);\end{verbatim}
+
+ \description
+ The \verb|dnode_init| function initializes the contents
+ of the specified dictionary node object, assigning it the
+ data value specified as the second argument.
+ The first argument is a pointer which refers to
+ a data object that has a suitable size and alignment
+ for the representation of a \verb|dnode_t| type.
+ After initialization with \verb|dnode_init|, the object is subsequently
+ eligible as an operand to the functions of the dictionary component,
+ other than \verb|dnode_getkey|.
+
+ The node's key pointer remains indeterminate until it is the subject of a
+ \verb|dict_insert| operation.
+
+\subsubsection{The {\tt dnode_destroy} function}
+
+ \indexfunc{dnode_destroy}
+
+ \synopsis
+ \begin{verbatim}
+ void dnode_destroy(dnode_t *);\end{verbatim}
+
+ \description
+ The \verb|dnode_destroy| function destroys a dictionary node that has been
+ allocated with \verb|dnode_create|. The value of any pointer
+ that referred to the node that was thus freed is indeterminate.
+
+ If the node is currently the occupant of a dictionary, the behavior is
+ undefined if the dictionary is subsequently used.
+
+\subsubsection{The {\tt dnode_get} function}
+
+ \indexfunc{dnode_get}
+
+ \synopsis
+ \begin{verbatim}
+ void *dnode_get(dnode_t *);\end{verbatim}
+
+ \description
+ The \verb|dnode_get| function retrieves the \verb|void * | data value
+ associated with the given dictionary node.
+
+\subsubsection{The {\tt dnode_getkey} function}
+
+ \indexfunc{dnode_getkey}
+
+ \synopsis
+ \begin{verbatim}
+ const void *dnode_getkey(dnode_t *);\end{verbatim}
+
+ \description
+
+ The \verb|dnode_getkey| function retrieves the \verb|void *| key value
+ associated with the given node. A node acquires an associated key
+ when it is inserted into a dictionary (see section \ref{section:dict_insert}).
+ Invoking \verb|dnode_getkey| on a node that has not been inserted
+ into a dictionary results in undefined behavior.
+
+\subsubsection{The {\tt dnode_put} function}
+
+ \indexfunc{dnode_put}
+
+ \synopsis
+ \begin{verbatim}
+ void dnode_put(dnode_t *, void *);\end{verbatim}
+
+ \description
+ The function \verb|dnode_put| replaces the data element
+ associated with the dictionary node.
+
+\subsubsection{The {\tt dict_process} function}
+
+ \label{section:dict_process}
+ \indexfunc{dict_process}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_process(dict_t *, void *, dnode_process_t);\end{verbatim}
+
+ \description
+ The \verb|dict_process| function iterates over the nodes of a dict,
+ and for each node invokes a callback function.\footnote{In most cases,
+ it is more convenient and preferable to
+ iterate over the dict using explicit calls to {\tt dict_first}
+ and {\tt dict_next}.}
+ The second argument is a {\it context pointer\/} which can have any value.
+ The third argument of
+ \verb|dict_process| shall be a pointer to a function which is compatible
+ with the specified type. If the dict contains one or more nodes,
+ then the function is invoked once for each node, in order from first
+ to last. On each invocation, the first argument of the callback is a
+ pointer to the dict; the second argument is a pointer to a node, called
+ the {\it subject node}; and the third argument repeats the context pointer
+ value that was originally passed to \verb|dict_process|.
+
+ The callback function may delete the subject node by, for instance, calling
+ \verb|dict_delete|. It may insert new nodes into the dictionary;
+ however, if such an insertion causes the subject node to acquire
+ a new successor, it is implementation-defined whether upon returning
+ from the callback function, the traversal shall continue with the
+ new successor, or with the original successor.
+
+ The callback function, and any function invoked from the callback
+ function, shall not destroy the dictionary or make any modifications
+ other than the insertion of new nodes, or the deletion of the
+ subject node.
+
+ The callback function may recursively invoke \verb|dict_process| for the
+ same dictionary or for a different dictionary; the callback invocations arising out of
+ the nested call inherit all of the restrictions of the outer callback in
+ addition to being subject to the usual restrictions.\footnote{This means,
+ for instance, that if two callbacks are in progress for different
+ subject nodes from the same dictionary, the inner callback may not delete
+ its subject node, because it inherits the restriction that the only
+ permitted deletion is the outer callback's subject node.}
+
+ The callback function may freely operate on a different dictionary,
+ subject to any inherited restrictions.
+
+\subsubsection{The {\tt dict_load_begin} function}
+
+ \label{section:dict_load_begin}
+ \indexfunc{dict_load_begin}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_load_begin(dict_load_t *, dict_t *);\end{verbatim}
+
+ \constraints
+ The dictionary specified by the second argument is empty.
+
+ \description
+ The \verb|dict_load_begin| function prepares a context object
+ for the task of constructing the contents of a dictionary out of
+ a sequence of elements which is already sorted according to the
+ sorting function of the dictionary.\footnote{This process is more efficient
+ than inserting all of the elements into a dictionary using {\tt dict_insert}.
+ In the reference implementation, this process runs in linear time, or $O(n)$
+ whereas construction by repeated insertions runs in $O(n\log n)$ time.}
+ The actual construction is performed
+ by zero or more calls to \verb|dict_load_next| and is finalized by
+ \verb|dict_load_end|.
+
+ The \verb|dict_load_begin| function is said to bind the dictionary
+ and context object together; the only way to unbind the two
+ is by calling \verb|dict_load_end| on the context object.
+
+ The program shall not manipulate a dictionary that is bound to
+ a context object, other than by calling \verb|dict_load_next|.
+
+ The program shall not attempt to bind a dictionary to more than one context
+ object simultaneously, or a context object to more than one dictionary
+ simultaneously.
+
+\subsubsection{The {\tt dict_load_next} function}
+
+ \label{section:dict_load_next}
+ \indexfunc{dict_load_next}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_load_next(dict_load_t *, dnode_t *, const void *);\end{verbatim}
+
+ \constraints
+ The node pointed at by the second argument is not an occupant of
+ any dictionary. The key specified by the third argument is greater
+ than or equal to all keys specified in previous calls to
+ \verb|dict_load_next| in the context of the same construction,
+ according to the comparison function of the dictionary that is
+ being constructed. That is to say, successive calls specify monotonically
+ increasing keys.
+ The dictionary is not full.
+
+ \description
+ The \verb|dict_load_next| function continues the construction of a
+ dictionary from an ordered list of elements by specifying the next
+ node in the sequence, along with its key. After this call, the node
+ is considered to be inserted into the dictionary as if by
+ \verb|dict_insert|.
+
+\subsubsection{The {\tt dict_load_end} function}
+
+ \label{section:dict_load_end}
+ \indexfunc{dict_load_end}
+
+ \synopsis
+ \begin{verbatim}
+ void dict_load_end(dict_load_t *);\end{verbatim}
+
+ \description
+ The \verb|dict_load_end| function finalizes the construction of
+ a dictionary from an ordered sequence. It breaks the binding between
+ the \verb|dict_load_t| context object and the dictionary.
+
+\subsection{Implementation}
+
+TODO
+
+\section{Exception component}
+\label{section:exception_component}
+\index{Exception}
+
+The Exception component provides distributed error handling in the form of
+exceptions, behind an interface designed to be implementable using only the
+portable features of standard C. The features of this interface are:
+\begin{itemize}
+\item the ability to set up nested try-catch regions which declare specific
+exceptions that they can handle;
+\item grouped exceptions, allowing handlers to catch specific exceptions,
+or any exception within a group;
+\item the ability to designate a function that is called in the event
+that an exception is thrown that has no handler;
+\item a mechanism for releasing resources acquired by code that is terminated
+by an exception;
+\item the ability to pass dynamically allocated data from the throw site to the
+catch site.
+\end{itemize}
+
+An exception is simply a means of returning to a prior place in the program's
+execution. The ANSI C language provides crude, but portable, exception handling
+consisting of the \verb|jmp_buf| type, the \verb|setjmp| macro and the
+\verb|longjmp| function. The Kazlib Exception component can be implemented in
+terms of these primitives. The constraint to implementability in standard C
+leads to a number of concessions:
+\begin{itemize}
+\item A program can leave cleanup regions and try-catch regions by improper
+means, such as using \verb|goto|, \verb|return| or \verb|break|. This is
+difficult to diagnose, and is simply documented as undefined behavior.
+There is no support in the standard language for designating code that is
+executed whenever a statement block terminates by any means.
+\item For the same reason, the exception handling interface described here
+has an explicit mechanism for deallocation of resources associated with
+statement blocks that are terminated by exceptions. This interface is
+not as convenient as language support for automatic cleanup. Correct
+management of temporary dynamic resources using this interface requires
+programmer discipline.
+\item The requirement to be able to use \verb|setjmp| to save a context
+to be later returned to during exception processing brings in restrictions
+related to non-volatile objects. If non-volatile objects are modified
+between the time an exception handling region is initiated and the time
+an exception is caught in the region, these objects have indeterminate
+values.\footnote{This liberty in ANSI C allows compiler
+or library writers to implement {\tt setjmp} as a simple mechanism that
+takes a snapshot of the machine context. Objects that are optimized into
+special storage---such as registers---and whose values change since the
+context saving operation will be clobbered when the context is restored
+by {\tt longjmp}.}
+\end{itemize}
+
+\subsection{Interface}
+
+\subsubsection{The {\tt except.h} header}
+
+Each C or C++ translation unit that is to use the functionality of the Exception
+component shall include the header \verb|except.h|. This header shall
+contain declarations of types and external functions, and definitions of
+macros. The following typedef names shall be
+defined:\index{Exception!typedef names}
+\begin{verbatim}
+ except_id_t
+ except_t
+\end{verbatim}
+The following external function names shall be declared:
+\index{Exception!function names}\index{functions!defined by Exception}
+\begin{verbatim}
+ except_init except_group
+ except_deinit except_message
+ except_rethrow except_data
+ except_throw except_take_data
+ except_throwd except_set_allocator
+ except_throwf except_alloc
+ except_unhandled_catcher except_free
+ except_code
+\end{verbatim}
+The following preprocessor symbols shall be
+defined: \index{Exception!macro names}\index{macros!defined by Exception}
+\indexmacro{XCEPT_H}
+\begin{verbatim}
+ XCEPT_H except_cleanup_pop
+ XCEPT_GROUP_ANY except_checked_cleanup_pop
+ XCEPT_CODE_ANY except_try_push
+ XCEPT_BAD_ALLOC except_try_pop
+ except_cleanup_push
+\end{verbatim}
+Finally, these two enum constants are defined:
+\begin{verbatim}
+ except_no_call
+ except_call
+\end{verbatim}
+\index{symbols!reserved by Exception}\index{Exception!reserved symbols} Macro
+identifiers which begin with the upper-case prefix \verb|XCEPT|\footnote{The
+prefix {\tt XCEPT} is used rather than {\tt EXCEPT} because ISO 9899 reserves
+preprocessor symbols beginning with {\tt E} followed by a digit or
+capital letter for future extensions to the {\tt <errno.h>} header.}
+are reserved for future extensions to the \verb|except.h|
+header, as are names in the ordinary and tag namespaces which begin with
+\verb|except_|. External names which begin with \verb|except_| are reserved by
+the Kazlib library regardless of what headers are included.
+
+\subsubsection{The {\tt except_id_t} type}
+
+\label{section:except_id_t}
+\indextype{except_id_t}
+\indexmacro{XCEPT_GROUP_ANY}
+\indexmacro{XCEPT_CODE_ANY}
+The type \verb|except_id_t| is an aggregate consisting of two unsigned long
+values which represent an {\it exception group\/} and {\it exception code},
+respectively, in that order.\footnote{Thus, the program may initialize
+an {\tt except_id_t} object using two brace-enclosed initializers which
+specify the group and code.} An exception group is a value which identifies a
+group of related exceptions. An exception code is a value which identifies a
+specific exception uniquely within a group. The codes are assigned by the
+program designer. The Exception component reserves only the group and code
+values of zero, which, when used to specify a catch, match any value.
+
+The preprocessor symbols \verb|XCEPT_GROUP_ANY| and
+\verb|XCEPT_CODE_ANY| each expand to a constant integral expression having the
+value zero. These symbols are intended, in a catch specification, to clearly
+convey that any exception or any group is being caught.
+
+The preprocessor symbol \verb|XCEPT_BAD_ALLOC| expands to an integral constant
+expression having the value 1. This symbol is intended to represent the
+standard exception group for failed memory allocations.
+(See section \ref{section:except_throwf}).
+
+The exception groups from 1 to 15 are reserved for implementation use.
+
+\subsubsection{The {\tt except_t} type}
+
+\indextype{except_t}
+An object of type \verb|except_t| keeps track of all of the information that is
+passed when an exception is thrown, and is known as an {\it exception
+descriptor}. The type is opaque, hence the program shall manipulate this type
+using only the interface functions provided.
+
+\subsubsection{The {\tt except_init} function}
+
+ \indexfunc{except_init}
+
+ \synopsis
+ \begin{verbatim}
+ int except_init(void);\end{verbatim}
+
+ \description
+ The \verb|except_init| function allocates resources needed by the
+ Exception component. Before using any of the other exception interface
+ functions or macros, the program shall perform at least one successful call
+ to \verb|except_init|.
+
+ If the initialization succeeds, \verb|except_init| returns 1. Otherwise
+ it returns 0.
+
+ The \verb|except_init| function may be called more than once. After a
+ successful call, every subsequent call shall be successful up to an
+ implementation-defined maximum number of repetitions, which shall be at least
+ as large as the \verb|INT_MAX| from \verb|limits.h|. \footnote{
+ The intent is to support, but not enforce, a style of global initialization
+ whereby each module which requires the use of another module calls its
+ initialization function from its own initialization function. Only the
+ first such call performs the initialization of the module; subsequent calls
+ merely increment a counter. During deinitialization, the counter is
+ decremented and cleanup takes place when the counter reaches zero.}
+
+\subsubsection{The {\tt except_deinit} function}
+
+ \indexfunc{except_deinit}
+
+ \synopsis
+ \begin{verbatim}
+ void except_deinit(void);\end{verbatim}
+
+ \description
+ The \verb|except_deinit| function releases the resources
+ that were allocated by \verb|except_init|.
+
+ For the resource deallocation to actually take place, the
+ \verb|except_deinit| must be called as many times as the
+ number of times \verb|except_init| was successfully called.
+
+ If \verb|except_deinit| is called more times than \verb|except_init| is
+ successfully called, the behavior is undefined.
+
+\subsubsection{The {\tt except_rethrow} function}
+
+ \indexfunc{except_rethrow}
+
+ \synopsis
+ \begin{verbatim}
+ void except_rethrow(except_t *);\end{verbatim}
+
+ \description
+ The rethrow function is used to rethrow a caught exception. The argument
+ shall not be null. An exception shall not be rethrown from outside of the
+ {\it try-catch region\/} in which it was caught. An exception shall not be
+ rethrown from a try-catch region other than the one in which it was caught.
+ It shall not be rethrown from a try-catch or cleanup region enclosed within
+ the one in which it was caught.
+
+ When an exception is rethrown, the search for a handler does not begin with
+ the region in which the exception was caught. Instead, this region is
+ terminated, and the search continues with the enclosing one, if one
+ exists.
+
+\subsubsection{The {\tt except_throw} function}
+
+ \indexfunc{except_throw}
+
+ \synopsis
+ \begin{verbatim}
+ void except_throw(long, long, const char *);\end{verbatim}
+
+ \constraints
+ The first two arguments specify the exception group and code,
+ respectively. Neither of these arguments shall be zero.
+
+ \description
+ The \verb|except_throw| function causes an exception to be thrown.
+
+ If the throw takes place in a try-catch region where an exception
+ was just caught, this original exception is considered handled. In
+ this case, the new exception is still eligible for handling by the
+ same try-catch region.
+
+ The third argument points to the first character of a string
+ which becomes the {\it exception message}. Because the throwing of
+ the exception may cause the current statement block to terminate,
+ this string data shall be non-local. It may be a string literal, since the
+ implementation shall not modify the message, or it may be an ordinary
+ object of static duration. If it is dynamic data, it becomes the handler's
+ responsibility to extract the message from the caught exception and
+ free the data.\footnote{The programmer should consider using
+ {\tt except_throwd} to pass arbitrary dynamic data from the throw
+ site to the try-catch region.}
+
+ The \verb|except_throw| function does not return. The implementation
+ searches for a suitable try-catch region starting with the one
+ initiated by the most recent \verb|except_try_push|. If there
+ is no enclosing region, the search fails. Otherwise if a match is found,
+ execution continues at the start of the target try-catch region, appearing
+ to be a second return from \verb|except_try_push| distinguished by a non-null
+ value of the \verb|except_t *| object.
+
+ If no match is found during exception processing, the exception is
+ handled internally by the implementation. The implementation then
+ calls the currently registered function for catching unhandled
+ exceptions (see section \ref{section:except_unhandled_catcher}).
+
+ The default catcher for unhandled exceptions shall terminate the program
+ with a diagnostic which identifies the code, group and exception message.
+
+ During the search for an exception handler, cleanup handlers may be
+ encountered. They are removed from the inside out and called with
+ their registered arguments. This process is called {\it unwinding}.
+ \index{unwinding}
+
+\subsubsection{The {\tt except_throwd} function}
+
+ \indexfunc{except_throwd}
+
+ \synopsis
+ \begin{verbatim}
+ void except_throwd(long, long, const char *, void *);\end{verbatim}
+
+ \constraints
+ The first two arguments specify the exception group and code,
+ respectively. Neither of these arguments shall be zero.
+
+ \description
+ The \verb|except_throwd| function is the same as \verb|except_throw| in
+ every respect except that it has an additional \verb|void *| parameter. A
+ null argument may be used for this parameter, or it may be any valid
+ pointer value.
+
+ When the exception is handled, and the handler does not remove this pointer
+ using \verb|except_take_data| then the implementation shall automatically
+ invoke the function \verb|except_free| on this pointer.
+
+\subsubsection{The {\tt except_throwf} function}
+
+ \indexfunc{except_throwf}
+ \label{section:except_throwf}
+
+ \synopsis
+ \begin{verbatim}
+ void except_throwf(long, long, const char *, ...);\end{verbatim}
+
+ \constraints
+ The first two arguments specify the exception group and code,
+ respectively. Neither of these arguments shall be zero.
+
+ \description
+
+ This function is almost exactly the same as \verb|except_throw|
+ except that the exception message is not directly specified.
+ Instead, the \verb|char *| argument specifies a format string which may be
+ followed by trailing arguments. The format string and trailing arguments
+ are interpreted as the format string and arguments of the standard C
+ function \verb|printf| and are subject to the same requirements.
+
+ The format string is interpreted, and the results of formatting are placed into
+ a buffer provided by the implementation. The implementation shall provide
+ space for at least 1024 bytes of storage for the result of the formatting,
+ including the null terminator byte. If the formatting requires more space
+ than the implementation provides, the behavior is undefined.
+
+ The results of the formatted print shall become the exception message
+ of the thrown exception.
+
+ If the implementation is unable to allocate resources for the formatted
+ message, it shall throw an exception having an unspecified code in
+ group \verb|XCEPT_BAD_ALLOC| with an implementation-defined message.
+ (See section \ref{section:except_id_t}).
+
+\subsubsection{The {\tt except_unhandled_catcher} function}
+
+ \label{section:except_unhandled_catcher}
+ \indexfunc{except_unhandled_catcher}
+
+ \synopsis
+ \begin{verbatim}
+ void (*except_unhandled_catcher(void (*)(except_t *)))
+ (except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_unhandled_catcher| function installs a new
+ function for catching unhandled exceptions. The argument is a
+ pointer to a catching function that returns nothing, and accepts a pointer
+ of type \verb|except_t *|. A pointer to the previously installed
+ catching function is returned. If the program did not previously
+ install a catching function, then a pointer to the default catching
+ function is returned. The program may retain this pointer and
+ use it to reinstall the default function.
+
+ A function for catching unhandled exceptions should not return. If it
+ returns, the implementation shall terminate the program with a diagnostic.
+
+\subsubsection{The {\tt except_code} function}
+
+ \indexfunc{except_code}
+
+ \synopsis
+ \begin{verbatim}
+ unsigned long except_code(except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_code| is an accessor function which returns the
+ exception code of the given exception descriptor.
+
+\subsubsection{The {\tt except_group} function}
+
+ \indexfunc{except_group}
+
+ \synopsis
+ \begin{verbatim}
+ unsigned long except_group(except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_group| is an accessor function which returns the
+ exception group of the given exception descriptor.
+
+\subsubsection{The {\tt except_message} function}
+
+ \indexfunc{except_message}
+
+ \synopsis
+ \begin{verbatim}
+ const char *except_message(except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_message| is an accessor function which returns
+ a pointer to the string of text that was specified when the
+ exception was thrown (the exception message).
+
+\subsubsection{The {\tt except_data} function}
+
+ \indexfunc{except_data}
+
+ \synopsis
+ \begin{verbatim}
+ void *except_data(except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_data| function returns the data pointer that
+ was specified in the \verb|except_throwd| call.
+ If the exception was not thrown by \verb|except_throwd|
+ the return value is unspecified.
+
+
+\subsubsection{The {\tt except_take_data} function}
+
+ \indexfunc{except_take_data}
+
+ \synopsis
+ \begin{verbatim}
+ void *except_take_data(except_t *);\end{verbatim}
+
+ \description
+ The \verb|except_take_data| returns the data pointer that
+ was specified in the \verb|except_throwd| call, and
+ updates the exception descriptor so that the pointer is
+ set to null.
+
+ If the exception was not thrown by \verb|except_throwd|
+ the result is unspecified.
+
+\subsubsection{The {\tt except_cleanup_push} macro}
+
+ \indexmacro{except_cleanup_push}
+
+ \synopsis
+ \begin{verbatim}
+ void except_cleanup_push(void (*)(void *), void *);\end{verbatim}
+
+ \description
+ The call to \verb|except_cleanup_push| shall be matched with a call to
+ \verb|except_cleanup_pop| which must occur in the same statement block at
+ the same level of nesting.\footnote{This requirement allows an implementation
+ to provide an {\tt except_cleanup_push} macro which opens up a statement
+ block and a {\tt except_cleanup_pop} which closes the statement block.
+ The space for the registered pointers can then be efficiently allocated
+ from automatic storage.}
+
+ The \verb|except_cleanup_push| macro registers a cleanup handler that will
+ be called if an exception subsequently occurs before the matching
+ \verb|except_cleanup_pop| is executed, and is not intercepted and handled by
+ a try-catch region that is nested between the two.
+
+ The first argument to \verb|except_cleanup_push| is a pointer
+ to the cleanup handler, a function that returns nothing and takes
+ a single argument of type \verb|void *|. The second argument
+ is a \verb|void *| value that is registered along with the handler.
+ This value is what is passed to the registered handler, should it
+ be called.
+
+ Cleanup handlers are called in the reverse order of their nesting: inner
+ handlers are called before outer handlers.
+
+ The program shall not leave the cleanup region between the call to the macro
+ \verb|except_cleanup_push| and the matching call to
+ \verb|except_cleanup_pop| by means other than throwing an exception, or
+ calling \verb|except_cleanup_pop|.
+
+ Within the call to the cleanup handler, it is possible that new exceptions
+ may happen. Such exceptions must be handled before the cleanup handler
+ terminates. If the call to the cleanup handler is terminated by an
+ exception, the behavior is undefined.\footnote{The exception which triggered
+ the cleanup is not yet caught; thus the program would be effectively trying
+ to replace an exception with one that isn't in a well-defined state.}
+
+\subsubsection{The {\tt except_cleanup_pop} macro}
+
+ \indexmacro{except_cleanup_pop}
+ \label{section:except_cleanup_pop}
+
+ \synopsis
+ \begin{verbatim}
+ void except_cleanup_pop(int);\end{verbatim}
+
+ \description
+ A call to the \verb|except_cleanup_pop| macro shall match each
+ call to \verb|except_cleanup_push| which shall be in the
+ same statement block at the same nesting level. It shall
+ match the most recent such call that is not matched
+ by a previous \verb|except_cleanup_pop| at the same level.
+
+ This macro causes the registered cleanup handler to be removed. If, and
+ only if the argument is other than zero, the cleanup handler is called.
+ In that case, the registered context pointer is passed to the cleanup
+ handler.
+
+ \indexenum{except_no_call}
+ \indexenum{except_call}
+ The enumeration constants \verb|except_no_call| and \verb|except_call|
+ may be used as arguments to this macro instead of
+ the equivalent constants \verb|0| and \verb|1|.
+
+ The program shall not leave the region between the call to the macro
+ \verb|except_cleanup_push| and the matching call to
+ \verb|except_cleanup_pop| other than by throwing an exception, or
+ by executing the \verb|except_cleanup_pop|.
+
+\subsubsection{The {\tt except_checked_cleanup_pop} macro}
+
+ \indexmacro{except_checked_cleanup_pop}
+
+ \synopsis
+ \begin{verbatim}
+ void except_checked_cleanup_pop(void (*)(void *), int);\end{verbatim}
+
+ \constraints
+ The first pointer-to-function argument shall match the pointer value that
+ was registered by the matching \verb|except_cleanup_push| macro.
+
+ \description
+ The \verb|except_checked_cleanup_pop| macro may be used as an alternative to
+ \verb|except_cleanup_pop|. In verification mode, the constraint serves to
+ provide additional safety by making an explicit declaration regarding which
+ handler is being called (or ignored, as the case may be).
+
+ The program shall not leave the region between the call to the macro
+ \verb|except_cleanup_push| and the call to
+ \verb|except_checked_cleanup_pop| by means other than throwing an
+ exception, or executing the latter macro.
+
+\subsubsection{The {\tt except_try_push} macro}
+
+ \indexmacro{except_try_push}
+ \label{section:except_try_push}
+
+ \synopsis
+ \begin{verbatim}
+ void except_try_push(const except_id_t [],
+ size_t, except_t **);\end{verbatim}
+
+ \description
+ The \verb|except_try_push| marks the beginning of a try-catch region
+ of the program. It must be matched by a \verb|except_try_pop| written in
+ the same statement block at the same level of nesting, which
+ terminates the try-catch region. Regions may be nested.
+
+ The program shall not leave a try-catch region other than by throwing
+ an exception or by executing the \verb|except_try_pop|.\footnote{Thus,
+ leaving the try-catch region using {\tt goto}, {\tt return},
+ {\tt break} or {\tt continue} leads to undefined behavior.}
+
+ The first argument is a pointer to the first element of an array of
+ \verb|except_id_t| objects, the number of elements of which is specified by
+ the second argument. The array specifies which exceptions are caught.
+ The implementation shall treat this array as read-only.\footnote{Thus,
+ the program may allocate the array in static storage.}
+
+ The third argument of \verb|except_try_push| shall point to an object
+ of type \verb|except_t *|. After the call to \verb|except_try_push|,
+ the program shall inspect the value of this object. A null value indicates
+ that no exception has been thrown. A non-null value indicates that an
+ exception was thrown, and is now caught. In other words, when an exception
+ is caught by a try-catch region, then control passes from the throw site
+ back to the first statement after the \verb|except_try_push| statement of
+ the try-catch region. This case is distinguished from an ordinary return by
+ the non-null value of the pointer object that was specified by the third
+ argument of the earlier call to \verb|except_try_push|.
+
+ An exception is considered handled if it is caught in a try-catch region
+ which subsequently terminates by executing its \verb|except_try_pop| or by
+ throwing another exception. When an exception is considered handled, any
+ dynamic data that was associated with that exception is
+ freed.\footnote{Dynamic data may be explicitly associated with an exception
+ using {\tt except_throwd}. Other types of throw may associate unspecified
+ dynamic data.} It's possible for more than one exception to be active
+ at once. During the processing of one exception, a try-catch region
+ which catches the exception may execute a nested try-catch region
+ in which independent exception processing takes place. Provided that
+ no exception escapes from the inner try-catch region, the original
+ exception remains pending. But if an exception escapes from the inner
+ region, it causes the original exception to be handled.\footnote{Thus, a
+ given try-catch region cannot catch multiple exceptions concurrently.}
+
+ The caught exception may be rethrown by calling \verb|except_rethrow|,
+ specifying the value of the caught exception descriptor as the
+ argument. Rethrowing a caught exception causes the innermost try-catch
+ region to terminate, but the exception is not considered handled. The
+ search for a handler continues with the next enclosing region.
+
+ Throwing a new exception during the handling of a caught exception may
+ cause the {\it same\/} try-catch region to catch that exception; the
+ try-catch region is not terminated until it is determined that it doesn't
+ catch the new exception.
+
+ Each entry in the array of \verb|except_id_t| objects specifies what
+ exceptions are caught by the try-catch region. When an exception is
+ thrown, the implementation searches for the inner-most try-catch region
+ which has at least one match for the thrown exception in its catch
+ specification array.
+
+ A match occurs when a specification exactly matches the group and code of
+ the thrown exception. If a catch specification is for group 0, then it
+ matches any group. If a catch specification is for code 0, then it matches
+ any exception code. A catch specification of group 0 and code 0 catches all
+ exceptions.
+
+ Non-volatile automatic variables that are local to the function containing
+ the try-catch region, and that are modified after \verb|except_try_push|
+ begins the try-catch region have indeterminate values when an exception is
+ caught.
+
+ Once a caught exception is handled or re-thrown, the value of the
+ \verb|except_t *| pointer which referenced it becomes indeterminate.
+ If a re-thrown exception is caught again, the implementation shall
+ produce a valid \verb|except_t *| pointer.
+
+ \example
+ The following example illustrates the use of \verb|except_try_push| and
+ related macros and functions.
+ \begin{verbatim}
+ #include <stdlib.h>
+ #include <assert.h>
+ #include "except.h"
+
+ #define MY_GROUP 42
+ #define MY_CODE 1
+
+ static void func_that_throws(void)
+ {
+ except_throw(MY_GROUP, MY_CODE, "this is an exception");
+ }
+
+ static void func_that_cleans_up(void)
+ {
+ void *local_data = malloc(10);
+
+ except_cleanup_push(free, local_data);
+ func_that_throws();
+ except_checked_cleanup_pop(free, except_call);
+ }
+
+ void func_that_catches(void)
+ {
+ /* catch specification */
+ static const except_id_t catch_spec[] = {
+ { MY_GROUP, XCEPT_CODE_ANY }
+ };
+ /* exception handle */
+ except_t *exc;
+
+ except_try_push(catch_spec, 1, &exc);
+
+ /*
+ * Start of try-catch region: when exception is
+ * thrown, control returns here.
+ */
+
+ if (exc == 0) {
+ /* try code that may throw an exception */
+
+ func_that_cleans_up();
+ } else {
+ /* handle exception that was thrown */
+
+ assert (except_group(exc) == MY_GROUP);
+ printf("exception caught: %s %lu %lu\n",
+ except_message(exc),
+ except_group(exc), except_code(exc));
+
+ goto terminate; /* ERROR! jumping out of try-catch */
+ }
+
+ /* end of try-catch region */
+
+ except_try_pop();
+ terminate:
+ ;
+ }
+ \end{verbatim}
+ In this example, the function \verb|func_that_catches| is intended to be
+ called first. It sets up a try-catch region which traps exceptions having
+ the group identification \verb|MY_GROUP| (or 42). Any code within that
+ group is caught because the code catch was specified as
+ \verb|XCEPT_CODE_ANY|. When the \verb|except_try_push| macro is executed,
+ it sets the value of \verb|exc| to null. Then \verb|func_that_cleans_up| is
+ called, which throws an exception in the \verb|MY_GROUP| group. This
+ exception is caught, so control resumes at the top of the try-catch region,
+ with \verb|exc| set to a non-null value. Thus the else clause of the if
+ statement is now executed. The handling code simply prints the exception
+ message on standard output, as well as the numeric group and code. The
+ subsequent goto statement demonstrates a serious programming error.
+
+ The \verb|func_that_cleans_up| function illustrates the use of cleanup
+ regions. Dynamic memory is allocated which must not be allowed to leak
+ when an exception is thrown, so a cleanup handler is set up to free the
+ memory in that event. The standard C function \verb|free| happens to have
+ the right type signature and semantics, so it can be used directly as a
+ cleanup handler. Should no exception be thrown, the cleanup pop macro
+ will perform the call to the cleanup handler, because it is invoked with
+ argument \verb|except_call|.
+
+\subsubsection{The {\tt except_try_pop} macro}
+
+ \indexmacro{except_try_pop}
+
+ \synopsis
+ \begin{verbatim}
+ void except_try_pop(void);\end{verbatim}
+
+ \description
+
+ The \verb|except_try_pop| macro terminates a try-catch region. It must
+ match a previous \verb|except_try_push| macro in the same statement
+ block at the same level of nesting which is not already matched by an
+ earlier \verb|except_try_pop|.
+
+\subsubsection{The {\tt except_set_allocator} function}
+
+ \indexfunc{except_set_allocator}
+ \label{section:except_set_allocator}
+
+ \synopsis
+ \begin{verbatim}
+ void except_set_allocator(void *(*)(size_t), void (*)(void *));\end{verbatim}
+
+ \description
+ The \verb|except_set_allocator| function installs a pair of allocator
+ routines that will be used by the Exception component for future allocation
+ and deallocation requests.
+
+ The first argument points to a function that resembles the standard C
+ \verb|malloc| in type and semantics. The second argument points to a
+ function that similarly resembles the standard C function \verb|free|.
+
+ The default allocators are \verb|malloc| and \verb|free|.
+ The call
+ \begin{verbatim}
+ except_set_allocator(malloc, free);
+ \end{verbatim}
+ may be used to restore these default allocator functions.
+
+ The program shall not call \verb|except_set_allocator| if an exception
+ was thrown and has not yet been handled.\footnote{Doing so could, for example,
+ create a mismatch whereby a pointer to data allocated with the previously installed
+ allocator function would be passed to the new deallocator function.}
+
+ The allocator function shall create a unique object consisting of at least
+ as many bytes of storage as indicated by the value of the argument.
+ The pointer returned shall be suitably aligned to represent an object
+ of any type. If insufficient resources exist, the pointer returned shall be
+ null. Requesting an object of zero size may produce a unique pointer
+ that shall be acceptable to the deallocator function, or a null pointer.
+
+ The deallocator function shall be capable of destroying objects created
+ by the corresponding allocator function. Passing a null pointer to the
+ deallocator shall have no effect.
+
+\subsubsection{The {\tt except_alloc} function}
+
+ \indexfunc{except_alloc}
+
+ \synopsis
+ \begin{verbatim}
+ void *except_alloc(size_t);\end{verbatim}
+
+ \description
+ The \verb|except_alloc| function allocates memory using the default
+ memory allocator or one installed by the program.
+ (See section \ref{section:except_set_allocator}).
+
+ If the allocation succeeds, a non-null pointer to the allocated object is
+ returned.
+
+ If the allocator indicates failure by returning a null pointer,
+ then instead of returning, \verb|except_alloc| throws exception code 1
+ in the group \verb|XCEPT_BAD_ALLOC| (See section \ref{section:except_id_t}).
+
+ If a zero size request is specified, then an exception is thrown or
+ a non-null pointer is returned, depending on the treatment of such
+ requests by the underlying allocator.
+
+\subsubsection{The {\tt except_free} function}
+
+ \indexfunc{except_free}
+
+ \synopsis
+ \begin{verbatim}
+ void except_free(void *);\end{verbatim}
+
+ \description
+
+ The \verb|except_free| function releases memory that was allocated
+ using \verb|except_alloc|. The deallocation is performed using the
+ default allocator or one installed by the program.
+
+ If an object is allocated by \verb|except_alloc|, then a
+ different allocator is installed, and the object is freed using
+ \verb|except_free|, the behavior is undefined.
+
+\subsection{Implementation}
+\index{Exception component!reference implementation}
+
+Described here is a reference implementation of the exception handling
+interface that is covered in section \ref{section:exception_component}.
+The reference implementation requires only a conforming ANSI C implementation.
+In particular, the actual mechanism for passing control from an exception throw
+to a catch handler is based on the standard C \verb|setjmp| macro and
+\verb|longjmp| function.
+
+\subsubsection{Overview}
+
+The core structure in the exception handling implementation is a stack that is
+composed of a mixture of two types of nodes: cleanup nodes and catch nodes.
+When an exception is thrown, the stack nodes are popped and processed starting
+with the topmost one.
+
+The nodes are efficiently allocated in automatic storage by the macros
+\verb|except_cleanup_push| and \verb|except_try_push|. These macros
+open up a new statement block and declare the node information in automatic
+storage. These objects are then pushed onto the stack. The corresponding macros
+\verb|except_cleanup_pop| and \verb|except_try_pop| pop the node off the stack
+and close the statement block.
+
+A static variable keeps track of the stack top. In the multi-threaded variant
+of the code which is based on the POSIX threading interface, there is a
+thread-specific stack top created using the thread-specific function
+\verb|pthread_key_create|. Using global variables is a compromise that simplifies the
+interface; the throw functions simply ``know'' where the thread's exception
+stack is, so the context information doesn't have to be passed around.
+
+\subsubsection{Stack nodes}
+
+A node in the exception handling stack contains a pointer to the next
+node below, followed by a type field and a union which together keep
+track of the appropriate type-specific data:
+\begin{verbatim}
+ enum except_stacktype {
+ XCEPT_CLEANUP, XCEPT_CATCHER
+ };
+
+ struct except_stacknode {
+ struct except_stacknode *except_down;
+ enum except_stacktype except_type;
+ union {
+ struct except_catch *except_catcher;
+ struct except_cleanup *except_cleanup;
+ } except_info;
+ };
+\end{verbatim}
+The union overlaps pointers to structures instead of structures in order to
+save space: there is a disparity in size between a cleanup node and a catch
+node, so making them both use the same amount of space would be wasteful.
+The space saving comes at a price, because the pointers themselves take up
+extra space and time is spent initializing them. Some casting trickery
+could be used to create a stack having two different kinds of structures
+without the use of unions.
+
+\paragraph{Cleanup nodes}
+
+Cleanup nodes act as placeholders for a pointer to a cleanup handler function
+and a context pointer to be passed to that function. The type-dependent
+component of the cleanup node is declared like this:
+\begin{verbatim}
+ struct except_cleanup {
+ void (*except_func)(void *);
+ void *except_context;
+ };
+\end{verbatim}
+The cleanup handler is invoked when the node is popped during exception
+processing. A cleanup handler may also be invoked when the cleanup node is
+removed by executing \verb|except_cleanup_pop| or
+\verb|except_checked_cleanup_pop|. Whether or not this happens depends on the
+integer parameter that is documented in section
+\ref{section:except_cleanup_pop}.
+
+\paragraph{Catch nodes}
+
+The catch node structure is more complicated than the cleanup node.
+Its definition depends on two additional types, \verb|except_id_t|
+and \verb|except_t|, both of which also play a role in the exception
+component's interface.
+\begin{verbatim}
+ typedef struct {
+ unsigned long except_group;
+ unsigned long except_code;
+ } except_id_t;
+
+ typedef struct {
+ except_id_t except_id;
+ const char *except_message;
+ void *except_dyndata;
+ } except_t;
+
+ struct except_catch {
+ const except_id_t *except_id;
+ size_t except_size;
+ except_t except_obj;
+ jmp_buf except_jmp;
+ };
+\end{verbatim}
+The \verb|except_id| member of the \verb|except_catch| structure is a pointer to the
+array of \verb|except_id_t| objects which specify what exceptions the node
+catches. The \verb|except_size| member specifies the number of elements in the array.
+Both of these values are derived directly from the arguments of the
+\verb|except_try_push| macro (see section \ref{section:except_try_push}). The
+\verb|except_obj| member provides storage for the caught exception. This member is
+the means by which the thrown exception is communicated to the try-catch region
+where it is caught. It contains the group and code identifiers, the exception
+message and, optionally, the pointer to arbitrary exception data. The
+\verb|except_jmp| member is the standard C \verb|jmp_buf|---a place for saving the
+execution context so that it's possible to pass control, via \verb|longjmp|
+from the place where an exception is thrown to the place where it is caught.
+
+If, during the search for an exception handler, a catch node is encountered
+which matches the thrown exception, the node remains on the stack. The exception
+information is stored into the node's \verb|except_obj| member and a
+\verb|longjmp| is executed to return to the try-catch region in which the node
+was allocated and pushed. Because the node is still on the stack, it's possible
+to throw another exception which is caught again by the same node. When an
+exception is thus caught, control resumes just after the \verb|except_try_push|
+which placed the node onto the stack. The pointer passed into \verb|except_try_push|
+is updated to point to the \verb|except_obj| member of the catch structure.
+The program can then use the portable accessor functions such as
+\verb|except_code| to gain information about the caught exception and handle it
+accordingly.
+
+\index{external names|see {functions}}
+\index{reference implementation|see {implementation}}
+\index{names|see {symbols}}
+\index{identifiers|see {symbols}}
+\index{structure names|see{tags}}
+\index{preprocessor symbols|see{macros}}
+\index{defines|see{macros}}
+\index{reserved symbols|see{symbols}}
+\index{symbols!preprocessor|see{macros}}
+\index{symbols!type names|see{typedefs}}
+\index{symbols!function names|see{functions}}
+\printindex
+
+\end{document}
diff --git a/libutil/kazlib/drivers/dict-main.c b/libutil/kazlib/drivers/dict-main.c
new file mode 100644
index 0000000..08f2e7a
--- /dev/null
+++ b/libutil/kazlib/drivers/dict-main.c
@@ -0,0 +1,300 @@
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+typedef char input_t[256];
+
+/*
+ * Split string in place into whitespace-separated tokens.
+ *
+ * The variable arguments are a list of char ** slots terminated by a null
+ * char ** pointer.  Each slot, in order, receives a pointer to the start of
+ * the next token inside string; the whitespace character that follows a
+ * token is overwritten with a NUL.  Scanning stops when either the slots or
+ * the input run out.  Returns the number of slots that were filled.
+ */
+static int tokenize(char *string, ...)
+{
+    va_list ap;
+    char **slot;
+    int found = 0;
+
+    va_start(ap, string);
+    for (slot = va_arg(ap, char **); slot != 0; slot = va_arg(ap, char **)) {
+        /* skip leading blanks; only blanks left means no more tokens */
+        while (*string != '\0' && isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+
+        *slot = string;
+        found++;
+
+        /* advance to the end of the token and terminate it */
+        while (*string != '\0' && !isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+        *string++ = '\0';
+    }
+    va_end(ap);
+
+    return found;
+}
+
+/*
+ * Key comparison callback: orders two keys as NUL-terminated strings using
+ * strcmp, returning its result unchanged.
+ */
+static int comparef(const void *key1, const void *key2)
+{
+    const char *lhs = key1;
+    const char *rhs = key2;
+
+    return strcmp(lhs, rhs);
+}
+
+/*
+ * Return a freshly malloc'ed copy of the NUL-terminated string str, or NULL
+ * if the allocation fails.  The caller owns the copy and releases it with
+ * free().
+ */
+static char *dupstring(const char *str)
+{
+    /* size_t rather than int: strlen() yields a size_t and malloc() takes one */
+    size_t sz = strlen(str) + 1;
+    char *copy = malloc(sz);
+
+    if (copy)
+        memcpy(copy, str, sz);
+    return copy;
+}
+
+/*
+ * Node allocator installed by the 's' command: hands out nodes from a fixed
+ * static pool of five and returns NULL once the pool is used up.  The
+ * context argument is ignored.
+ */
+static dnode_t *new_node(void *c)
+{
+    static dnode_t few[5];
+    static int count;
+    dnode_t *node = NULL;
+
+    (void) c;
+    if (count < 5) {
+        node = &few[count];
+        count++;
+    }
+    return node;
+}
+
+/*
+ * Counterpart of new_node: the nodes live in static storage, so there is
+ * nothing to release.  Both arguments are ignored, which also means a slot
+ * handed out by new_node is never made available again.
+ */
+static void del_node(dnode_t *n, void *c)
+{
+    (void) n;
+    (void) c;
+}
+
+/* Set to 1 by the 'p' command; while non-zero the command loops print '>' before reading a line. */
+static int prompt = 0;
+
+/*
+ * Interactive bulk loader behind the 'm' command.
+ *
+ * Reads commands from stdin and passes every "a <key> <val>" entry to the
+ * dictionary loader (dict_load_begin / dict_load_next / dict_load_end) until
+ * 'q' is entered or input ends.  Prints a warning first if d is not empty.
+ * Keys and values are heap copies made with dupstring.
+ */
+static void construct(dict_t *d)
+{
+    input_t in;
+    int done = 0;
+    dict_load_t dl;
+    dnode_t *dn;
+    char *tok1, *tok2, *val;
+    const char *key;
+    char *help =
+        "p turn prompt on\n"
+        "q finish construction\n"
+        "a <key> <val> add new entry\n";
+
+    if (!dict_isempty(d))
+        puts("warning: dictionary not empty!");
+
+    dict_load_begin(&dl, d);
+
+    while (!done) {
+        if (prompt)
+            putchar('>');
+        fflush(stdout);
+
+        if (!fgets(in, sizeof(input_t), stdin))
+            break;
+
+        switch (in[0]) {
+        case '?':
+            puts(help);
+            break;
+        case 'p':
+            prompt = 1;
+            break;
+        case 'q':
+            done = 1;
+            break;
+        case 'a':
+            if (tokenize(in+1, &tok1, &tok2, (char **) 0) != 2) {
+                puts("what?");
+                break;
+            }
+            key = dupstring(tok1);
+            val = dupstring(tok2);
+            dn = dnode_create(val);
+
+            if (!key || !val || !dn) {
+                puts("out of memory");
+                free((void *) key);
+                free(val);
+                if (dn)
+                    dnode_destroy(dn);
+                /*
+                 * Everything for this entry has been released; leave the
+                 * case here so the freed key and node are not handed to
+                 * dict_load_next below.
+                 */
+                break;
+            }
+
+            dict_load_next(&dl, dn, key);
+            break;
+        default:
+            putchar('?');
+            putchar('\n');
+            break;
+        }
+    }
+
+    dict_load_end(&dl);
+}
+
+/*
+ * Interactive test driver for the dictionary.
+ *
+ * Reads one-letter commands from stdin (enter '?' for the list) and applies
+ * them to the currently selected dictionary out of an array of ten.  Keys
+ * and values stored by 'a' are heap copies; 'd' frees them again after the
+ * node has been removed.  Returns 0 at end of input; 'q' exits directly.
+ */
+int main(void)
+{
+    input_t in;
+    dict_t darray[10];
+    dict_t *d = &darray[0];
+    dnode_t *dn;
+    size_t i;   /* unsigned, to match the sizeof-based loop bound */
+    char *tok1, *tok2, *val;
+    const char *key;
+
+    char *help =
+        "a <key> <val> add value to dictionary\n"
+        "d <key> delete value from dictionary\n"
+        "l <key> lookup value in dictionary\n"
+        "( <key> lookup lower bound\n"
+        ") <key> lookup upper bound\n"
+        "# <num> switch to alternate dictionary (0-9)\n"
+        "j <num> <num> merge two dictionaries\n"
+        "f free the whole dictionary\n"
+        "k allow duplicate keys\n"
+        "c show number of entries\n"
+        "t dump whole dictionary in sort order\n"
+        "m make dictionary out of sorted items\n"
+        "p turn prompt on\n"
+        "s switch to non-functioning allocator\n"
+        "q quit";
+
+    for (i = 0; i < sizeof darray / sizeof *darray; i++)
+        dict_init(&darray[i], DICTCOUNT_T_MAX, comparef);
+
+    for (;;) {
+        if (prompt)
+            putchar('>');
+        fflush(stdout);
+
+        if (!fgets(in, sizeof(input_t), stdin))
+            break;
+
+        switch(in[0]) {
+        case '?':
+            puts(help);
+            break;
+        case 'a':
+            if (tokenize(in+1, &tok1, &tok2, (char **) 0) != 2) {
+                puts("what?");
+                break;
+            }
+            key = dupstring(tok1);
+            val = dupstring(tok2);
+
+            if (!key || !val) {
+                puts("out of memory");
+                free((void *) key);
+                free(val);
+                /*
+                 * Stop here: falling through would insert the freed
+                 * strings and, on insert failure, free them a second time.
+                 */
+                break;
+            }
+
+            if (!dict_alloc_insert(d, key, val)) {
+                puts("dict_alloc_insert failed");
+                free((void *) key);
+                free(val);
+                break;
+            }
+            break;
+        case 'd':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            dn = dict_lookup(d, tok1);
+            if (!dn) {
+                puts("dict_lookup failed");
+                break;
+            }
+            /* fetch the owned strings before the node goes away */
+            val = dnode_get(dn);
+            key = dnode_getkey(dn);
+            dict_delete_free(d, dn);
+
+            free(val);
+            free((void *) key);
+            break;
+        case 'f':
+            dict_free(d);
+            break;
+        case 'l':
+        case '(':
+        case ')':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            dn = 0;
+            switch (in[0]) {
+            case 'l':
+                dn = dict_lookup(d, tok1);
+                break;
+            case '(':
+                dn = dict_lower_bound(d, tok1);
+                break;
+            case ')':
+                dn = dict_upper_bound(d, tok1);
+                break;
+            }
+            if (!dn) {
+                puts("lookup failed");
+                break;
+            }
+            val = dnode_get(dn);
+            puts(val);
+            break;
+        case 'm':
+            construct(d);
+            break;
+        case 'k':
+            dict_allow_dupes(d);
+            break;
+        case 'c':
+            printf("%lu\n", (unsigned long) dict_count(d));
+            break;
+        case 't':
+            for (dn = dict_first(d); dn; dn = dict_next(d, dn)) {
+                printf("%s\t%s\n", (char *) dnode_getkey(dn),
+                        (char *) dnode_get(dn));
+            }
+            break;
+        case 'q':
+            exit(0);
+            break;
+        case '\0':
+            break;
+        case 'p':
+            prompt = 1;
+            break;
+        case 's':
+            dict_set_allocator(d, new_node, del_node, NULL);
+            break;
+        case '#':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            } else {
+                int dictnum = atoi(tok1);
+                if (dictnum < 0 || dictnum > 9) {
+                    puts("invalid number");
+                    break;
+                }
+                d = &darray[dictnum];
+            }
+            break;
+        case 'j':
+            if (tokenize(in+1, &tok1, &tok2, (char **) 0) != 2) {
+                puts("what?");
+                break;
+            } else {
+                int dict1 = atoi(tok1), dict2 = atoi(tok2);
+                if (dict1 < 0 || dict1 > 9 || dict2 < 0 || dict2 > 9) {
+                    puts("invalid number");
+                    break;
+                }
+                dict_merge(&darray[dict1], &darray[dict2]);
+            }
+            break;
+        default:
+            putchar('?');
+            putchar('\n');
+            break;
+        }
+    }
+
+    return 0;
+}
diff --git a/libutil/kazlib/drivers/except-main.c b/libutil/kazlib/drivers/except-main.c
new file mode 100644
index 0000000..fdb64db
--- /dev/null
+++ b/libutil/kazlib/drivers/except-main.c
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <ctype.h>
+
+/*
+ * Cleanup handler used by top_level: prints the string it is given so the
+ * user can see that the handler ran.
+ */
+static void cleanup(void *arg)
+{
+    char *label = arg;
+
+    printf("cleanup(\"%s\") called\n", label);
+}
+
+/*
+ * Ask the user whether to throw, and throw exception (group 1, code 1) when
+ * the answer starts with 'y' or 'Y'.  End of input or a read error counts
+ * as "no".
+ */
+static void bottom_level(void)
+{
+    char buf[256];
+
+    printf("throw exception? ");
+    fflush(stdout);
+
+    /* fgets leaves buf untouched on EOF/error, so it must not be inspected */
+    if (!fgets(buf, sizeof buf, stdin))
+        return;
+
+    /* <ctype.h> functions need an unsigned char value */
+    if (toupper((unsigned char) buf[0]) == 'Y')
+        except_throw(1, 1, "nasty exception");
+}
+
+/*
+ * Runs bottom_level() with cleanup("argument") registered on the exception
+ * stack.  except_cleanup_push and except_cleanup_pop are macros that open
+ * and close a brace block, so the two must stay paired inside this
+ * function.  The 0 passed to the pop means the handler is not invoked when
+ * bottom_level() returns normally.
+ */
+static void top_level(void)
+{
+ except_cleanup_push(cleanup, "argument");
+ bottom_level();
+ except_cleanup_pop(0);
+}
+
+/*
+ * Demonstration of nested try blocks.
+ *
+ * Both blocks catch (1,1) and (1,2).  The inner handler reports the
+ * exception and rethrows it so that the outer handler sees it as well.
+ * After both blocks have been popped an exception is thrown with no try
+ * block active, which exercises the unhandled-exception path.
+ */
+int main(int argc, char **argv)
+{
+    static const except_id_t catch[] = { { 1, 1 }, { 1, 2 } };
+    except_t *ex;
+
+    /*
+     * Nested exception ``try blocks''
+     */
+
+    /* outer */
+    except_try_push(catch, 2, &ex);
+    if (!ex) {
+        /* inner */
+        except_try_push(catch, 2, &ex);
+        if (!ex) {
+            top_level();
+        } else {
+            /* inner catch; group and code are unsigned long, hence %lu */
+            printf("caught exception (inner): \"%s\", s=%lu, c=%lu\n",
+                    except_message(ex), except_group(ex), except_code(ex));
+            except_rethrow(ex);
+        }
+        except_try_pop();
+    } else {
+        /* outer catch */
+        printf("caught exception (outer): \"%s\", s=%lu, c=%lu\n",
+                except_message(ex), except_group(ex), except_code(ex));
+    }
+    except_try_pop();
+    except_throw(99, 99, "exception in main");
+    return 0;
+}
diff --git a/libutil/kazlib/drivers/hash-main.c b/libutil/kazlib/drivers/hash-main.c
new file mode 100644
index 0000000..0a08542
--- /dev/null
+++ b/libutil/kazlib/drivers/hash-main.c
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+typedef char input_t[256];
+
+/*
+ * Split string in place into whitespace-separated tokens.
+ *
+ * The variable arguments are a list of char ** slots terminated by a null
+ * char ** pointer.  Each slot, in order, receives a pointer to the start of
+ * the next token inside string; the whitespace character that follows a
+ * token is overwritten with a NUL.  Scanning stops when either the slots or
+ * the input run out.  Returns the number of slots that were filled.
+ */
+static int tokenize(char *string, ...)
+{
+    va_list ap;
+    char **slot;
+    int found = 0;
+
+    va_start(ap, string);
+    for (slot = va_arg(ap, char **); slot != 0; slot = va_arg(ap, char **)) {
+        /* skip leading blanks; only blanks left means no more tokens */
+        while (*string != '\0' && isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+
+        *slot = string;
+        found++;
+
+        /* advance to the end of the token and terminate it */
+        while (*string != '\0' && !isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+        *string++ = '\0';
+    }
+    va_end(ap);
+
+    return found;
+}
+
+/*
+ * Return a freshly malloc'ed copy of the NUL-terminated string str, or NULL
+ * if the allocation fails.  The caller owns the copy and releases it with
+ * free().
+ */
+static char *dupstring(const char *str)
+{
+    /* size_t rather than int: strlen() yields a size_t and malloc() takes one */
+    size_t sz = strlen(str) + 1;
+    char *copy = malloc(sz);
+
+    if (copy)
+        memcpy(copy, str, sz);
+    return copy;
+}
+
+/*
+ * Node allocator installed by the 's' command: hands out nodes from a fixed
+ * static pool of five and returns NULL once the pool is used up.  The
+ * context argument is ignored.
+ */
+static hnode_t *new_node(void *c)
+{
+    static hnode_t few[5];
+    static int count;
+    hnode_t *node = NULL;
+
+    (void) c;
+    if (count < 5) {
+        node = &few[count];
+        count++;
+    }
+    return node;
+}
+
+/*
+ * Counterpart of new_node: the nodes live in static storage, so there is
+ * nothing to release.  Both arguments are ignored, which also means a slot
+ * handed out by new_node is never made available again.
+ */
+static void del_node(hnode_t *n, void *c)
+{
+    (void) n;
+    (void) c;
+}
+
+/*
+ * Interactive test driver for the hash table.
+ *
+ * Reads one-letter commands from stdin (enter '?' for the list) and applies
+ * them to a single table created at start-up.  Keys and values stored by
+ * 'a' are heap copies; 'd' frees them again after the node has been removed.
+ * Returns 1 if the table cannot be created, 0 at end of input; 'q' exits
+ * directly.
+ */
+int main(void)
+{
+    input_t in;
+    hash_t *h = hash_create(HASHCOUNT_T_MAX, 0, 0);
+    hnode_t *hn;
+    hscan_t hs;
+    char *tok1, *tok2, *val;
+    const char *key;
+    int prompt = 0;
+
+    char *help =
+        "a <key> <val> add value to hash table\n"
+        "d <key> delete value from hash table\n"
+        "l <key> lookup value in hash table\n"
+        "n show size of hash table\n"
+        "c show number of entries\n"
+        "t dump whole hash table\n"
+        "+ increase hash table (private func)\n"
+        "- decrease hash table (private func)\n"
+        "b print hash_t_bit value\n"
+        "p turn prompt on\n"
+        "s switch to non-functioning allocator\n"
+        "q quit";
+
+    if (!h) {
+        puts("hash_create failed");
+        /* every command below dereferences h, so there is nothing to run */
+        return 1;
+    }
+
+    for (;;) {
+        if (prompt)
+            putchar('>');
+        fflush(stdout);
+
+        if (!fgets(in, sizeof(input_t), stdin))
+            break;
+
+        switch(in[0]) {
+        case '?':
+            puts(help);
+            break;
+        case 'b':
+            printf("%d\n", hash_val_t_bit);
+            break;
+        case 'a':
+            if (tokenize(in+1, &tok1, &tok2, (char **) 0) != 2) {
+                puts("what?");
+                break;
+            }
+            key = dupstring(tok1);
+            val = dupstring(tok2);
+
+            if (!key || !val) {
+                puts("out of memory");
+                free((void *) key);
+                free(val);
+                /*
+                 * Stop here: falling through would insert the freed
+                 * strings and, on insert failure, free them a second time.
+                 */
+                break;
+            }
+
+            if (!hash_alloc_insert(h, key, val)) {
+                puts("hash_alloc_insert failed");
+                free((void *) key);
+                free(val);
+                break;
+            }
+            break;
+        case 'd':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            hn = hash_lookup(h, tok1);
+            if (!hn) {
+                puts("hash_lookup failed");
+                break;
+            }
+            /* fetch the owned strings before the node goes away */
+            val = hnode_get(hn);
+            key = hnode_getkey(hn);
+            hash_scan_delfree(h, hn);
+            free((void *) key);
+            free(val);
+            break;
+        case 'l':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            hn = hash_lookup(h, tok1);
+            if (!hn) {
+                puts("hash_lookup failed");
+                break;
+            }
+            val = hnode_get(hn);
+            puts(val);
+            break;
+        case 'n':
+            printf("%lu\n", (unsigned long) hash_size(h));
+            break;
+        case 'c':
+            printf("%lu\n", (unsigned long) hash_count(h));
+            break;
+        case 't':
+            hash_scan_begin(&hs, h);
+            while ((hn = hash_scan_next(&hs)))
+                printf("%s\t%s\n", (char*) hnode_getkey(hn),
+                        (char*) hnode_get(hn));
+            break;
+        case '+':
+            grow_table(h);              /* private function     */
+            break;
+        case '-':
+            shrink_table(h);            /* private function     */
+            break;
+        case 'q':
+            exit(0);
+            break;
+        case '\0':
+            break;
+        case 'p':
+            prompt = 1;
+            break;
+        case 's':
+            hash_set_allocator(h, new_node, del_node, NULL);
+            break;
+        default:
+            putchar('?');
+            putchar('\n');
+            break;
+        }
+    }
+
+    return 0;
+}
diff --git a/libutil/kazlib/drivers/list-main.c b/libutil/kazlib/drivers/list-main.c
new file mode 100644
index 0000000..6f462e4
--- /dev/null
+++ b/libutil/kazlib/drivers/list-main.c
@@ -0,0 +1,152 @@
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+typedef char input_t[256];
+
+/*
+ * Split string in place into whitespace-separated tokens.
+ *
+ * The variable arguments are a list of char ** slots terminated by a null
+ * char ** pointer.  Each slot, in order, receives a pointer to the start of
+ * the next token inside string; the whitespace character that follows a
+ * token is overwritten with a NUL.  Scanning stops when either the slots or
+ * the input run out.  Returns the number of slots that were filled.
+ */
+static int tokenize(char *string, ...)
+{
+    va_list ap;
+    char **slot;
+    int found = 0;
+
+    va_start(ap, string);
+    for (slot = va_arg(ap, char **); slot != 0; slot = va_arg(ap, char **)) {
+        /* skip leading blanks; only blanks left means no more tokens */
+        while (*string != '\0' && isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+
+        *slot = string;
+        found++;
+
+        /* advance to the end of the token and terminate it */
+        while (*string != '\0' && !isspace((unsigned char) *string))
+            string++;
+        if (*string == '\0')
+            break;
+        *string++ = '\0';
+    }
+    va_end(ap);
+
+    return found;
+}
+
+/*
+ * Comparison callback for list_find and list_sort: orders two values as
+ * NUL-terminated strings using strcmp, returning its result unchanged.
+ */
+static int comparef(const void *key1, const void *key2)
+{
+    const char *lhs = key1;
+    const char *rhs = key2;
+
+    return strcmp(lhs, rhs);
+}
+
+/*
+ * Return a freshly malloc'ed copy of the NUL-terminated string str, or NULL
+ * if the allocation fails.  The caller owns the copy and releases it with
+ * free().
+ */
+static char *dupstring(const char *str)
+{
+    /* size_t rather than int: strlen() yields a size_t and malloc() takes one */
+    size_t sz = strlen(str) + 1;
+    char *copy = malloc(sz);
+
+    if (copy)
+        memcpy(copy, str, sz);
+    return copy;
+}
+
+/*
+ * Interactive test driver for the list.
+ *
+ * Reads one-letter commands from stdin (enter '?' for the list) and applies
+ * them to a single list created at start-up.  Values appended by 'a' are
+ * heap copies; 'd' frees the value together with its node.  Returns 1 if
+ * the list cannot be created, 0 at end of input; 'q' exits directly.
+ */
+int main(void)
+{
+    input_t in;
+    list_t *l = list_create(LISTCOUNT_T_MAX);
+    lnode_t *ln;
+    char *tok1, *val;
+    int prompt = 0;
+
+    char *help =
+        "a <val> append value to list\n"
+        "d <val> delete value from list\n"
+        "l <val> lookup value in list\n"
+        "s sort list\n"
+        "c show number of entries\n"
+        "t dump whole list\n"
+        "p turn prompt on\n"
+        "q quit";
+
+    if (!l) {
+        puts("list_create failed");
+        /* every command below dereferences l, so there is nothing to run */
+        return 1;
+    }
+
+    for (;;) {
+        if (prompt)
+            putchar('>');
+        fflush(stdout);
+
+        if (!fgets(in, sizeof(input_t), stdin))
+            break;
+
+        switch(in[0]) {
+        case '?':
+            puts(help);
+            break;
+        case 'a':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            val = dupstring(tok1);
+            ln = lnode_create(val);
+
+            if (!val || !ln) {
+                puts("allocation failure");
+                if (ln)
+                    lnode_destroy(ln);
+                free(val);
+                break;
+            }
+
+            list_append(l, ln);
+            break;
+        case 'd':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            ln = list_find(l, tok1, comparef);
+            if (!ln) {
+                puts("list_find failed");
+                break;
+            }
+            list_delete(l, ln);
+            /* take the owned string out before destroying the node */
+            val = lnode_get(ln);
+            lnode_destroy(ln);
+            free(val);
+            break;
+        case 'l':
+            if (tokenize(in+1, &tok1, (char **) 0) != 1) {
+                puts("what?");
+                break;
+            }
+            ln = list_find(l, tok1, comparef);
+            if (!ln)
+                puts("list_find failed");
+            else
+                puts("found");
+            break;
+        case 's':
+            list_sort(l, comparef);
+            break;
+        case 'c':
+            printf("%lu\n", (unsigned long) list_count(l));
+            break;
+        case 't':
+            for (ln = list_first(l); ln != 0; ln = list_next(l, ln))
+                puts(lnode_get(ln));
+            break;
+        case 'q':
+            exit(0);
+            break;
+        case '\0':
+            break;
+        case 'p':
+            prompt = 1;
+            break;
+        default:
+            putchar('?');
+            putchar('\n');
+            break;
+        }
+    }
+
+    return 0;
+}
diff --git a/libutil/kazlib/drivers/sfx-main.c b/libutil/kazlib/drivers/sfx-main.c
new file mode 100644
index 0000000..fda683b
--- /dev/null
+++ b/libutil/kazlib/drivers/sfx-main.c
@@ -0,0 +1,41 @@
+#include <stdlib.h>
+
+/*
+ * Classify C expressions by side effects using sfx_determine.
+ *
+ * With command-line arguments, each argument is treated as one expression;
+ * without any, expressions are read from stdin one per line (the trailing
+ * newline is stripped).  A line of the form "expression '...' ..." is
+ * printed for every expression.  The program returns EXIT_FAILURE at the
+ * first expression that sfx_determine rejects, and 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+    char expr_buf[256];
+    char *expr, *ptr;
+    sfx_rating_t eff;
+
+    while (1) {
+        if (argc >= 2) {
+            /* next argument; the terminating null pointer ends the run */
+            expr = argv[1];
+            argv++;
+            if (expr == 0)
+                break;
+        } else {
+            if (!fgets(expr_buf, sizeof expr_buf, stdin))
+                break;
+            expr = expr_buf;
+            ptr = strchr(expr_buf, '\n');
+            if (ptr != 0)
+                *ptr = 0;
+        }
+
+        if (sfx_determine(expr, &eff) == 0) {
+            printf("expression '%s' has a syntax error\n", expr);
+            return EXIT_FAILURE;
+        }
+
+        if (eff == sfx_none)
+            printf("expression '%s' has no side effects\n", expr);
+        else if (eff == sfx_potential)
+            printf("expression '%s' may have side effects\n", expr);
+        else if (eff == sfx_certain)
+            printf("expression '%s' has side effects\n", expr);
+    }
+
+    return 0;
+}
diff --git a/libutil/kazlib/except.c b/libutil/kazlib/except.c
new file mode 100644
index 0000000..c915dda
--- /dev/null
+++ b/libutil/kazlib/except.c
@@ -0,0 +1,347 @@
+/*
+ * Portable Exception Handling for ANSI C.
+ * Copyright (C) 1999 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <limits.h>
+#include "except.h"
+
+/* Size in bytes of the message buffer allocated by except_throwf. */
+#define XCEPT_BUFFER_SIZE 1024
+
+/*
+ * Short aliases for the except_-prefixed structure member names declared in
+ * except.h, so the code in this file can write e.g. top->down instead of
+ * top->except_down.  Each alias is defined exactly once.
+ */
+#define group except_group
+#define code except_code
+#define id except_id
+#define message except_message
+#define dyndata except_dyndata
+#define func except_func
+#define context except_context
+#define size except_size
+#define obj except_obj
+#define jmp except_jmp
+#define down except_down
+#define type except_type
+#define catcher except_catcher
+#define cleanup except_cleanup
+#define info except_info
+
+#ifdef KAZLIB_POSIX_THREADS
+
+#include <pthread.h>
+
+/* init_mtx serializes except_init/except_deinit; init_counter counts the except_init calls not yet undone. */
+static pthread_mutex_t init_mtx = PTHREAD_MUTEX_INITIALIZER;
+static int init_counter;
+/* Thread-specific keys, created by except_init: exception stack top, unhandled-exception catcher, allocator, deallocator. */
+static pthread_key_t top_key;
+static pthread_key_t uh_key;
+static pthread_key_t alloc_key;
+static pthread_key_t dealloc_key;
+/* Default catcher returned by get_catcher when the thread has none stored; defined further down. */
+static void unhandled_catcher(except_t *);
+
+/*
+ * Accessors for the calling thread's values.  Each set_* macro stores its
+ * argument under the matching key and then compares the argument with a
+ * typed null pointer; the comparison result is discarded and serves only to
+ * make the compiler check the argument's type.  Note that the argument is
+ * therefore evaluated twice.
+ */
+#define get_top() ((struct except_stacknode *) pthread_getspecific(top_key))
+#define set_top(T) (pthread_setspecific(top_key, (T)), (void)((T) == (struct except_stacknode *) 0))
+#define set_catcher(C) (pthread_setspecific(uh_key, (void *) (C)), (void)((C) == (void (*)(except_t *)) 0))
+#define set_alloc(A) (pthread_setspecific(alloc_key, (void *) (A)), (void)((A) == (void *(*)(size_t)) 0))
+#define set_dealloc(D) (pthread_setspecific(dealloc_key, (void *) (D)), (void)((D) == (void (*)(void *)) 0))
+
+/*
+ * Return the calling thread's unhandled-exception catcher, or
+ * unhandled_catcher when none has been stored for this thread.
+ */
+static void (*get_catcher(void))(except_t *)
+{
+    void (*stored)(except_t *) = (void (*)(except_t *)) pthread_getspecific(uh_key);
+
+    if (stored != 0)
+        return stored;
+    return unhandled_catcher;
+}
+
+/*
+ * Return the calling thread's allocation function, or malloc when none has
+ * been stored for this thread.
+ */
+static void *(*get_alloc(void))(size_t)
+{
+    void *(*stored)(size_t) = (void *(*)(size_t)) pthread_getspecific(alloc_key);
+
+    if (stored != 0)
+        return stored;
+    return malloc;
+}
+
+/*
+ * Return the calling thread's deallocation function, or free when none has
+ * been stored for this thread.
+ */
+static void (*get_dealloc(void))(void *)
+{
+    void (*stored)(void *) = (void (*)(void *)) pthread_getspecific(dealloc_key);
+
+    if (stored != 0)
+        return stored;
+    return free;
+}
+
+/*
+ * Reference-counted initialization (POSIX threads build).
+ *
+ * The first call creates the four thread-specific keys; later calls only
+ * bump the counter.  Returns 1 on success.  If any key cannot be created,
+ * the keys that were created are deleted again, the counter is left at 0
+ * and 0 is returned.
+ */
+int except_init(void)
+{
+    int ok = 1;
+
+    pthread_mutex_lock(&init_mtx);
+
+    assert (init_counter < INT_MAX);
+
+    if (init_counter != 0) {
+        init_counter++;
+    } else {
+        int have_top = !pthread_key_create(&top_key, 0);
+        int have_uh = !pthread_key_create(&uh_key, 0);
+        int have_alloc = !pthread_key_create(&alloc_key, 0);
+        int have_dealloc = !pthread_key_create(&dealloc_key, 0);
+
+        if (have_top && have_uh && have_alloc && have_dealloc) {
+            init_counter = 1;
+        } else {
+            /* roll back whichever keys did get created */
+            ok = 0;
+            if (have_top)
+                pthread_key_delete(top_key);
+            if (have_uh)
+                pthread_key_delete(uh_key);
+            if (have_alloc)
+                pthread_key_delete(alloc_key);
+            if (have_dealloc)
+                pthread_key_delete(dealloc_key);
+        }
+    }
+
+    pthread_mutex_unlock(&init_mtx);
+
+    return ok;
+}
+
+/*
+ * Undo one except_init call (POSIX threads build).  When the last
+ * outstanding initialization is undone, the four thread-specific keys are
+ * deleted.
+ */
+void except_deinit(void)
+{
+    pthread_mutex_lock(&init_mtx);
+
+    assert (init_counter > 0);
+
+    init_counter--;
+    if (init_counter == 0) {
+        pthread_key_delete(top_key);
+        pthread_key_delete(uh_key);
+        pthread_key_delete(alloc_key);
+        pthread_key_delete(dealloc_key);
+    }
+
+    pthread_mutex_unlock(&init_mtx);
+}
+
+#else /* no thread support */
+
+/* Number of except_init calls not yet undone by except_deinit. */
+static int init_counter;
+static void unhandled_catcher(except_t *);
+/* Process-wide state of the non-threaded build, with its defaults. */
+static void (*uh_catcher_ptr)(except_t *) = unhandled_catcher;
+static void *(*allocator)(size_t) = malloc;
+static void (*deallocator)(void *) = free;
+/* Top of the exception stack; a null pointer while the stack is empty. */
+static struct except_stacknode *stack_top;
+
+/*
+ * Non-threaded versions of the state accessors: plain reads and writes of
+ * the static variables above.
+ */
+#define get_top() (stack_top)
+#define set_top(T) (stack_top = (T))
+#define get_catcher() (uh_catcher_ptr)
+#define set_catcher(C) (uh_catcher_ptr = (C))
+#define get_alloc() (allocator)
+#define set_alloc(A) (allocator = (A))
+#define get_dealloc() (deallocator)
+#define set_dealloc(D) (deallocator = (D))
+
+/*
+ * Non-threaded build: there is nothing to set up, so this only counts the
+ * call and always returns 1.
+ */
+int except_init(void)
+{
+    assert (init_counter < INT_MAX);
+    init_counter += 1;
+    return 1;
+}
+
+/*
+ * Non-threaded build: undo one except_init call by decrementing the
+ * counter, which must be positive.
+ */
+void except_deinit(void)
+{
+    assert (init_counter > 0);
+    init_counter -= 1;
+}
+
+#endif
+
+
+/*
+ * Decide whether a catch-list entry accepts a thrown exception id.
+ *
+ * A field of the entry matches when it is the wildcard (XCEPT_GROUP_ANY or
+ * XCEPT_CODE_ANY) or equals the corresponding thrown field; both fields
+ * must match.  Returns non-zero on a match, 0 otherwise.
+ */
+static int match(const volatile except_id_t *thrown, const except_id_t *caught)
+{
+    int group_ok = 1;
+    int code_ok = 1;
+
+    if (caught->group != XCEPT_GROUP_ANY)
+        group_ok = (caught->group == thrown->group);
+    if (caught->code != XCEPT_CODE_ANY)
+        code_ok = (caught->code == thrown->code);
+
+    return group_ok && code_ok;
+}
+
+/*
+ * Deliver an exception: unwind the exception stack from the top downward
+ * until a catcher accepts it.  Never returns to the caller -- it either
+ * longjmps into the matching try block or ends in abort().
+ *
+ * Cleanup nodes passed on the way have their handler called.  The stack top
+ * is not updated while walking; it is only set once a catcher matches or
+ * the stack is exhausted.
+ */
+static void do_throw(except_t *except)
+{
+ struct except_stacknode *top;
+
+ /* 0 is the wildcard value in both fields and is rejected here */
+ assert (except->id.group != 0 && except->id.code != 0);
+
+ for (top = get_top(); top != 0; top = top->down) {
+ if (top->type == XCEPT_CLEANUP) {
+ top->info.cleanup->func(top->info.cleanup->context);
+ } else {
+ struct except_catch *catcher = top->info.catcher;
+ const except_id_t *pi = catcher->id;
+ size_t i;
+
+ assert (top->type == XCEPT_CATCHER);
+ /* release dynamic data still held by this catcher before its object may be overwritten */
+ except_free(catcher->obj.dyndata);
+
+ for (i = 0; i < catcher->size; pi++, i++) {
+ if (match(&except->id, pi)) {
+ /* copy the exception into the catcher, keep the node on the stack, and jump to its try block */
+ catcher->obj = *except;
+ set_top(top);
+ longjmp(catcher->jmp, 1);
+ }
+ }
+ }
+ }
+
+ /* nothing matched: top is a null pointer here, so this empties the stack */
+ set_top(top);
+ get_catcher()(except); /* unhandled exception */
+ abort();
+}
+
+/*
+ * Default handler for exceptions nobody catches: prints the message, group
+ * and code to stderr and aborts the program.
+ */
+static void unhandled_catcher(except_t *except)
+{
+    /* group and code are unsigned long, so they are printed with %lu */
+    fprintf(stderr, "Unhandled exception (\"%s\", group=%lu, code=%lu)\n",
+            except->message, except->id.group, except->id.code);
+    abort();
+}
+
+/* Link node above the current stack top and make it the new top. */
+static void stack_push(struct except_stacknode *node)
+{
+    struct except_stacknode *below = get_top();
+
+    node->down = below;
+    set_top(node);
+}
+
+/*
+ * Helper behind the except_cleanup_push macro: fills in the cleanup record
+ * ecl with the handler cleanf and its argument context, attaches it to the
+ * stack node esn, and pushes esn onto the exception stack.  Both structures
+ * are supplied by the caller.
+ */
+void except_setup_clean(struct except_stacknode *esn,
+        struct except_cleanup *ecl, void (*cleanf)(void *), void *context)
+{
+    ecl->context = context;
+    ecl->func = cleanf;
+
+    esn->info.cleanup = ecl;
+    esn->type = XCEPT_CLEANUP;
+
+    stack_push(esn);
+}
+
+/*
+ * Helper behind the except_try_push macro: records the catch list id (size
+ * entries) in ech, clears the catcher's dynamic-data pointer, attaches ech
+ * to the stack node esn, and pushes esn onto the exception stack.  Both
+ * structures are supplied by the caller.
+ */
+void except_setup_try(struct except_stacknode *esn,
+        struct except_catch *ech, const except_id_t id[], size_t size)
+{
+    ech->obj.dyndata = 0;
+    ech->size = size;
+    ech->id = id;
+
+    esn->info.catcher = ech;
+    esn->type = XCEPT_CATCHER;
+
+    stack_push(esn);
+}
+
+/*
+ * Remove the top node from the exception stack and return it.  The stack
+ * must not be empty: the top node is dereferenced without a check.
+ */
+struct except_stacknode *except_pop(void)
+{
+    struct except_stacknode *popped = get_top();
+    struct except_stacknode *below = popped->down;
+
+    set_top(below);
+    return popped;
+}
+
+/*
+ * Throw a caught exception again so that an enclosing handler can see it.
+ *
+ * except must be the exception object of the catcher currently on top of
+ * the stack (checked with assertions).  That catcher is removed first, so
+ * the search for a handler starts with the node below it.
+ */
+void except_rethrow(except_t *except)
+{
+    struct except_stacknode *current = get_top();
+
+    assert (current != 0);
+    assert (current->type == XCEPT_CATCHER);
+    assert (&current->info.catcher->obj == except);
+
+    set_top(current->down);
+    do_throw(except);
+}
+
+/*
+ * Throw an exception identified by (group, code) with the message msg and
+ * no dynamic data.  The msg pointer itself is stored, not a copy of the
+ * text.
+ */
+void except_throw(long group, long code, const char *msg)
+{
+    except_t thrown;
+
+    thrown.dyndata = 0;
+    thrown.message = msg;
+    thrown.id.code = code;
+    thrown.id.group = group;
+
+    do_throw(&thrown);
+}
+
+/*
+ * Throw an exception identified by (group, code) with the message msg and
+ * the dynamic data pointer data.  The library later passes that pointer to
+ * except_free unless the handler takes it over with except_take_data.
+ */
+void except_throwd(long group, long code, const char *msg, void *data)
+{
+    except_t thrown;
+
+    thrown.dyndata = data;
+    thrown.message = msg;
+    thrown.id.code = code;
+    thrown.id.group = group;
+
+    do_throw(&thrown);
+}
+
+/*
+ * Throw an exception identified by (group, code) whose message is built
+ * printf-style from fmt and the following arguments.
+ *
+ * The message is formatted into a buffer of XCEPT_BUFFER_SIZE bytes obtained
+ * from except_alloc; longer messages are truncated to fit.  The buffer is
+ * attached to the exception as its dynamic data, so it is released together
+ * with the exception.
+ */
+void except_throwf(long group, long code, const char *fmt, ...)
+{
+    char *buf = except_alloc(XCEPT_BUFFER_SIZE);
+    va_list vl;
+
+    va_start (vl, fmt);
+    /* bounded write: vsprintf would overrun buf on a long expansion */
+    vsnprintf(buf, XCEPT_BUFFER_SIZE, fmt, vl);
+    va_end (vl);
+    except_throwd(group, code, buf, buf);
+}
+
+/*
+ * Install new_catcher as the function called for exceptions that no try
+ * block catches, and return the catcher that was in effect before.
+ */
+void (*except_unhandled_catcher(void (*new_catcher)(except_t *)))(except_t *)
+{
+    void (*previous)(except_t *);
+
+    previous = get_catcher();
+    set_catcher(new_catcher);
+
+    return previous;
+}
+
+/*
+ * except.h defines these four accessors as function-like macros; remove the
+ * macros here so the functions of the same names can be defined below.
+ */
+#undef except_code
+#undef except_group
+#undef except_message
+#undef except_data
+
+/* Return the code part of the exception's identifier. */
+unsigned long except_code(except_t *ex)
+{
+    unsigned long value = ex->id.code;
+
+    return value;
+}
+
+/* Return the group part of the exception's identifier. */
+unsigned long except_group(except_t *ex)
+{
+    unsigned long value = ex->id.group;
+
+    return value;
+}
+
+/* Return the message pointer stored in the exception. */
+const char *except_message(except_t *ex)
+{
+    const char *text = ex->message;
+
+    return text;
+}
+
+/*
+ * Return the exception's dynamic data pointer (possibly null).  The
+ * exception keeps the pointer; see except_take_data for taking it over.
+ */
+void *except_data(except_t *ex)
+{
+    void *payload = ex->dyndata;
+
+    return payload;
+}
+
+/*
+ * Detach and return the exception's dynamic data pointer.  The pointer in
+ * the exception is cleared, so the library will no longer free the data;
+ * that becomes the caller's job.
+ */
+void *except_take_data(except_t *ex)
+{
+    void *taken;
+
+    taken = ex->dyndata;
+    ex->dyndata = 0;
+
+    return taken;
+}
+
+/*
+ * Replace the allocation and deallocation functions used by except_alloc
+ * and except_free.  In the POSIX threads build the functions are stored per
+ * thread; otherwise they are process-wide.
+ */
+void except_set_allocator(void *(*alloc)(size_t), void (*dealloc)(void *))
+{
+ set_alloc(alloc);
+ set_dealloc(dealloc);
+}
+
+/*
+ * Allocate size bytes with the current allocation function.  On failure an
+ * exception in group XCEPT_BAD_ALLOC is thrown instead of returning a null
+ * pointer.
+ *
+ * NOTE(review): the exception is thrown with code 0, while do_throw asserts
+ * that the thrown code is non-zero -- with assertions enabled this path
+ * looks like it ends in an assertion failure rather than a catchable
+ * exception.  Confirm the intended code value.
+ */
+void *except_alloc(size_t size)
+{
+    void *(*allocate)(size_t) = get_alloc();
+    void *ptr = allocate(size);
+
+    if (ptr != 0)
+        return ptr;
+
+    except_throw(XCEPT_BAD_ALLOC, 0, "out of memory");
+    return ptr;
+}
+
+/* Release ptr with the current deallocation function. */
+void except_free(void *ptr)
+{
+    void (*release)(void *) = get_dealloc();
+
+    release(ptr);
+}
diff --git a/libutil/kazlib/except.h b/libutil/kazlib/except.h
new file mode 100644
index 0000000..3131fb9
--- /dev/null
+++ b/libutil/kazlib/except.h
@@ -0,0 +1,147 @@
+/*
+ * Portable Exception Handling for ANSI C.
+ * Copyright (C) 1999 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#ifndef XCEPT_H
+#define XCEPT_H
+
+#include <setjmp.h>
+#include <stdlib.h>
+#include <assert.h>
+
+/* Wildcard values for the group and code fields of a catch-list entry. */
+#define XCEPT_GROUP_ANY 0
+#define XCEPT_CODE_ANY 0
+/* Group of the exception thrown by except_alloc when allocation fails. */
+#define XCEPT_BAD_ALLOC 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* except_no_call is 0 and except_call is 1; presumably meant as readable arguments for except_cleanup_pop -- TODO confirm */
+enum { except_no_call, except_call };
+
+/*
+ * Exception identifier: a (group, code) pair.  In a catch list, 0 in either
+ * field is the wildcard (XCEPT_GROUP_ANY / XCEPT_CODE_ANY).
+ */
+typedef struct {
+ unsigned long except_group;
+ unsigned long except_code;
+} except_id_t;
+
+/*
+ * An exception: its identifier, a message string, and an optional pointer
+ * to dynamic data (null when there is none).
+ * NOTE(review): the members are volatile, presumably so their values stay
+ * well-defined across setjmp/longjmp -- confirm.
+ */
+typedef struct {
+ except_id_t volatile except_id;
+ const char *volatile except_message;
+ void *volatile except_dyndata;
+} except_t;
+
+/* A registered cleanup handler: the function to call and the argument to pass to it. */
+struct except_cleanup {
+ void (*except_func)(void *);
+ void *except_context;
+};
+
+/*
+ * State of one try block: the list of identifiers it catches and the list's
+ * length, storage for the caught exception, and the jump buffer used to
+ * return into the block.
+ */
+struct except_catch {
+ const except_id_t *except_id;
+ size_t except_size;
+ except_t except_obj;
+ jmp_buf except_jmp;
+};
+
+/* Kind of an exception stack node: a cleanup handler or a try block's catcher. */
+enum except_stacktype {
+ XCEPT_CLEANUP, XCEPT_CATCHER
+};
+
+/*
+ * One entry of the exception stack.  except_down links to the node below
+ * (null at the bottom); except_type selects which member of except_info is
+ * in use.
+ */
+struct except_stacknode {
+ struct except_stacknode *except_down;
+ enum except_stacktype except_type;
+ union {
+ struct except_catch *except_catcher;
+ struct except_cleanup *except_cleanup;
+ } except_info;
+};
+
+/* private functions made external so they can be used in macros */
+void except_setup_clean(struct except_stacknode *,
+ struct except_cleanup *, void (*)(void *), void *);
+void except_setup_try(struct except_stacknode *,
+ struct except_catch *, const except_id_t [], size_t);
+struct except_stacknode *except_pop(void);
+
+/* public interface functions */
+/* except_init returns 1 on success and 0 on failure; calls are counted and undone by except_deinit */
+int except_init(void);
+void except_deinit(void);
+/* the throw functions take (group, code, message); throwd adds a dynamic data pointer, throwf formats the message printf-style */
+void except_rethrow(except_t *);
+void except_throw(long, long, const char *);
+void except_throwd(long, long, const char *, void *);
+void except_throwf(long, long, const char *, ...);
+/* installs a new unhandled-exception catcher and returns the previous one */
+void (*except_unhandled_catcher(void (*)(except_t *)))(except_t *);
+/* accessors for a caught exception; except_take_data also clears the stored data pointer */
+unsigned long except_code(except_t *);
+unsigned long except_group(except_t *);
+const char *except_message(except_t *);
+void *except_data(except_t *);
+void *except_take_data(except_t *);
+/* allocator hooks: except_alloc throws instead of returning a null pointer */
+void except_set_allocator(void *(*)(size_t), void (*)(void *));
+void *except_alloc(size_t);
+void except_free(void *);
+
+/*
+ * Macro versions of the four accessor functions declared above; they read
+ * the structure members directly.  The argument is a pointer to except_t.
+ */
+#define except_code(E) ((E)->except_id.except_code)
+#define except_group(E) ((E)->except_id.except_group)
+#define except_message(E) ((E)->except_message)
+#define except_data(E) ((E)->except_dyndata)
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * void except_cleanup_push(void (*)(void *), void *);
+ * void except_cleanup_pop(int);
+ * void except_checked_cleanup_pop(void (*)(void *), int);
+ * void except_try_push(const except_id_t [], size_t, except_t **);
+ * void except_try_pop(void);
+ */
+
+/*
+ * Register cleanup handler F with argument C.  The macro opens a brace
+ * block holding the stack node and the cleanup record; the block is closed
+ * by except_cleanup_pop or except_checked_cleanup_pop, which must therefore
+ * follow in the same function at the same nesting level.
+ */
+#define except_cleanup_push(F, C) \
+ { \
+ struct except_stacknode except_sn; \
+ struct except_cleanup except_cl; \
+ except_setup_clean(&except_sn, &except_cl, F, C)
+
+/*
+ * Counterpart of except_cleanup_push: pops the top stack node, calls the
+ * registered handler when E is non-zero, and closes the brace block opened
+ * by the push.
+ */
+#define except_cleanup_pop(E) \
+ except_pop(); \
+ if (E) \
+ except_cl.except_func(except_cl.except_context); \
+ }
+
+/*
+ * Like except_cleanup_pop, but additionally asserts that the handler being
+ * popped is F.
+ */
+#define except_checked_cleanup_pop(F, E) \
+ except_pop(); \
+ assert (except_cl.except_func == (F)); \
+ if (E) \
+ except_cl.except_func(except_cl.except_context); \
+ }
+
+/*
+ * Begin a try block that catches the NUM identifiers in the array ID.
+ *
+ * The macro opens a brace block (closed by except_try_pop), registers the
+ * catcher and calls setjmp.  On the initial pass *(PPE) is set to a null
+ * pointer; when an exception is delivered to this block, execution resumes
+ * here with *(PPE) pointing at the caught exception object.
+ */
+#define except_try_push(ID, NUM, PPE) \
+ { \
+ struct except_stacknode except_sn; \
+ struct except_catch except_ch; \
+ except_setup_try(&except_sn, &except_ch, ID, NUM); \
+ if (setjmp(except_ch.except_jmp)) \
+ *(PPE) = &except_ch.except_obj; \
+ else \
+ *(PPE) = 0
+
+/*
+ * End a try block: frees the dynamic data still held by the block's
+ * exception object, pops the top stack node, and closes the brace block
+ * opened by except_try_push.
+ */
+#define except_try_pop() \
+ except_free(except_ch.except_obj.except_dyndata); \
+ except_pop(); \
+ }
+
+#endif
diff --git a/libutil/kazlib/hash.c b/libutil/kazlib/hash.c
new file mode 100644
index 0000000..2140e66
--- /dev/null
+++ b/libutil/kazlib/hash.c
@@ -0,0 +1,837 @@
+/*
+ * Hash Table Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <assert.h>
+#include <string.h>
+#define HASH_IMPLEMENTATION
+#include "hash.h"
+
+/*
+ * Geometry of a freshly created dynamic table: hash_create() allocates
+ * INIT_SIZE chain pointers and starts with INIT_MASK as the chain mask.
+ */
+#define INIT_BITS 6
+#define INIT_SIZE (1UL << (INIT_BITS)) /* must be power of two */
+#define INIT_MASK ((INIT_SIZE) - 1)
+
+/*
+ * Short aliases for the hash_-prefixed structure members declared in hash.h.
+ * These are plain object-like macros, so they rewrite every occurrence of
+ * the short name in this file, including parameter and local variable names
+ * such as ``key'', ``chain'' and ``nchains''.
+ */
+
+/* hnode_t members (``next'' is also the hscan_t member) */
+#define next hash_next
+#define key hash_key
+#define data hash_data
+#define hkey hash_hkey
+
+/* hash_t members */
+#define table hash_table
+#define nchains hash_nchains
+#define nodecount hash_nodecount
+#define maxcount hash_maxcount
+#define highmark hash_highmark
+#define lowmark hash_lowmark
+#define compare hash_compare
+#define function hash_function
+#define allocnode hash_allocnode
+#define freenode hash_freenode
+#define context hash_context
+#define mask hash_mask
+#define dynamic hash_dynamic
+
+/* hscan_t members; ``table'' repeats the identical definition given above */
+#define table hash_table
+#define chain hash_chain
+
+/* Forward declarations of the default allocator, hash and compare routines. */
+static hnode_t *hnode_alloc(void *context);
+static void hnode_free(hnode_t *node, void *context);
+static hash_val_t hash_fun_default(const void *key);
+static int hash_comp_default(const void *key1, const void *key2);
+
+/*
+ * Number of bits in hash_val_t. It stays zero until compute_bits() has run,
+ * which is how the rest of the module detects "not yet initialized".
+ */
+int hash_val_t_bit;
+
+/*
+ * Compute the number of bits in the hash_val_t type. We know that hash_val_t
+ * is an unsigned integral type. Thus the highest value it can hold is a
+ * Mersenne number (power of two, less one). We initialize a hash_val_t
+ * object with this value and then shift bits out one by one while counting.
+ * Notes:
+ * 1. HASH_VAL_T_MAX is a Mersenne number---one that is one less than a power
+ * of two. This means that its binary representation consists of all one
+ * bits, and hence ``val'' is initialized to all one bits.
+ * 2. While bits remain in val, we increment the bit count and shift it to the
+ * right, replacing the topmost bit by zero.
+ */
+
+static void compute_bits(void)
+{
+    hash_val_t remaining;
+    int width = 0;
+
+    /*
+     * Start from the all-ones value (1) and count how many right shifts
+     * it takes to drain it to zero (2).
+     */
+    for (remaining = HASH_VAL_T_MAX; remaining != 0; remaining >>= 1)
+        width++;
+
+    hash_val_t_bit = width;
+}
+
+/*
+ * Verify whether the given argument is a power of two.
+ */
+
+static int is_power_of_two(hash_val_t arg)
+{
+    /*
+     * A power of two has exactly one bit set, so clearing its lowest set
+     * bit with arg & (arg - 1) must leave zero. Zero itself has no bit set
+     * and is rejected explicitly.
+     */
+    return arg != 0 && (arg & (arg - 1)) == 0;
+}
+
+/*
+ * Compute the chain-index mask (size - 1) for a power-of-two table size
+ */
+
+static hash_val_t compute_mask(hashcount_t size)
+{
+    hash_val_t low_bits;
+
+    /* Only power-of-two sizes of at least two chains are accepted. */
+    assert (is_power_of_two(size));
+    assert (size >= 2);
+
+    /* For such a size, size - 1 has exactly the low log2(size) bits set. */
+    low_bits = size - 1;
+    return low_bits;
+}
+
+/*
+ * Initialize the table of pointers to null.
+ */
+
+static void clear_table(hash_t *hash)
+{
+    hash_val_t slot = 0;
+
+    /* Store a null chain head in every slot of the table. */
+    while (slot < hash->nchains) {
+        hash->table[slot] = NULL;
+        slot++;
+    }
+}
+
+/*
+ * Double the size of a dynamic table. This works as follows. Each chain splits
+ * into two chains: itself and a sister chain in the upper half of the table.
+ * The mask widens by one bit, exposing an additional bit of each hashed key.
+ * For each node in the original chain, the
+ * value of this newly exposed bit will decide which of the two new chains will
+ * receive the node: if the bit is 1, the chain with the higher index will have
+ * the node, otherwise the lower chain will receive the node. In this manner,
+ * the hash table will continue to function exactly as before without having to
+ * rehash any of the keys.
+ * Notes:
+ * 1. Overflow check.
+ * 2. The new number of chains is twice the old number of chains.
+ * 3. The new mask is one bit wider than the previous, revealing a
+ * new bit in all hashed keys.
+ * 4. Allocate a new table of chain pointers that is twice as large as the
+ * previous one.
+ * 5. If the reallocation was successful, we perform the rest of the growth
+ * algorithm, otherwise we do nothing.
+ * 6. The exposed_bit variable holds a mask with which each hashed key can be
+ * AND-ed to test the value of its newly exposed bit.
+ * 7. Now loop over each chain in the table and sort its nodes into two
+ * chains based on the value of each node's newly exposed hash bit.
+ * 8. The low chain replaces the current chain. The high chain goes
+ * into the corresponding sister chain in the upper half of the table.
+ * 9. We have finished dealing with the chains and nodes. We now update
+ * the various bookkeeping fields of the hash structure.
+ */
+
+static void grow_table(hash_t *hash)
+{
+    hnode_t **newtable;
+
+    assert (2 * hash->nchains > hash->nchains);                 /* 1 */
+
+    newtable = realloc(hash->table,
+            sizeof *newtable * hash->nchains * 2);              /* 4 */
+
+    if (newtable != NULL) {                                     /* 5 */
+        const hash_val_t wider = (hash->mask << 1) | 1;         /* 3 */
+        const hash_val_t exposed_bit = wider ^ hash->mask;      /* 6 */
+        hash_val_t idx;
+
+        assert (wider != hash->mask);
+
+        for (idx = 0; idx < hash->nchains; idx++) {             /* 7 */
+            /* split[0] collects the low chain, split[1] the high chain */
+            hnode_t *split[2] = { NULL, NULL };
+            hnode_t *cur = newtable[idx];
+
+            while (cur != NULL) {
+                hnode_t *following = cur->next;
+                hnode_t **head = &split[(cur->hkey & exposed_bit) != 0];
+
+                /* push the node onto the front of the selected chain */
+                cur->next = *head;
+                *head = cur;
+                cur = following;
+            }
+
+            newtable[idx] = split[0];                           /* 8 */
+            newtable[idx + hash->nchains] = split[1];
+        }
+
+        hash->table = newtable;                                 /* 9 */
+        hash->mask = wider;
+        hash->nchains *= 2;
+        hash->lowmark *= 2;
+        hash->highmark *= 2;
+    }
+    assert (hash_verify(hash));
+}
+
+/*
+ * Cut a table size in half. This is done by folding together adjacent chains
+ * and populating the lower half of the table with these chains. The chains are
+ * simply spliced together. Once this is done, the whole table is reallocated
+ * to a smaller object.
+ * Notes:
+ * 1. It is illegal to have a hash table with one slot. This would mean that
+ * hash->mask is zero, so that no bits of the hash value select a chain.
+ * Also, other things could go wrong, such as hash->lowmark becoming zero.
+ * 2. Looping over each pair of sister chains, the low_chain is set to
+ * point to the head node of the chain in the lower half of the table,
+ * and high_chain points to the head node of the sister in the upper half.
+ * 3. The intent here is to compute a pointer to the last node of the
+ * lower chain into the low_tail variable. If this chain is empty,
+ * low_tail ends up with a null value.
+ * 4. If the lower chain is not empty, we simply tack the upper chain onto it.
+ * If the upper chain is a null pointer, nothing happens.
+ * 5. Otherwise, if the low chain is empty but the high chain is not, then
+ * the high chain is simply transferred to the lower half of the table.
+ * 6. Otherwise if both chains are empty, there is nothing to do.
+ * 7. All the chain pointers are in the lower half of the table now, so
+ * we reallocate it to a smaller object. This, of course, invalidates
+ * all pointer-to-pointers which reference into the table from the
+ * first node of each chain.
+ * 8. Though it's unlikely, the reallocation may fail. In this case we
+ * pretend that the table _was_ reallocated to a smaller object.
+ * 9. Finally, update the various table parameters to reflect the new size.
+ */
+
+static void shrink_table(hash_t *hash)
+{
+    hash_val_t idx, half;
+    hnode_t **smaller;
+
+    assert (hash->nchains >= 2);                        /* 1 */
+    half = hash->nchains / 2;
+
+    for (idx = 0; idx < half; idx++) {
+        hnode_t *lower = hash->table[idx];              /* 2 */
+        hnode_t *upper = hash->table[idx + half];
+
+        if (lower != NULL) {
+            hnode_t *tail = lower;
+
+            while (tail->next != NULL)                  /* 3 */
+                tail = tail->next;
+            tail->next = upper;                         /* 4 */
+        } else if (upper != NULL) {                     /* 5 */
+            hash->table[idx] = upper;
+        } else {
+            assert (hash->table[idx] == NULL);          /* 6 */
+        }
+    }
+
+    smaller = realloc(hash->table, sizeof *smaller * half);     /* 7 */
+    if (smaller != NULL)                                /* 8 */
+        hash->table = smaller;
+
+    hash->mask >>= 1;                                   /* 9 */
+    hash->nchains = half;
+    hash->lowmark /= 2;
+    hash->highmark /= 2;
+    assert (hash_verify(hash));
+}
+
+
+/*
+ * Create a dynamic hash table. Both the hash table structure and the table
+ * itself are dynamically allocated. Furthermore, the table is extendible in
+ * that it will automatically grow as its load factor increases beyond a
+ * certain threshold.
+ * Notes:
+ * 1. If the number of bits in the hash_val_t type has not been computed yet,
+ * we do so here, because this is likely to be the first function that the
+ * user calls.
+ * 2. Allocate a hash table control structure.
+ * 3. If a hash table control structure is successfully allocated, we
+ * proceed to initialize it. Otherwise we return a null pointer.
+ * 4. We try to allocate the table of hash chains.
+ * 5. If we were able to allocate the hash chain table, we can finish
+ * initializing the hash structure and the table. Otherwise, we must
+ * backtrack by freeing the hash structure.
+ * 6. INIT_SIZE should be a power of two. The high and low marks are always set
+ * to be twice the table size and half the table size respectively. When the
+ * number of nodes in the table grows beyond the high size (beyond load
+ * factor 2), it will double in size to cut the load factor down to
+ * about 1. If the table shrinks down to or beneath load factor 0.5,
+ * it will shrink, bringing the load up to about 1. However, the table
+ * will never shrink beneath INIT_SIZE even if it's emptied.
+ * 7. This indicates that the table is dynamically allocated and dynamically
+ * resized on the fly. A table that has this value set to zero is
+ * assumed to be statically allocated and will not be resized.
+ * 8. The table of chains must be properly reset to all null pointers.
+ */
+
+hash_t *hash_create(hashcount_t maxcount, hash_comp_t compfun,
+        hash_fun_t hashfun)
+{
+    hash_t *hash;
+
+    if (hash_val_t_bit == 0)                            /* 1 */
+        compute_bits();
+
+    hash = malloc(sizeof *hash);                        /* 2 */
+    if (hash == NULL)                                   /* 3 */
+        return NULL;
+
+    hash->table = malloc(sizeof *hash->table * INIT_SIZE);      /* 4 */
+    if (hash->table == NULL) {                          /* 5 */
+        /* back out: the control structure is useless without chains */
+        free(hash);
+        return NULL;
+    }
+
+    hash->nchains = INIT_SIZE;                          /* 6 */
+    hash->highmark = INIT_SIZE * 2;
+    hash->lowmark = INIT_SIZE / 2;
+    hash->nodecount = 0;
+    hash->maxcount = maxcount;
+    hash->compare = compfun ? compfun : hash_comp_default;
+    hash->function = hashfun ? hashfun : hash_fun_default;
+    hash->allocnode = hnode_alloc;
+    hash->freenode = hnode_free;
+    hash->context = NULL;
+    hash->mask = INIT_MASK;
+    hash->dynamic = 1;                                  /* 7 */
+    clear_table(hash);                                  /* 8 */
+
+    assert (hash_verify(hash));
+    return hash;
+}
+
+/*
+ * Select a different set of node allocator routines.
+ */
+
+void hash_set_allocator(hash_t *hash, hnode_alloc_t al,
+        hnode_free_t fr, void *context)
+{
+    /* The routines may only be swapped while the table holds no nodes. */
+    assert (hash_count(hash) == 0);
+    /* Either both routines are supplied, or both are null. */
+    assert ((al == 0 && fr == 0) || (al != 0 && fr != 0));
+
+    /* A null routine selects the built-in malloc/free based default. */
+    if (al == 0)
+        al = hnode_alloc;
+    if (fr == 0)
+        fr = hnode_free;
+
+    hash->allocnode = al;
+    hash->freenode = fr;
+    hash->context = context;
+}
+
+/*
+ * Free every node in the hash using the hash->freenode() function pointer, and
+ * cause the hash to become empty.
+ */
+
+void hash_free_nodes(hash_t *hash)
+{
+    hscan_t cursor;
+    hnode_t *victim;
+
+    /*
+     * The scanner has already stepped past the node it hands out, so each
+     * node can be unlinked and released while the scan is in progress.
+     */
+    for (hash_scan_begin(&cursor, hash);
+            (victim = hash_scan_next(&cursor)) != NULL; ) {
+        hash_scan_delete(hash, victim);
+        hash->freenode(victim, hash->context);
+    }
+
+    hash->nodecount = 0;
+    clear_table(hash);
+}
+
+/*
+ * Obsolescent function for removing all nodes from a table,
+ * freeing them and then freeing the table all in one step.
+ */
+
+void hash_free(hash_t *hash)
+{
+    /* Builds with KAZLIB_OBSOLESCENT_DEBUG trap any use of this routine. */
+#if defined(KAZLIB_OBSOLESCENT_DEBUG)
+    assert ("call to obsolescent function hash_free()" && 0);
+#endif
+
+    /* First release every node, then the table and control structure. */
+    hash_free_nodes(hash);
+    hash_destroy(hash);
+}
+
+/*
+ * Free a dynamic hash table structure.
+ */
+
+void hash_destroy(hash_t *hash)
+{
+    hnode_t **chains;
+
+    assert (hash_val_t_bit != 0);
+    /* Nodes are not released here; the table must already be empty. */
+    assert (hash_isempty(hash));
+
+    chains = hash->table;
+    free(chains);
+    free(hash);
+}
+
+/*
+ * Initialize a user supplied hash structure. The user also supplies a table of
+ * chains which is assigned to the hash structure. The table is static---it
+ * will not grow or shrink.
+ * 1. See note 1. in hash_create().
+ * 2. The user supplied array of pointers hopefully contains nchains nodes.
+ * 3. See note 7. in hash_create().
+ * 4. We must dynamically compute the mask from the given power of two table
+ * size.
+ * 5. The user supplied table can't be assumed to contain null pointers,
+ * so we reset it here.
+ */
+
+hash_t *hash_init(hash_t *hash, hashcount_t maxcount,
+        hash_comp_t compfun, hash_fun_t hashfun, hnode_t **table,
+        hashcount_t nchains)
+{
+    if (hash_val_t_bit == 0)                            /* 1 */
+        compute_bits();
+
+    assert (is_power_of_two(nchains));
+
+    hash->table = table;                                /* 2 */
+    hash->nchains = nchains;
+    hash->nodecount = 0;
+    hash->maxcount = maxcount;
+    hash->compare = compfun ? compfun : hash_comp_default;
+    hash->function = hashfun ? hashfun : hash_fun_default;
+
+    /*
+     * Install the default node allocator, exactly as hash_create() does.
+     * Without this, hash_alloc_insert(), hash_delete_free(),
+     * hash_scan_delfree() and hash_free_nodes() would call through
+     * uninitialized function pointers on a table set up by hash_init().
+     * hash_set_allocator() can still be used afterwards to override them.
+     */
+    hash->allocnode = hnode_alloc;
+    hash->freenode = hnode_free;
+    hash->context = NULL;
+
+    hash->dynamic = 0;                                  /* 3 */
+    hash->mask = compute_mask(nchains);                 /* 4 */
+    clear_table(hash);                                  /* 5 */
+
+    assert (hash_verify(hash));
+
+    return hash;
+}
+
+/*
+ * Reset the hash scanner so that the next element retrieved by
+ * hash_scan_next() shall be the first element on the first non-empty chain.
+ * Notes:
+ * 1. Locate the first non empty chain.
+ * 2. If a non-empty chain is found, remember which one it is and set the next
+ * pointer to refer to its first element.
+ * 3. Otherwise if a chain is not found, set the next pointer to NULL
+ * so that hash_scan_next() shall indicate failure.
+ */
+
+void hash_scan_begin(hscan_t *scan, hash_t *hash)
+{
+    hash_val_t nchains = hash->nchains;
+    hash_val_t chain;
+
+    scan->table = hash;
+
+    /* 1 */
+
+    for (chain = 0; chain < nchains && hash->table[chain] == 0; chain++)
+        ;
+
+    if (chain < nchains) {                              /* 2 */
+        scan->chain = chain;
+        scan->next = hash->table[chain];
+    } else {                                            /* 3 */
+        /*
+         * Give the chain index a defined value even when the table is
+         * empty: hash_scan_next() reads it on every call, including the
+         * call that merely reports the end of the scan.
+         */
+        scan->chain = 0;
+        scan->next = NULL;
+    }
+}
+
+/*
+ * Retrieve the next node from the hash table, and update the pointer
+ * for the next invocation of hash_scan_next().
+ * Notes:
+ * 1. Remember the next pointer in a temporary value so that it can be
+ * returned.
+ * 2. This assertion essentially checks whether the module has been properly
+ * initialized. The first point of interaction with the module should be
+ * either hash_create() or hash_init(), both of which set hash_val_t_bit to
+ * a non zero value.
+ * 3. If the next pointer we are returning is not NULL, then the user is
+ * allowed to call hash_scan_next() again. We prepare the new next pointer
+ * for that call right now. That way the user is allowed to delete the node
+ * we are about to return, since we will no longer be needing it to locate
+ * the next node.
+ * 4. If there is a next node in the chain (next->next), then that becomes the
+ * new next node, otherwise ...
+ * 5. We have exhausted the current chain, and must locate the next subsequent
+ * non-empty chain in the table.
+ * 6. If a non-empty chain is found, the first element of that chain becomes
+ * the new next node. Otherwise there is no new next node and we set the
+ * pointer to NULL so that the next time hash_scan_next() is called, a null
+ * pointer shall be immediately returned.
+ */
+
+
+hnode_t *hash_scan_next(hscan_t *scan)
+{
+    hnode_t *current = scan->next;                      /* 1 */
+    hash_t *hash = scan->table;
+    hash_val_t idx = scan->chain + 1;
+    const hash_val_t limit = hash->nchains;
+
+    assert (hash_val_t_bit != 0);                       /* 2 */
+
+    if (current == NULL)
+        return NULL;
+
+    /* 3: line up the successor now so the caller may delete ``current'' */
+    if (current->next != NULL) {                        /* 4 */
+        scan->next = current->next;
+        return current;
+    }
+
+    while (idx < limit && hash->table[idx] == 0)        /* 5 */
+        idx++;
+
+    if (idx < limit) {                                  /* 6 */
+        scan->chain = idx;
+        scan->next = hash->table[idx];
+    } else {
+        scan->next = NULL;
+    }
+    return current;
+}
+
+/*
+ * Insert a node into the hash table.
+ * Notes:
+ * 1. It's illegal to insert more than the maximum number of nodes. The client
+ * should verify that the hash table is not full before attempting an
+ * insertion.
+ * 2. The same key may not be inserted into a table twice.
+ * 3. If the table is dynamic and the load factor is already at >= 2,
+ * grow the table.
+ * 4. We take the bottom N bits of the hash value to derive the chain index,
+ * where N is the base 2 logarithm of the size of the hash table.
+ */
+
+void hash_insert(hash_t *hash, hnode_t *node, const void *key)
+{
+    hash_val_t hashed;
+    hnode_t **bucket;
+
+    assert (hash_val_t_bit != 0);
+    assert (node->next == NULL);
+    assert (hash->nodecount < hash->maxcount);          /* 1 */
+    assert (hash_lookup(hash, key) == NULL);            /* 2 */
+
+    if (hash->dynamic && hash->nodecount >= hash->highmark)     /* 3 */
+        grow_table(hash);
+
+    /* Pick the chain only after any growth: the table may have moved. */
+    hashed = hash->function(key);
+    bucket = &hash->table[hashed & hash->mask];         /* 4 */
+
+    /* Record the key and its hash, then push onto the chain's front. */
+    node->key = key;
+    node->hkey = hashed;
+    node->next = *bucket;
+    *bucket = node;
+    hash->nodecount++;
+
+    assert (hash_verify(hash));
+}
+
+/*
+ * Find a node in the hash table and return a pointer to it.
+ * Notes:
+ * 1. We hash the key and keep the entire hash value. As an optimization, when
+ * we descend down the chain, we can compare hash values first and only if
+ * hash values match do we perform a full key comparison.
+ * 2. To locate the chain from among 2^N chains, we look at the lower N bits of
+ * the hash value by anding them with the current mask.
+ * 3. Looping through the chain, we compare the stored hash value inside each
+ * node against our computed hash. If they match, then we do a full
+ * comparison between the unhashed keys. If these match, we have located the
+ * entry.
+ */
+
+hnode_t *hash_lookup(hash_t *hash, const void *key)
+{
+    const hash_val_t wanted = hash->function(key);      /* 1 */
+    hnode_t *cand = hash->table[wanted & hash->mask];   /* 2 */
+
+    while (cand != NULL) {                              /* 3 */
+        /* cheap hash comparison first, full key comparison only on a hit */
+        if (cand->hkey == wanted && hash->compare(cand->key, key) == 0)
+            break;
+        cand = cand->next;
+    }
+
+    /* either the matching node, or NULL once the chain is exhausted */
+    return cand;
+}
+
+/*
+ * Delete the given node from the hash table. Since the chains
+ * are singly linked, we must locate the start of the node's chain
+ * and traverse.
+ * Notes:
+ * 1. The node must belong to this hash table, and its key must not have
+ * been tampered with.
+ * 2. If this deletion will take the node count below the low mark, we
+ * shrink the table now.
+ * 3. Determine which chain the node belongs to, and fetch the pointer
+ * to the first node in this chain.
+ * 4. If the node being deleted is the first node in the chain, then
+ * simply update the chain head pointer.
+ * 5. Otherwise advance to the node's predecessor, and splice out
+ * by updating the predecessor's next pointer.
+ * 6. Indicate that the node is no longer in a hash table.
+ */
+
+hnode_t *hash_delete(hash_t *hash, hnode_t *node)
+{
+    hash_val_t chain;
+    hnode_t *hptr;
+
+    assert (hash_lookup(hash, node->key) == node);      /* 1 */
+    assert (hash_val_t_bit != 0);
+
+    /*
+     * NOTE(review): lowmark is kept at half the chain count, so both tests
+     * below can only hold while lowmark exceeds INIT_SIZE. A table that has
+     * grown therefore appears never to shrink below 2 * INIT_SIZE chains.
+     * Confirm whether testing nchains instead of nodecount was intended.
+     */
+    if (hash->dynamic && hash->nodecount <= hash->lowmark
+            && hash->nodecount > INIT_SIZE)
+        shrink_table(hash);                             /* 2 */
+
+    chain = node->hkey & hash->mask;                    /* 3 */
+    hptr = hash->table[chain];
+
+    if (hptr == node) {                                 /* 4 */
+        hash->table[chain] = node->next;
+    } else {
+        /*
+         * Check each pointer before it is dereferenced, so that a node
+         * missing from its chain trips an assertion instead of a null
+         * pointer dereference in the loop condition.
+         */
+        assert (hptr != NULL);
+        while (hptr->next != node) {                    /* 5 */
+            assert (hptr->next != NULL);
+            hptr = hptr->next;
+        }
+        hptr->next = node->next;
+    }
+
+    hash->nodecount--;
+    assert (hash_verify(hash));
+
+    node->next = NULL;                                  /* 6 */
+    return node;
+}
+
+/*
+ * Allocate a node with the table's node allocator, attach the given data
+ * to it and insert it under the given key. Returns 1 on success, or 0 if
+ * the allocator returned a null pointer (in which case nothing is inserted).
+ */
+
+int hash_alloc_insert(hash_t *hash, const void *key, void *data)
+{
+    hnode_t *fresh = hash->allocnode(hash->context);
+
+    if (fresh == NULL)
+        return 0;
+
+    hnode_init(fresh, data);
+    hash_insert(hash, fresh, key);
+    return 1;
+}
+
+/*
+ * Remove the node from the table with hash_delete(), then hand it to the
+ * table's node deallocator together with the allocator context.
+ */
+
+void hash_delete_free(hash_t *hash, hnode_t *node)
+{
+    hnode_t *removed = hash_delete(hash, node);
+
+    hash->freenode(removed, hash->context);
+}
+
+/*
+ * Exactly like hash_delete, except does not trigger table shrinkage. This is to be
+ * used from within a hash table scan operation. See notes for hash_delete.
+ */
+
+hnode_t *hash_scan_delete(hash_t *hash, hnode_t *node)
+{
+    hnode_t **link;
+
+    assert (hash_lookup(hash, node->key) == node);
+    assert (hash_val_t_bit != 0);
+
+    /*
+     * Walk the pointers that lead to each node of the chain, starting with
+     * the chain head slot, until the one that refers to ``node'' is found.
+     * Redirecting that pointer handles the head and interior cases alike.
+     */
+    link = &hash->table[node->hkey & hash->mask];
+    while (*link != node)
+        link = &(*link)->next;
+    *link = node->next;
+
+    hash->nodecount--;
+    assert (hash_verify(hash));
+
+    /* mark the node as no longer belonging to any chain */
+    node->next = NULL;
+    return node;
+}
+
+/*
+ * Like hash_delete_free but based on hash_scan_delete.
+ */
+
+void hash_scan_delfree(hash_t *hash, hnode_t *node)
+{
+    /* unlink without shrinking the table, then release the node */
+    hnode_t *removed = hash_scan_delete(hash, node);
+
+    hash->freenode(removed, hash->context);
+}
+
+/*
+ * Verify whether the given object is a valid hash table. Returns 1 if all of
+ * the checks below pass, and 0 otherwise.
+ * Notes:
+ * 1. If the hash table is dynamic, verify whether the high and
+ * low expansion/shrinkage thresholds are powers of two.
+ * 2. Count all nodes in the table, and test each hash value
+ * to see whether it is correct for the node's chain.
+ */
+
+int hash_verify(hash_t *hash)
+{
+    hashcount_t seen = 0;
+    hash_val_t idx;
+
+    if (hash->dynamic) {                                /* 1 */
+        if (hash->lowmark >= hash->highmark
+                || !is_power_of_two(hash->highmark)
+                || !is_power_of_two(hash->lowmark))
+            return 0;
+    }
+
+    for (idx = 0; idx < hash->nchains; idx++) {         /* 2 */
+        const hnode_t *walker = hash->table[idx];
+
+        while (walker != NULL) {
+            /* every node must sit on the chain its hash value selects */
+            if ((walker->hkey & hash->mask) != idx)
+                return 0;
+            seen++;
+            walker = walker->next;
+        }
+    }
+
+    /* the recorded population must agree with what was actually found */
+    return seen == hash->nodecount;
+}
+
+/*
+ * Test whether the hash table is full and return 1 if this is true,
+ * 0 if it is false.
+ */
+
+#undef hash_isfull
+int hash_isfull(hash_t *hash)
+{
+    /* full means the population has reached the configured maximum */
+    if (hash->nodecount == hash->maxcount)
+        return 1;
+    return 0;
+}
+
+/*
+ * Test whether the hash table is empty and return 1 if this is true,
+ * 0 if it is false.
+ */
+
+#undef hash_isempty
+int hash_isempty(hash_t *hash)
+{
+    /* empty means no nodes are currently recorded in the table */
+    if (hash->nodecount == 0)
+        return 1;
+    return 0;
+}
+
+/*
+ * Default node allocator: obtain storage for one hnode_t from malloc().
+ * The context argument is not used.
+ */
+static hnode_t *hnode_alloc(void *context)
+{
+    return malloc(sizeof (hnode_t));
+}
+
+/*
+ * Default node deallocator: return the node's storage with free().
+ * The context argument is not used.
+ */
+static void hnode_free(hnode_t *node, void *context)
+{
+    void *storage = node;
+
+    free(storage);
+}
+
+
+/*
+ * Create a hash table node dynamically and assign it the given data.
+ */
+
+hnode_t *hnode_create(void *data)
+{
+    hnode_t *fresh = malloc(sizeof *fresh);
+
+    /* on allocation failure the null pointer is passed on to the caller */
+    if (fresh == NULL)
+        return NULL;
+
+    fresh->next = NULL;
+    fresh->data = data;
+    return fresh;
+}
+
+/*
+ * Initialize a client-supplied node
+ */
+
+hnode_t *hnode_init(hnode_t *hnode, void *data)
+{
+    /* a node that is not in any table has a null chain link */
+    hnode->next = NULL;
+    hnode->data = data;
+
+    return hnode;
+}
+
+/*
+ * Destroy a dynamically allocated node.
+ */
+
+void hnode_destroy(hnode_t *hnode)
+{
+    void *storage = hnode;
+
+    /* only the node itself is released; its key and data are untouched */
+    free(storage);
+}
+
+/* Function form of the hnode_put() macro: replace the node's data pointer. */
+#undef hnode_put
+void hnode_put(hnode_t *node, void *data)
+{
+    void *replacement = data;
+
+    node->data = replacement;
+}
+
+/* Function form of the hnode_get() macro: fetch the node's data pointer. */
+#undef hnode_get
+void *hnode_get(hnode_t *node)
+{
+    void *payload = node->data;
+
+    return payload;
+}
+
+/* Function form of the hnode_getkey() macro: fetch the node's key pointer. */
+#undef hnode_getkey
+const void *hnode_getkey(hnode_t *node)
+{
+    const void *stored = node->key;
+
+    return stored;
+}
+
+/* Function form of the hash_count() macro: number of nodes in the table. */
+#undef hash_count
+hashcount_t hash_count(hash_t *hash)
+{
+    const hashcount_t population = hash->nodecount;
+
+    return population;
+}
+
+/* Function form of the hash_size() macro: number of chains in the table. */
+#undef hash_size
+hashcount_t hash_size(hash_t *hash)
+{
+    const hashcount_t chains = hash->nchains;
+
+    return chains;
+}
+
+/*
+ * Default hashing function. The key is read as a null-terminated string of
+ * unsigned chars; each character is mixed in as two 4-bit halves through a
+ * 16-entry table, and the accumulator is kept within 32 bits.
+ */
+static hash_val_t hash_fun_default(const void *key)
+{
+    static const unsigned long randbox[] = {
+        0x49848f1bU, 0xe6255dbaU, 0x36da5bdcU, 0x47bf94e9U,
+        0x8cbcce22U, 0x559fc06aU, 0xd268f536U, 0xe10af79aU,
+        0xc1af4d69U, 0x1d2917b5U, 0xec4c304dU, 0x9ee5016cU,
+        0x69232f74U, 0xfead7bb3U, 0xe9089ab6U, 0xf012f6aeU,
+    };
+
+    const unsigned char *cursor;
+    hash_val_t acc = 0;
+
+    for (cursor = key; *cursor != 0; cursor++) {
+        const unsigned char ch = *cursor;
+
+        /* low half of the character, then rotate left by one within 32 bits */
+        acc ^= randbox[(ch + acc) & 0xf];
+        acc = (acc << 1) | (acc >> 31);
+        acc &= 0xffffffffU;
+
+        /* high half of the character, then rotate left by two within 32 bits */
+        acc ^= randbox[((ch >> 4) + acc) & 0xf];
+        acc = (acc << 2) | (acc >> 30);
+        acc &= 0xffffffffU;
+    }
+    return acc;
+}
+
+/*
+ * Default key comparison: both keys are treated as null-terminated strings
+ * and compared with strcmp(), whose result is returned unchanged.
+ */
+static int hash_comp_default(const void *key1, const void *key2)
+{
+    const char *lhs = key1;
+    const char *rhs = key2;
+
+    return strcmp(lhs, rhs);
+}
diff --git a/libutil/kazlib/hash.h b/libutil/kazlib/hash.h
new file mode 100644
index 0000000..e8213f7
--- /dev/null
+++ b/libutil/kazlib/hash.h
@@ -0,0 +1,238 @@
+/*
+ * Hash Table Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#ifndef HASH_H
+#define HASH_H
+
+#include <limits.h>
+#ifdef KAZLIB_SIDEEFFECT_DEBUG
+#include "sfx.h"
+#endif
+
+/*
+ * Blurb for inclusion into C++ translation units
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Type used for node and chain counts, and its largest value. */
+typedef unsigned long hashcount_t;
+#define HASHCOUNT_T_MAX ULONG_MAX
+
+/* Type of a hashed key as produced by a hash_fun_t, and its largest value. */
+typedef unsigned long hash_val_t;
+#define HASH_VAL_T_MAX ULONG_MAX
+
+/*
+ * Number of bits in hash_val_t. The object is defined in hash.c and is
+ * computed at run time by hash_create() or hash_init(); it is zero before
+ * either has been called.
+ */
+extern int hash_val_t_bit;
+
+/* May be predefined by the client; otherwise it reads the run-time value. */
+#ifndef HASH_VAL_T_BIT
+#define HASH_VAL_T_BIT ((int) hash_val_t_bit)
+#endif
+
+/*
+ * Hash chain node structure.
+ * Notes:
+ * 1. This preprocessing directive is for debugging purposes. The effect is
+ * that if the preprocessor symbol KAZLIB_OPAQUE_DEBUG is defined prior to the
+ * inclusion of this header, then the structure shall be declared as having
+ * the single member int hash_dummy. This way, any attempts by the
+ * client code to violate the principles of information hiding (by accessing
+ * the structure directly) can be diagnosed at translation time. However,
+ * note the resulting compiled unit is not suitable for linking.
+ * 2. This is a pointer to the next node in the chain. In the last node of a
+ * chain, this pointer is null.
+ * 3. The key is a pointer to some user supplied data that contains a unique
+ * identifier for each hash node in a given table. The interpretation of
+ * the data is up to the user. When creating or initializing a hash table,
+ * the user must supply a pointer to a function for comparing two keys,
+ * and a pointer to a function for hashing a key into a numeric value.
+ * 4. The value is a user-supplied pointer to void which may refer to
+ * any data object. It is not interpreted in any way by the hashing
+ * module.
+ * 5. The hashed key is stored in each node so that we don't have to rehash
+ * each key when the table must grow or shrink.
+ */
+
+typedef struct hnode_t {
+#if defined(HASH_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG) /* 1 */
+ struct hnode_t *hash_next; /* 2 */
+ const void *hash_key; /* 3 */
+ void *hash_data; /* 4 */
+ hash_val_t hash_hkey; /* 5 */
+#else
+ int hash_dummy; /* sole member of the opaque debugging layout (note 1) */
+#endif
+} hnode_t;
+
+/*
+ * The comparison function pointer type. A comparison function takes two keys
+ * and produces a value of -1 if the left key is less than the right key, a
+ * value of 0 if the keys are equal, and a value of 1 if the left key is
+ * greater than the right key.
+ */
+
+typedef int (*hash_comp_t)(const void *, const void *);
+
+/*
+ * The hashing function performs some computation on a key and produces an
+ * integral value of type hash_val_t based on that key. For best results, the
+ * function should have good randomness properties in *all* significant bits
+ * over the set of keys that are being inserted into a given hash table. In
+ * particular, the least significant bits of hash_val_t matter most to the
+ * hash module, because the chain index is taken from the low bits of the hash
+ * value. Only as the hash table expands are more significant bits examined.
+ * Thus a function that has good distribution in its lower bits but not its
+ * upper bits is preferable to one that has poor distribution in the lower
+ * bits but not the upper ones.
+ */
+
+typedef hash_val_t (*hash_fun_t)(const void *);
+
+/*
+ * allocator functions
+ */
+
+typedef hnode_t *(*hnode_alloc_t)(void *);
+typedef void (*hnode_free_t)(hnode_t *, void *);
+
+/*
+ * This is the hash table control structure. It keeps track of information
+ * about a hash table, as well as the hash table itself.
+ * Notes:
+ * 1. Pointer to the hash table proper. The table is an array of pointers to
+ * hash nodes (of type hnode_t). If the table is empty, every element of
+ * this table is a null pointer. A non-null entry points to the first
+ * element of a chain of nodes.
+ * 2. This member keeps track of the size of the hash table---that is, the
+ * number of chain pointers.
+ * 3. The count member maintains the number of elements that are presently
+ * in the hash table.
+ * 4. The maximum count is the greatest number of nodes that can populate this
+ * table. If the table contains this many nodes, no more can be inserted,
+ * and the hash_isfull() function returns true.
+ * 5. The high mark is a population threshold, measured as a number of nodes,
+ * which, if exceeded, will trigger a table expansion. Only dynamic hash
+ * tables are subject to this expansion.
+ * 6. The low mark is a minimum population threshold, measured as a number of
+ * nodes. If the table population drops below this value, a table shrinkage
+ * will occur. Only dynamic tables are subject to this reduction. No table
+ * will shrink beneath a certain absolute minimum number of nodes.
+ * 7. This is a pointer to the hash table's comparison function. The
+ * function is set once at initialization or creation time.
+ * 8. Pointer to the table's hashing function, set once at creation or
+ * initialization time.
+ * 9. The current hash table mask. If the size of the hash table is 2^N,
+ * this value has its low N bits set to 1, and the others clear. It is used
+ * to select bits from the result of the hashing function to compute an
+ * index into the table.
+ * 10. A flag which indicates whether the table is to be dynamically resized. It
+ * is set to 1 in dynamically allocated tables, 0 in tables that are
+ * statically allocated.
+ */
+
+typedef struct hash_t {
+#if defined(HASH_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ struct hnode_t **hash_table; /* 1 */
+ hashcount_t hash_nchains; /* 2 */
+ hashcount_t hash_nodecount; /* 3 */
+ hashcount_t hash_maxcount; /* 4 */
+ hashcount_t hash_highmark; /* 5 */
+ hashcount_t hash_lowmark; /* 6 */
+ hash_comp_t hash_compare; /* 7 */
+ hash_fun_t hash_function; /* 8 */
+ hnode_alloc_t hash_allocnode; /* node allocator, called by hash_alloc_insert() */
+ hnode_free_t hash_freenode; /* node deallocator, called by the *_free routines */
+ void *hash_context; /* passed unchanged to both allocator routines */
+ hash_val_t hash_mask; /* 9 */
+ int hash_dynamic; /* 10 */
+#else
+ int hash_dummy; /* sole member of the opaque debugging layout */
+#endif
+} hash_t;
+
+/*
+ * Hash scanner structure, used for traversals of the data structure.
+ * Notes:
+ * 1. Pointer to the hash table that is being traversed.
+ * 2. Reference to the current chain in the table being traversed (the chain
+ * that contains the next node that shall be retrieved).
+ * 3. Pointer to the node that will be retrieved by the subsequent call to
+ * hash_scan_next().
+ */
+
+typedef struct hscan_t {
+#if defined(HASH_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ hash_t *hash_table; /* 1 */
+ hash_val_t hash_chain; /* 2 */
+ hnode_t *hash_next; /* 3 */
+#else
+ int hash_dummy; /* sole member of the opaque debugging layout */
+#endif
+} hscan_t;
+
+extern hash_t *hash_create(hashcount_t, hash_comp_t, hash_fun_t);
+extern void hash_set_allocator(hash_t *, hnode_alloc_t, hnode_free_t, void *);
+extern void hash_destroy(hash_t *);
+extern void hash_free_nodes(hash_t *);
+extern void hash_free(hash_t *);
+extern hash_t *hash_init(hash_t *, hashcount_t, hash_comp_t,
+ hash_fun_t, hnode_t **, hashcount_t);
+extern void hash_insert(hash_t *, hnode_t *, const void *);
+extern hnode_t *hash_lookup(hash_t *, const void *);
+extern hnode_t *hash_delete(hash_t *, hnode_t *);
+extern int hash_alloc_insert(hash_t *, const void *, void *);
+extern void hash_delete_free(hash_t *, hnode_t *);
+
+extern void hnode_put(hnode_t *, void *);
+extern void *hnode_get(hnode_t *);
+extern const void *hnode_getkey(hnode_t *);
+extern hashcount_t hash_count(hash_t *);
+extern hashcount_t hash_size(hash_t *);
+
+extern int hash_isfull(hash_t *);
+extern int hash_isempty(hash_t *);
+
+extern void hash_scan_begin(hscan_t *, hash_t *);
+extern hnode_t *hash_scan_next(hscan_t *);
+extern hnode_t *hash_scan_delete(hash_t *, hnode_t *);
+extern void hash_scan_delfree(hash_t *, hnode_t *);
+
+extern int hash_verify(hash_t *);
+
+extern hnode_t *hnode_create(void *);
+extern hnode_t *hnode_init(hnode_t *, void *);
+extern void hnode_destroy(hnode_t *);
+
+/*
+ * Macro forms of the simple accessors declared above. They are only
+ * available when the structure layouts are visible, and expand to direct
+ * member accesses. hash.c removes each macro with #undef before defining
+ * the function of the same name.
+ */
+#if defined(HASH_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+/*
+ * hash_isfull() mentions its argument twice, so the argument expression is
+ * evaluated twice. NOTE(review): SFX_CHECK comes from sfx.h, which is not
+ * shown here; presumably it diagnoses arguments with side effects.
+ */
+#ifdef KAZLIB_SIDEEFFECT_DEBUG
+#define hash_isfull(H) (SFX_CHECK(H)->hash_nodecount == (H)->hash_maxcount)
+#else
+#define hash_isfull(H) ((H)->hash_nodecount == (H)->hash_maxcount)
+#endif
+#define hash_isempty(H) ((H)->hash_nodecount == 0)
+#define hash_count(H) ((H)->hash_nodecount)
+#define hash_size(H) ((H)->hash_nchains)
+#define hnode_get(N) ((N)->hash_data)
+#define hnode_getkey(N) ((N)->hash_key)
+#define hnode_put(N, V) ((N)->hash_data = (V))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libutil/kazlib/list.c b/libutil/kazlib/list.c
new file mode 100644
index 0000000..818b427
--- /dev/null
+++ b/libutil/kazlib/list.c
@@ -0,0 +1,766 @@
+/*
+ * List Abstract Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <assert.h>
+#define LIST_IMPLEMENTATION
+#include "list.h"
+
+#define next list_next
+#define prev list_prev
+#define data list_data
+
+#define pool list_pool
+#define fre list_free
+#define size list_size
+
+#define nilnode list_nilnode
+#define nodecount list_nodecount
+#define maxcount list_maxcount
+
+#define list_nil(L) (&(L)->nilnode)
+#define list_first_priv(L) ((L)->nilnode.next)
+#define list_last_priv(L) ((L)->nilnode.prev)
+#define lnode_next(N) ((N)->next)
+#define lnode_prev(N) ((N)->prev)
+
+/*
+ * Turn caller-provided storage into a valid empty list and return it.
+ * The nil (sentinel) node is linked to itself in both directions, the node
+ * count is cleared and maxcount is recorded as the capacity limit.  Pass
+ * LISTCOUNT_T_MAX, or alternately -1, for an ``unbounded'' list.  A maxcount
+ * of zero is rejected by assertion.
+ */
+
+list_t *list_init(list_t *list, listcount_t maxcount)
+{
+    lnode_t *sentinel;
+
+    assert (maxcount != 0);
+
+    sentinel = list_nil(list);
+    sentinel->prev = sentinel;
+    sentinel->next = sentinel;
+    list->maxcount = maxcount;
+    list->nodecount = 0;
+    return list;
+}
+
+/*
+ * Allocate a list object with malloc() and set it up as a valid empty list.
+ * maxcount is the capacity limit; LISTCOUNT_T_MAX, or alternately -1,
+ * requests an ``unbounded'' list.  Returns a null pointer when the
+ * allocation fails.
+ */
+
+list_t *list_create(listcount_t maxcount)
+{
+    list_t *fresh = malloc(sizeof *fresh);
+
+    if (fresh == NULL)
+        return NULL;
+
+    assert (maxcount != 0);
+    fresh->maxcount = maxcount;
+    fresh->nodecount = 0;
+    fresh->nilnode.prev = &fresh->nilnode;
+    fresh->nilnode.next = &fresh->nilnode;
+    return fresh;
+}
+
+/*
+ * Release a list object with free().  The list must already be empty
+ * (checked by assertion); its nodes are not touched here, so the client
+ * has to remove them beforehand.
+ */
+
+void list_destroy(list_t *list)
+{
+ assert (list_isempty(list));
+ free(list);
+}
+
+/*
+ * Destroy every node on the list with lnode_destroy() and leave the list
+ * empty.  The list must contain only dynamically allocated nodes.  The
+ * list's maxcount is carried over unchanged.
+ */
+
+void list_destroy_nodes(list_t *list)
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *cursor, *following;
+
+    for (cursor = list_first_priv(list); cursor != sentinel; cursor = following) {
+        following = cursor->next;
+        /* clear the links first: lnode_destroy() asserts the node is unlinked */
+        cursor->prev = NULL;
+        cursor->next = NULL;
+        lnode_destroy(cursor);
+    }
+
+    list_init(list, list->maxcount);
+}
+
+/*
+ * Hand every node on the list back to the given node pool with
+ * lnode_return() and leave the list empty.  All of the nodes must have
+ * come from that same pool.
+ */
+
+void list_return_nodes(list_t *list, lnodepool_t *pool)
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *current = list_first_priv(list);
+
+    while (current != sentinel) {
+        lnode_t *const following = current->next;
+
+        /* clear the links first: lnode_return() asserts the node is unlinked */
+        current->prev = NULL;
+        current->next = NULL;
+        lnode_return(pool, current);
+        current = following;
+    }
+
+    list_init(list, list->maxcount);
+}
+
+/*
+ * Link the node ``new'' into the list directly behind ``this'' node.
+ * ``this'' may be the list's nil node, which makes ``new'' the first node.
+ * Assertions check that ``new'' is not yet on a list, that ``this'' belongs
+ * to the list, and that the count neither wraps nor exceeds maxcount.
+ */
+
+void list_ins_after(list_t *list, lnode_t *new, lnode_t *this)
+{
+    lnode_t *successor = this->next;
+
+    assert (new != NULL);
+    assert (!list_contains(list, new));
+    assert (!lnode_is_in_a_list(new));
+    assert (this == list_nil(list) || list_contains(list, this));
+    assert (list->nodecount + 1 > list->nodecount);
+
+    /* set up the new node's own links, then redirect both neighbours */
+    new->next = successor;
+    new->prev = this;
+    successor->prev = new;
+    this->next = new;
+    list->nodecount += 1;
+
+    assert (list->nodecount <= list->maxcount);
+}
+
+/*
+ * Link the node ``new'' into the list directly in front of ``this'' node.
+ * ``this'' may be the list's nil node, which makes ``new'' the last node.
+ * Assertions check that ``new'' is not yet on a list, that ``this'' belongs
+ * to the list, and that the count neither wraps nor exceeds maxcount.
+ */
+
+void list_ins_before(list_t *list, lnode_t *new, lnode_t *this)
+{
+    lnode_t *predecessor = this->prev;
+
+    assert (new != NULL);
+    assert (!list_contains(list, new));
+    assert (!lnode_is_in_a_list(new));
+    assert (this == list_nil(list) || list_contains(list, this));
+    assert (list->nodecount + 1 > list->nodecount);
+
+    /* set up the new node's own links, then redirect both neighbours */
+    new->prev = predecessor;
+    new->next = this;
+    predecessor->next = new;
+    this->prev = new;
+    list->nodecount += 1;
+
+    assert (list->nodecount <= list->maxcount);
+}
+
+/*
+ * Unlink the given node from the list and return it.  The node's own links
+ * are reset to null so that it reads as being on no list.  The node must be
+ * a member of the list (checked by assertion).
+ */
+
+lnode_t *list_delete(list_t *list, lnode_t *del)
+{
+    lnode_t *after = del->next;
+    lnode_t *before = del->prev;
+
+    assert (list_contains(list, del));
+
+    /* close the gap between the two neighbours */
+    before->next = after;
+    after->prev = before;
+    list->nodecount--;
+
+    del->next = NULL;
+    del->prev = NULL;
+
+    return del;
+}
+
+/*
+ * Call the given function once for every node of the list, in order.
+ * Each call receives the list, the current node and the caller's context
+ * pointer.  The successor is fetched before the callback runs; the
+ * assertion at the top of each round catches a callback that removed that
+ * successor from the list.
+ */
+
+void list_process(list_t *list, void *context,
+        void (* function)(list_t *list, lnode_t *lnode, void *context))
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *current, *upcoming;
+
+    for (current = list_first_priv(list); current != sentinel; current = upcoming) {
+        assert (list_contains(list, current));
+        upcoming = current->next;
+        function(list, current, context);
+    }
+}
+
+/*
+ * Allocate a list node with malloc(), store the given data pointer in it
+ * and leave it unlinked.  Returns a null pointer when the allocation fails.
+ */
+
+lnode_t *lnode_create(void *data)
+{
+    lnode_t *fresh = malloc(sizeof *fresh);
+
+    if (fresh == NULL)
+        return NULL;
+
+    fresh->prev = NULL;
+    fresh->next = NULL;
+    fresh->data = data;
+    return fresh;
+}
+
+/*
+ * Set up a node whose storage is supplied by the caller: the links are
+ * cleared, the data pointer is stored, and the node is returned.
+ */
+
+lnode_t *lnode_init(lnode_t *lnode, void *data)
+{
+    lnode->prev = NULL;
+    lnode->next = NULL;
+    lnode->data = data;
+    return lnode;
+}
+
+/*
+ * Release a dynamically allocated node with free().  The node must not be
+ * linked into any list, i.e. both of its links must be null.
+ */
+
+void lnode_destroy(lnode_t *lnode)
+{
+    assert (lnode->next == NULL && lnode->prev == NULL);
+    free(lnode);
+}
+
+/*
+ * Set up a node pool object on top of a caller-supplied array of ``n''
+ * lnode_t objects.  The array elements are chained through their next
+ * links into the pool's free list, in array order.  ``n'' must not be zero.
+ */
+
+lnodepool_t *lnode_pool_init(lnodepool_t *pool, lnode_t *nodes, listcount_t n)
+{
+    lnode_t *tail;
+    lnode_t *slot;
+
+    assert (n != 0);
+
+    pool->pool = nodes;
+    pool->fre = nodes;
+    pool->size = n;
+
+    tail = nodes + (n - 1);
+    for (slot = nodes; slot != tail; slot++)
+        slot->next = slot + 1;
+
+    tail->next = NULL;
+    /* the last node has a null next link, so give it a non-null prev link
+       to keep it marked ``on list'' */
+    tail->prev = nodes;
+    return pool;
+}
+
+/*
+ * Create a dynamically allocated pool of n nodes.  Both the pool object
+ * and the node array come from malloc(); release them together with
+ * lnode_pool_destroy().  ``n'' must not be zero.
+ *
+ * Returns the new pool, or a null pointer if memory cannot be obtained or
+ * if the byte size of an n-element node array does not fit in a size_t.
+ */
+
+lnodepool_t *lnode_pool_create(listcount_t n)
+{
+    lnodepool_t *pool;
+    lnode_t *nodes;
+
+    assert (n != 0);
+
+    /* n * sizeof *nodes must not wrap around, or the array would be
+       allocated too small for the n nodes the pool then hands out */
+    if (n > (size_t) -1 / sizeof *nodes)
+        return NULL;
+
+    pool = malloc(sizeof *pool);
+    if (!pool)
+        return NULL;
+    nodes = malloc(n * sizeof *nodes);
+    if (!nodes) {
+        free(pool);
+        return NULL;
+    }
+    lnode_pool_init(pool, nodes, n);
+    return pool;
+}
+
+/*
+ * Determine whether the given node belongs to this pool.  Returns 1 if the
+ * node is one of the elements of the pool's node array, 0 otherwise.
+ */
+
+int lnode_pool_isfrom(lnodepool_t *pool, lnode_t *node)
+{
+    lnode_t *slot = pool->pool;
+    listcount_t remaining;
+
+    /* Every slot is tested for exact equality on purpose: ANSI C does not
+       allow pointers into different objects to be subtracted or compared
+       relationally, so a range check is not an option. */
+
+    for (remaining = pool->size; remaining != 0; remaining--, slot++) {
+        if (slot == node)
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Release a dynamically allocated node pool: first its node array, then
+ * the pool object itself.
+ */
+
+void lnode_pool_destroy(lnodepool_t *p)
+{
+    lnode_t *storage = p->pool;
+
+    free(storage);
+    free(p);
+}
+
+/*
+ * Take a node off the pool's free list, store the given data pointer in it
+ * and return it unlinked.  Returns a null pointer if the pool is exhausted.
+ */
+
+lnode_t *lnode_borrow(lnodepool_t *pool, void *data)
+{
+    lnode_t *taken = pool->fre;
+
+    if (taken == NULL)
+        return NULL;
+
+    pool->fre = taken->next;
+    taken->prev = NULL;
+    taken->next = NULL;
+    taken->data = data;
+    return taken;
+}
+
+/*
+ * Put a node back at the head of a pool's free list.  The node must
+ * belong to this pool and must not be linked into a list (both checked by
+ * assertion).
+ */
+
+void lnode_return(lnodepool_t *pool, lnode_t *node)
+{
+    assert (lnode_pool_isfrom(pool, node));
+    assert (!lnode_is_in_a_list(node));
+
+    /* a self-referencing prev link keeps a pooled node non-null on at
+       least one link, even when its next link is null */
+    node->prev = node;
+    node->next = pool->fre;
+    pool->fre = node;
+}
+
+/*
+ * Walk the list looking for the given node.  Returns 1 if it is found and
+ * 0 if not.  The nil node is never visited, so a list is not considered
+ * to contain its own nil node.
+ */
+
+int list_contains(list_t *list, lnode_t *node)
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *walker = list_first_priv(list);
+
+    while (walker != sentinel) {
+        if (walker == node)
+            return 1;
+        walker = lnode_next(walker);
+    }
+
+    return 0;
+}
+
+/*
+ * A more generalized variant of list_transfer. This one removes a
+ * ``slice'' from the source list and appends it to the destination
+ * list.  The slice runs from ``first'' through ``last'', both inclusive,
+ * and both must be nodes of the source list.  If either is a null
+ * pointer, nothing is done.
+ */
+
+void list_extract(list_t *dest, list_t *source, lnode_t *first, lnode_t *last)
+{
+ listcount_t moved = 1;
+
+ assert (first == NULL || list_contains(source, first));
+ assert (last == NULL || list_contains(source, last));
+
+ if (first == NULL || last == NULL)
+ return;
+
+ /* adjust the source list so that the slice is spliced out */
+
+ first->prev->next = last->next;
+ last->next->prev = first->prev;
+
+ /* graft the slice at the end of the dest list */
+
+ last->next = &dest->nilnode;
+ first->prev = dest->nilnode.prev;
+ dest->nilnode.prev->next = first;
+ dest->nilnode.prev = last;
+
+ /* count the nodes of the slice; moved starts at 1 to account for first */
+
+ while (first != last) {
+ first = first->next;
+ assert (first != list_nil(source)); /* oops, last before first! */
+ moved++;
+ }
+
+ /* assert no overflows */
+ assert (source->nodecount - moved <= source->nodecount);
+ assert (dest->nodecount + moved >= dest->nodecount);
+
+ /* assert no weirdness */
+ assert (moved <= source->nodecount);
+
+ source->nodecount -= moved;
+ dest->nodecount += moved;
+
+ /* assert list sanity */
+ assert (list_verify(source));
+ assert (list_verify(dest));
+}
+
+
+/*
+ * Split off a trailing sequence of nodes from the source list and relocate
+ * them to the tail of the destination list. The trailing sequence begins
+ * with node ``first'' and terminates with the last node of the source
+ * list. The nodes are added to the end of the new list in their original
+ * order.  If ``first'' is a null pointer, nothing is done.
+ */
+
+void list_transfer(list_t *dest, list_t *source, lnode_t *first)
+{
+ listcount_t moved = 1;
+ lnode_t *last;
+
+ assert (first == NULL || list_contains(source, first));
+
+ if (first == NULL)
+ return;
+
+ last = source->nilnode.prev;
+
+ /* cut the source list off just in front of first */
+ source->nilnode.prev = first->prev;
+ first->prev->next = &source->nilnode;
+
+ /* graft the run first..last onto the end of the dest list */
+ last->next = &dest->nilnode;
+ first->prev = dest->nilnode.prev;
+ dest->nilnode.prev->next = first;
+ dest->nilnode.prev = last;
+
+ /* count the moved nodes; moved starts at 1 to account for first */
+ while (first != last) {
+ first = first->next;
+ moved++;
+ }
+
+ /* assert no overflows */
+ assert (source->nodecount - moved <= source->nodecount);
+ assert (dest->nodecount + moved >= dest->nodecount);
+
+ /* assert no weirdness */
+ assert (moved <= source->nodecount);
+
+ source->nodecount -= moved;
+ dest->nodecount += moved;
+
+ /* assert list sanity */
+ assert (list_verify(source));
+ assert (list_verify(dest));
+}
+
+/*
+ * Merge the nodes of the sorted list ``sour'' into the sorted list
+ * ``dest''.  Node data is ordered with compare().  A source node whose data
+ * compares equal to that of a destination node is placed in front of it.
+ * Merging a list with itself does nothing.
+ */
+
+void list_merge(list_t *dest, list_t *sour,
+        int compare (const void *, const void *))
+{
+    lnode_t *dcur, *scur, *sfollow;
+    lnode_t *const dend = list_nil(dest);
+    lnode_t *const srcend = list_nil(sour);
+
+    /* Nothing to do if source and destination list are the same. */
+    if (dest == sour)
+        return;
+
+    /* overflow check */
+    assert (list_count(sour) + list_count(dest) >= list_count(sour));
+
+    /* lists must be sorted */
+    assert (list_is_sorted(sour, compare));
+    assert (list_is_sorted(dest, compare));
+
+    dcur = list_first_priv(dest);
+    scur = list_first_priv(sour);
+
+    while (dcur != dend && scur != srcend) {
+        if (compare(lnode_get(dcur), lnode_get(scur)) < 0) {
+            /* destination node sorts first: leave it and move on */
+            dcur = lnode_next(dcur);
+            continue;
+        }
+
+        sfollow = lnode_next(scur);
+        list_delete(sour, scur);
+        list_ins_before(dest, scur, dcur);
+        scur = sfollow;
+    }
+
+    /* destination exhausted first: the rest of the source goes on the end */
+    if (dcur == dend && scur != srcend)
+        list_transfer(dest, sour, scur);
+}
+
+/*
+ * Sort the list by node data, ordered with compare().  A list of more
+ * than one node is split in the middle, the trailing half is moved to a
+ * temporary list, both halves are sorted recursively and then merged back
+ * with list_merge().
+ */
+
+void list_sort(list_t *list, int compare(const void *, const void *))
+{
+    if (list_count(list) > 1) {
+        list_t extra;
+        listcount_t half = list_count(list) / 2;
+        listcount_t steps;
+        lnode_t *split = list_first_priv(list);
+
+        /* the temporary list is sized to hold exactly the trailing half */
+        list_init(&extra, list_count(list) - half);
+
+        for (steps = half; steps != 0; steps--)
+            split = lnode_next(split);
+
+        list_transfer(&extra, list, split);
+        list_sort(list, compare);
+        list_sort(&extra, compare);
+        list_merge(list, &extra, compare);
+    }
+    assert (list_is_sorted(list, compare));
+}
+
+/*
+ * Search the list front to back for the first node for which
+ * compare(node data, key) returns zero.  Returns that node, or a null
+ * pointer if there is none.
+ */
+
+lnode_t *list_find(list_t *list, const void *key, int compare(const void *, const void *))
+{
+    lnode_t *candidate = list_first_priv(list);
+
+    while (candidate != list_nil(list)) {
+        if (compare(lnode_get(candidate), key) == 0)
+            return candidate;
+        candidate = candidate->next;
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Return 1 if the list is in sorted order, 0 otherwise.  The list counts
+ * as sorted when compare() never returns a positive value for the data of
+ * two adjacent nodes; lists of zero or one node are always sorted.
+ */
+
+int list_is_sorted(list_t *list, int compare(const void *, const void *))
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *left = list_first_priv(list);
+    lnode_t *right;
+
+    if (left == sentinel)
+        return 1;
+
+    for (right = lnode_next(left); right != sentinel; right = lnode_next(right)) {
+        if (compare(lnode_get(left), lnode_get(right)) > 0)
+            return 0;
+        left = right;
+    }
+
+    return 1;
+}
+
+/*
+ * Get rid of macro functions definitions so they don't interfere
+ * with the actual definitions
+ */
+
+#undef list_isempty
+#undef list_isfull
+#undef lnode_pool_isempty
+#undef list_append
+#undef list_prepend
+#undef list_first
+#undef list_last
+#undef list_next
+#undef list_prev
+#undef list_count
+#undef list_del_first
+#undef list_del_last
+#undef lnode_put
+#undef lnode_get
+
+/*
+ * Return 1 if the list holds no nodes, 0 otherwise.
+ */
+
+int list_isempty(list_t *list)
+{
+    const listcount_t held = list->nodecount;
+
+    return held == 0;
+}
+
+/*
+ * Return 1 if the node count has reached the list's maxcount, 0 otherwise.
+ * Permitted only on bounded lists.
+ */
+
+int list_isfull(list_t *list)
+{
+    const listcount_t held = list->nodecount;
+    const listcount_t limit = list->maxcount;
+
+    return held == limit;
+}
+
+/*
+ * Return 1 if the node pool has no free node left to borrow, 0 otherwise.
+ */
+
+int lnode_pool_isempty(lnodepool_t *pool)
+{
+    const lnode_t *available = pool->fre;
+
+    return available == NULL;
+}
+
+/*
+ * Add the given node at the end of the list, i.e. directly in front of
+ * the nil node.
+ */
+
+void list_append(list_t *list, lnode_t *node)
+{
+    lnode_t *sentinel = list_nil(list);
+
+    list_ins_before(list, node, sentinel);
+}
+
+/*
+ * Add the given node at the beginning of the list, i.e. directly behind
+ * the nil node.
+ */
+
+void list_prepend(list_t *list, lnode_t *node)
+{
+    lnode_t *sentinel = list_nil(list);
+
+    list_ins_after(list, node, sentinel);
+}
+
+/*
+ * Retrieve the first node of the list, or a null pointer if the list is
+ * empty.
+ */
+
+lnode_t *list_first(list_t *list)
+{
+    lnode_t *head = list_first_priv(list);
+
+    return (head == list_nil(list)) ? NULL : head;
+}
+
+/*
+ * Retrieve the last node of the list, or a null pointer if the list is
+ * empty.
+ */
+
+lnode_t *list_last(list_t *list)
+{
+    lnode_t *tail = list_last_priv(list);
+
+    return (tail == list_nil(list)) ? NULL : tail;
+}
+
+/*
+ * Retrieve the count of nodes in the list, as recorded in the list
+ * object's node counter (the list is not walked).
+ */
+
+listcount_t list_count(list_t *list)
+{
+ return list->nodecount;
+}
+
+/*
+ * Remove the first node from the list and return it.  The node behind the
+ * nil node is handed to list_delete(), so the list must not be empty.
+ */
+
+lnode_t *list_del_first(list_t *list)
+{
+    lnode_t *head = list_first_priv(list);
+
+    return list_delete(list, head);
+}
+
+/*
+ * Remove the last node from the list and return it.  The node in front of
+ * the nil node is handed to list_delete(), so the list must not be empty.
+ */
+
+lnode_t *list_del_last(list_t *list)
+{
+    lnode_t *tail = list_last_priv(list);
+
+    return list_delete(list, tail);
+}
+
+
+/*
+ * Associate a data item with the given node, replacing whatever data
+ * pointer the node held before.
+ */
+
+void lnode_put(lnode_t *lnode, void *data)
+{
+ lnode->data = data;
+}
+
+/*
+ * Retrieve the data item associated with the node, i.e. the pointer last
+ * stored in the node's data member.
+ */
+
+void *lnode_get(lnode_t *lnode)
+{
+ return lnode->data;
+}
+
+/*
+ * Retrieve the node's successor.  If the node is the last one of the
+ * list, a null pointer is returned instead of the nil node.  The node
+ * must be a member of the list (checked by assertion).
+ */
+
+lnode_t *list_next(list_t *list, lnode_t *lnode)
+{
+    lnode_t *successor;
+
+    assert (list_contains(list, lnode));
+
+    successor = lnode->next;
+    return (successor == list_nil(list)) ? NULL : successor;
+}
+
+/*
+ * Retrieve the node's predecessor.  If the node is the first one of the
+ * list, a null pointer is returned instead of the nil node.  The node
+ * must be a member of the list (checked by assertion).
+ */
+
+lnode_t *list_prev(list_t *list, lnode_t *lnode)
+{
+    lnode_t *predecessor;
+
+    assert (list_contains(list, lnode));
+
+    predecessor = lnode->prev;
+    return (predecessor == list_nil(list)) ? NULL : predecessor;
+}
+
+/*
+ * Return 1 if the lnode is in some list, otherwise return 0.  A node
+ * counts as being in a list as soon as either of its links is non-null.
+ */
+
+int lnode_is_in_a_list(lnode_t *lnode)
+{
+    return !(lnode->next == NULL && lnode->prev == NULL);
+}
+
+
+/*
+ * Check the structural integrity of a list.  Returns 1 if the first
+ * node points back at the nil node, the recorded count does not exceed
+ * maxcount, every visited node is correctly back-linked from its
+ * successor, and walking exactly the recorded number of nodes ends on
+ * the nil node.  Returns 0 otherwise.
+ */
+
+int list_verify(list_t *list)
+{
+    lnode_t *const sentinel = list_nil(list);
+    lnode_t *walker = list_first_priv(list);
+    listcount_t remaining = list_count(list);
+
+    if (walker->prev != sentinel)
+        return 0;
+
+    if (remaining > list->maxcount)
+        return 0;
+
+    for (; walker != sentinel && remaining != 0; remaining--) {
+        if (walker->next->prev != walker)
+            return 0;
+        walker = walker->next;
+    }
+
+    /* both must run out together: nodes left over or count left over
+       means the recorded count is wrong */
+    return (remaining == 0 && walker == sentinel) ? 1 : 0;
+}
diff --git a/libutil/kazlib/list.h b/libutil/kazlib/list.h
new file mode 100644
index 0000000..97abc2f
--- /dev/null
+++ b/libutil/kazlib/list.h
@@ -0,0 +1,152 @@
+/*
+ * List Abstract Data Type
+ * Copyright (C) 1997 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#ifndef LIST_H
+#define LIST_H
+
+#include <limits.h>
+
+/*
+ * LIST_SFX_CHECK wraps macro arguments that get expanded more than once.
+ * With KAZLIB_SIDEEFFECT_DEBUG it forwards to SFX_CHECK from sfx.h;
+ * otherwise it just parenthesizes its argument.
+ */
+#ifdef KAZLIB_SIDEEFFECT_DEBUG
+#include "sfx.h"
+#define LIST_SFX_CHECK(E) SFX_CHECK(E)
+#else
+#define LIST_SFX_CHECK(E) (E)
+#endif
+
+/*
+ * Blurb for inclusion into C++ translation units
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned long listcount_t;
+#define LISTCOUNT_T_MAX ULONG_MAX
+
+/*
+ * List node.  The real members are visible when LIST_IMPLEMENTATION is
+ * defined or KAZLIB_OPAQUE_DEBUG is not; otherwise only a dummy member is
+ * declared.
+ */
+typedef struct lnode_t {
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ struct lnode_t *list_next; /* link to the following node */
+ struct lnode_t *list_prev; /* link to the preceding node */
+ void *list_data; /* client data pointer stored in the node */
+#else
+ int list_dummy;
+#endif
+} lnode_t;
+
+/*
+ * Pool of list nodes.  The real members are visible when
+ * LIST_IMPLEMENTATION is defined or KAZLIB_OPAQUE_DEBUG is not; otherwise
+ * only a dummy member is declared.
+ */
+typedef struct lnodepool_t {
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ struct lnode_t *list_pool; /* start of the node array backing the pool */
+ struct lnode_t *list_free; /* head of the free list; null when exhausted */
+ listcount_t list_size; /* number of nodes in the array */
+#else
+ int list_dummy;
+#endif
+} lnodepool_t;
+
+/*
+ * List object.  The real members are visible when LIST_IMPLEMENTATION is
+ * defined or KAZLIB_OPAQUE_DEBUG is not; otherwise only a dummy member is
+ * declared.
+ */
+typedef struct list_t {
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+ lnode_t list_nilnode; /* sentinel: its next/prev are the first/last node */
+ listcount_t list_nodecount; /* number of nodes currently on the list */
+ listcount_t list_maxcount; /* upper limit for list_nodecount */
+#else
+ int list_dummy;
+#endif
+} list_t;
+
+lnode_t *lnode_create(void *);
+lnode_t *lnode_init(lnode_t *, void *);
+void lnode_destroy(lnode_t *);
+void lnode_put(lnode_t *, void *);
+void *lnode_get(lnode_t *);
+int lnode_is_in_a_list(lnode_t *);
+
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+#define lnode_put(N, D) ((N)->list_data = (D))
+#define lnode_get(N) ((N)->list_data)
+#endif
+
+lnodepool_t *lnode_pool_init(lnodepool_t *, lnode_t *, listcount_t);
+lnodepool_t *lnode_pool_create(listcount_t);
+void lnode_pool_destroy(lnodepool_t *);
+lnode_t *lnode_borrow(lnodepool_t *, void *);
+void lnode_return(lnodepool_t *, lnode_t *);
+int lnode_pool_isempty(lnodepool_t *);
+int lnode_pool_isfrom(lnodepool_t *, lnode_t *);
+
+list_t *list_init(list_t *, listcount_t);
+list_t *list_create(listcount_t);
+void list_destroy(list_t *);
+void list_destroy_nodes(list_t *);
+void list_return_nodes(list_t *, lnodepool_t *);
+
+listcount_t list_count(list_t *);
+int list_isempty(list_t *);
+int list_isfull(list_t *);
+int list_contains(list_t *, lnode_t *);
+
+void list_append(list_t *, lnode_t *);
+void list_prepend(list_t *, lnode_t *);
+void list_ins_before(list_t *, lnode_t *, lnode_t *);
+void list_ins_after(list_t *, lnode_t *, lnode_t *);
+
+lnode_t *list_first(list_t *);
+lnode_t *list_last(list_t *);
+lnode_t *list_next(list_t *, lnode_t *);
+lnode_t *list_prev(list_t *, lnode_t *);
+
+lnode_t *list_del_first(list_t *);
+lnode_t *list_del_last(list_t *);
+lnode_t *list_delete(list_t *, lnode_t *);
+
+void list_process(list_t *, void *, void (*)(list_t *, lnode_t *, void *));
+
+int list_verify(list_t *);
+
+/*
+ * Macro forms of the simple queries declared above; they read the
+ * structure members directly.  list_isfull, list_next, list_prev,
+ * list_first and list_last expand an argument more than once, which is
+ * why one use of it is wrapped in LIST_SFX_CHECK.  list_next and list_prev
+ * yield NULL in place of the nil node, so list_first and list_last yield
+ * NULL for an empty list.
+ */
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+#define lnode_pool_isempty(P) ((P)->list_free == 0)
+#define list_count(L) ((L)->list_nodecount)
+#define list_isempty(L) ((L)->list_nodecount == 0)
+#define list_isfull(L) (LIST_SFX_CHECK(L)->list_nodecount == (L)->list_maxcount)
+#define list_next(L, N) (LIST_SFX_CHECK(N)->list_next == &(L)->list_nilnode ? NULL : (N)->list_next)
+#define list_prev(L, N) (LIST_SFX_CHECK(N)->list_prev == &(L)->list_nilnode ? NULL : (N)->list_prev)
+#define list_first(L) list_next(LIST_SFX_CHECK(L), &(L)->list_nilnode)
+#define list_last(L) list_prev(LIST_SFX_CHECK(L), &(L)->list_nilnode)
+#endif
+
+/*
+ * Macro forms of the end-of-list insert and delete operations, expressed
+ * through list_ins_before(), list_ins_after() and list_delete().  Each one
+ * expands L more than once.  list_del_first and list_del_last pick the
+ * node with list_first(L) / list_last(L), which evaluate to NULL on an
+ * empty list, so in that case NULL is what gets passed to list_delete().
+ */
+#if defined(LIST_IMPLEMENTATION) || !defined(KAZLIB_OPAQUE_DEBUG)
+#define list_append(L, N) list_ins_before(LIST_SFX_CHECK(L), N, &(L)->list_nilnode)
+#define list_prepend(L, N) list_ins_after(LIST_SFX_CHECK(L), N, &(L)->list_nilnode)
+#define list_del_first(L) list_delete(LIST_SFX_CHECK(L), list_first(L))
+#define list_del_last(L) list_delete(LIST_SFX_CHECK(L), list_last(L))
+#endif
+
+/* destination list on the left, source on the right */
+
+void list_extract(list_t *, list_t *, lnode_t *, lnode_t *);
+void list_transfer(list_t *, list_t *, lnode_t *first);
+void list_merge(list_t *, list_t *, int (const void *, const void *));
+void list_sort(list_t *, int (const void *, const void *));
+lnode_t *list_find(list_t *, const void *, int (const void *, const void *));
+int list_is_sorted(list_t *, int (const void *, const void *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libutil/kazlib/sfx.c b/libutil/kazlib/sfx.c
new file mode 100644
index 0000000..829a53d
--- /dev/null
+++ b/libutil/kazlib/sfx.c
@@ -0,0 +1,1138 @@
+/*
+ * SFX---A utility which tries to determine whether a given C expression
+ * is free of side effects. This can be used for verifying that macros which
+ * expand their arguments more than once are not being accidentally misused.
+ *
+ * Copyright (C) 1999 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "except.h"
+#include "sfx.h"
+#include "hash.h"
+#ifdef KAZLIB_POSIX_THREADS
+#include <pthread.h>
+#endif
+
+/*
+ * Exceptions
+ */
+
+#define SFX_EX 0x34DB9C4A
+#define SFX_SYNERR 1
+
+/*
+ * Cache entry
+ *
+ * NOTE(review): the code using this type is not nearby; the member notes
+ * below are read off the member names and types and should be confirmed.
+ */
+
+typedef struct {
+ hnode_t node; /* hash node; presumably links the entry into a hash table */
+ const char *expr; /* presumably the text of the cached expression */
+ sfx_rating_t eff; /* presumably the side effect rating found for expr */
+} sfx_entry_t;
+
+/*
+ * Parsing context structure.  One of these tracks the read position in
+ * the expression text and the side effect rating found so far.  It is
+ * copied by plain structure assignment for speculative parsing.
+ */
+
+typedef struct {
+ const unsigned char *start; /* first character of the expression */
+ const unsigned char *input; /* current read position */
+ size_t size; /* length of the expression plus one for the terminator */
+ sfx_rating_t eff; /* highest rating recorded via set_effect() */
+} context_t;
+
+/*
+ * Declarator type: abstract, concrete or both.  The declarator parsers
+ * accept an identifier inside the declarator only for decl_concrete and
+ * decl_both.
+ */
+
+typedef enum {
+ decl_abstract, decl_concrete, decl_both
+} decl_t;
+
+/*
+ * Point a parsing context at the start of the given null-terminated
+ * expression and clear its side effect rating.  The recorded size includes
+ * the terminating null character.
+ */
+static void init_context(context_t *ctx, const unsigned char *expr)
+{
+    ctx->start = expr;
+    ctx->input = expr;
+    ctx->eff = sfx_none;
+    ctx->size = strlen((const char *) expr) + 1;
+}
+
+/*
+ * Copy the complete parsing state (position and rating) from orig to copy.
+ */
+static void assign_context(context_t *copy, context_t *orig)
+{
+    memcpy(copy, orig, sizeof *copy);
+}
+
+/*
+ * Record a side effect rating in the context.  The stored rating only
+ * ever goes up: it is replaced when eff compares greater than it.
+ */
+static void set_effect(context_t *ctx, sfx_rating_t eff)
+{
+    assert (eff == sfx_none || eff == sfx_potential || eff == sfx_certain);
+
+    if (ctx->eff < eff) {
+        ctx->eff = eff;
+    }
+}
+
+/*
+ * Drop the rating recorded in the context back to sfx_none.
+ */
+static void reset_effect(context_t *ctx)
+{
+ ctx->eff = sfx_none;
+}
+
+/*
+ * Return the side effect rating currently recorded in the context.
+ */
+static sfx_rating_t get_effect(context_t *ctx)
+{
+ return ctx->eff;
+}
+
+/*
+ * Advance the read position past any whitespace.  Returns nonzero if the
+ * end of the input has been reached, zero if a character is waiting.
+ */
+static int skip_ws(context_t *expr)
+{
+    const unsigned char *pos = expr->input;
+
+    while (*pos != 0 && isspace(*pos))
+        pos++;
+
+    expr->input = pos;
+    return (*pos == 0);
+}
+
+/*
+ * Return the character at the read position and step past it.  At the
+ * end of the input 0 is returned and the position stays where it is.
+ */
+static int get_next(context_t *expr)
+{
+    int ch = *expr->input;
+
+    if (ch != 0)
+        expr->input += 1;
+
+    return ch;
+}
+
+/*
+ * Skip whitespace, then return the next character and step past it.
+ * Returns 0, without moving further, if only the end of input is left.
+ */
+static int get_next_skip_ws(context_t *expr)
+{
+    if (skip_ws(expr))
+        return 0;
+
+    return *expr->input++;
+}
+
+/*
+ * Return a pointer to the current read position in the expression text.
+ */
+static const unsigned char *get_ptr(context_t *expr)
+{
+ return expr->input;
+}
+
+/*
+ * Advance the read position by n characters.  The assertion checks that
+ * the current offset from the start is at most size - n, where size is the
+ * expression length plus one as set up by init_context().
+ */
+static void skip_n(context_t *ctx, size_t n)
+{
+ assert ((size_t) (ctx->input - ctx->start) <= ctx->size - n);
+ ctx->input += n;
+}
+
+/*
+ * Undo the reading of one character by stepping the read position back.
+ * A ch of 0 stands for the end of input, which get_next() never steps
+ * past, so nothing is undone in that case.  The value of ch is otherwise
+ * not used.
+ */
+static void put_back(context_t *expr, int ch)
+{
+    if (ch != 0)
+        expr->input -= 1;
+}
+
+/*
+ * Return the character at the read position without consuming it;
+ * 0 means the end of the input.
+ */
+static int peek_next(context_t *expr)
+{
+ return *expr->input;
+}
+
+/*
+ * Report a parse failure by throwing an exception with group SFX_EX and
+ * code SFX_SYNERR through except_throw().
+ * NOTE(review): callers appear to rely on this call not returning --
+ * confirm against except.h.
+ */
+static void syntax_error(void)
+{
+ except_throw(SFX_EX, SFX_SYNERR, "syntax_error");
+}
+
+/*
+ * Consume the next character, without skipping whitespace, and raise a
+ * syntax error unless it is exactly the expected one.
+ */
+static void match_hard(context_t *expr, int match)
+{
+    if (get_next(expr) != match)
+        syntax_error();
+}
+
+static void chk_comma(context_t *);
+
+/*
+ * Consume an identifier: a letter or underscore followed by any number of
+ * letters, digits and underscores.  A syntax error is raised if the first
+ * character cannot start an identifier.
+ */
+static void skip_ident(context_t *expr)
+{
+    int ch = get_next(expr);
+
+    if (ch != '_' && !isalpha(ch))
+        syntax_error();
+
+    for (ch = get_next(expr); ch == '_' || isalnum(ch); ch = get_next(expr))
+        continue;
+
+    /* the character that ended the identifier is not part of it */
+    put_back(expr, ch);
+}
+
+/*
+ * Consume a numeric constant.  The caller must have seen a digit or '.'
+ * at the read position.  Everything made of alphanumeric characters and
+ * '.' is swallowed, so suffixes and hexadecimal digits go along.  An 'e'
+ * or 'E' may be followed by a sign, in which case a digit must follow the
+ * sign or a syntax error is raised.
+ */
+static void skip_constant(context_t *expr)
+{
+ int ch = get_next(expr);
+
+ assert (isdigit(ch) || ch == '.');
+
+ do {
+ ch = get_next(expr);
+ if (ch == 'e' || ch == 'E') {
+ ch = get_next(expr);
+ if (ch == '+' || ch == '-') {
+ ch = get_next(expr);
+ if (!isdigit(ch))
+ syntax_error();
+ }
+ }
+ } while (ch != 0 && (isalnum(ch) || ch == '.'));
+
+ /* the character that ended the constant is not part of it */
+ put_back(expr, ch);
+}
+
+/*
+ * Consume a string literal, starting at its opening double quote.  The
+ * character after a backslash is skipped unexamined, so an escaped quote
+ * does not end the literal.  Running into the end of the input before the
+ * closing quote is a syntax error.
+ */
+static void skip_strlit(context_t *expr)
+{
+    int ch = get_next(expr);
+
+    assert (ch == '"');
+
+    for (;;) {
+        ch = get_next(expr);
+
+        if (ch == '\\') {
+            get_next(expr);
+            continue;
+        }
+
+        if (ch == 0 || ch == '"')
+            break;
+    }
+
+    if (ch != '"')
+        syntax_error();
+}
+
+/*
+ * Consume a character literal, starting at its opening single quote.  The
+ * character after a backslash is skipped unexamined, so an escaped quote
+ * does not end the literal.  Running into the end of the input before the
+ * closing quote is a syntax error.
+ */
+static void skip_charlit(context_t *expr)
+{
+    int ch = get_next(expr);
+
+    assert (ch == '\'');
+
+    for (;;) {
+        ch = get_next(expr);
+
+        if (ch == '\\') {
+            get_next(expr);
+            continue;
+        }
+
+        if (ch == 0 || ch == '\'')
+            break;
+    }
+
+    if (ch != '\'')
+        syntax_error();
+}
+
+/*
+ * Consume a specifier-qualifier list, treated here simply as one or more
+ * identifiers separated by whitespace.  At least one identifier is
+ * required; trailing whitespace is consumed as well.
+ */
+static void chk_spec_qual_list(context_t *expr)
+{
+    int ch;
+
+    skip_ws(expr);
+    skip_ident(expr);
+
+    skip_ws(expr);
+    ch = peek_next(expr);
+
+    while (isalpha(ch) || ch == '_') {
+        skip_ident(expr);
+        skip_ws(expr);
+        ch = peek_next(expr);
+    }
+}
+
+/*
+ * Speculative parse.  The state of expr is copied into copy and chk_func
+ * is run on the copy, so expr itself is left untouched.  If nextchar is
+ * nonzero, the next non-whitespace character after the parsed material
+ * must be nextchar.  Returns 1 if the guarded block ran to completion,
+ * 0 otherwise; on success copy holds the state after the parsed material.
+ *
+ * NOTE(review): presumably except_try_push() leaves ex null on the normal
+ * path and non-null when an SFX_EX exception raised by syntax_error() is
+ * caught -- confirm against except.h.
+ */
+static int speculate(void (*chk_func)(context_t *), context_t *expr, context_t *copy, int nextchar)
+{
+ static const except_id_t catch[] = { { SFX_EX, XCEPT_CODE_ANY } };
+ except_t *ex;
+ volatile int result = 0;
+ assign_context(copy, expr);
+
+ except_try_push(catch, 1, &ex);
+
+ if (ex == 0) {
+ chk_func(copy);
+ if (nextchar) {
+ skip_ws(copy);
+ match_hard(copy, nextchar);
+ }
+ result = 1;
+ }
+
+ except_try_pop();
+
+ return result;
+}
+
+/*
+ * Consume an optional run of '*' pointer declarators.  Each '*' may be
+ * followed by one identifier (presumably a type qualifier such as const --
+ * the identifier itself is not examined).  The run ends at the first
+ * non-'*' character, or after a '*' that is followed by something other
+ * than another '*' or an identifier.
+ */
+static void chk_pointer_opt(context_t *expr)
+{
+ for (;;) {
+ int ch = get_next_skip_ws(expr);
+
+ if (ch != '*') {
+ put_back(expr, ch);
+ break;
+ }
+
+ skip_ws(expr);
+
+ ch = peek_next(expr);
+
+ /* the next '*' is consumed by the following iteration */
+ if (ch == '*')
+ continue;
+ if (!isalpha(ch) && ch != '_')
+ break;
+
+ skip_ident(expr);
+ }
+}
+
+static void chk_decl(context_t *, decl_t);
+
+/*
+ * Consume one parameter declaration: a specifier-qualifier list followed
+ * by a declarator that may be either abstract or concrete.
+ */
+static void chk_parm_decl(context_t *expr)
+{
+ chk_spec_qual_list(expr);
+ chk_decl(expr, decl_both);
+}
+
+/*
+ * Consume a parameter type list: parameter declarations separated by
+ * commas, optionally ending in ``, ...''.  The character that ends the
+ * list is left unread.
+ */
+static void chk_parm_type_list(context_t *expr)
+{
+ for (;;) {
+ int ch;
+
+ chk_parm_decl(expr);
+
+ ch = get_next_skip_ws(expr);
+
+ if (ch != ',') {
+ put_back(expr, ch);
+ break;
+ }
+
+ ch = get_next_skip_ws(expr);
+
+ /* one '.' after the comma must be the start of an ellipsis */
+ if (ch == '.') {
+ match_hard(expr, '.');
+ match_hard(expr, '.');
+ break;
+ }
+
+ put_back(expr, ch);
+ }
+}
+
+static void chk_conditional(context_t *);
+
+/*
+ * Consume a direct declarator: any sequence of parenthesized groups and
+ * bracketed array parts, ended by an identifier (only when type is
+ * decl_concrete or decl_both) or by any other character, which is left
+ * unread.
+ */
+static void chk_direct_decl(context_t *expr, decl_t type)
+{
+ for (;;) {
+ int ch = get_next_skip_ws(expr);
+
+ if (ch == '(') {
+ skip_ws(expr);
+ ch = peek_next(expr);
+ /* '*', '(' or '[' starts a nested declarator; an identifier
+ starts a parameter type list; anything else is expected to
+ be the closing parenthesis */
+ if (ch == '*' || ch == '(' || ch == '[')
+ chk_decl(expr, type);
+ else if (isalpha(ch) || ch == '_')
+ chk_parm_type_list(expr);
+ match_hard(expr, ')');
+ } else if (ch == '[') {
+ skip_ws(expr);
+ ch = peek_next(expr);
+ /* a non-empty array bound is parsed as a conditional expression */
+ if (ch != ']')
+ chk_conditional(expr);
+ match_hard(expr, ']');
+ } else if ((type == decl_concrete || type == decl_both) && (isalpha(ch) || ch == '_')) {
+ put_back(expr, ch);
+ skip_ident(expr);
+ break;
+ } else {
+ put_back(expr, ch);
+ break;
+ }
+ }
+}
+
+/*
+ * Consume a declarator: an optional pointer part followed by an optional
+ * direct declarator.  A direct declarator is attempted when the next
+ * character is '[' or '(', or, for decl_concrete and decl_both only, when
+ * it can start an identifier.
+ */
+static void chk_decl(context_t *expr, decl_t type)
+{
+    int ch;
+    int ident_allowed = (type == decl_concrete || type == decl_both);
+
+    chk_pointer_opt(expr);
+    skip_ws(expr);
+    ch = peek_next(expr);
+
+    if (ch == '[' || ch == '(') {
+        chk_direct_decl(expr, type);
+        return;
+    }
+
+    if (ident_allowed && (isalpha(ch) || ch == '_'))
+        chk_direct_decl(expr, type);
+}
+
+/*
+ * Consume a type name: a specifier-qualifier list followed by an abstract
+ * declarator (one that carries no identifier).
+ */
+static void chk_typename(context_t *expr)
+{
+ chk_spec_qual_list(expr);
+ chk_decl(expr, decl_abstract);
+}
+
+/*
+ * Consume a primary expression: an identifier, a numeric constant, a
+ * character or string literal (optionally with an L prefix), or a
+ * parenthesized comma expression.  Anything else is a syntax error.
+ * Leading whitespace is not skipped here.
+ */
+static void chk_primary(context_t *expr)
+{
+    int ch = peek_next(expr);
+
+    if (ch == 'L') {
+        int after;
+
+        get_next(expr);
+        after = peek_next(expr);
+
+        if (after == '\'') {
+            skip_charlit(expr);
+            return;
+        }
+
+        if (after == '"') {
+            skip_strlit(expr);
+            return;
+        }
+
+        /* not a wide literal: the L starts an ordinary identifier */
+        put_back(expr, 'L');
+    }
+
+    if (isalpha(ch) || ch == '_') {
+        skip_ident(expr);
+    } else if (isdigit(ch) || ch == '.') {
+        skip_constant(expr);
+    } else if (ch == '(') {
+        get_next(expr);
+        chk_comma(expr);
+        match_hard(expr, ')');
+    } else if (ch == '\'') {
+        skip_charlit(expr);
+    } else if (ch == '"') {
+        skip_strlit(expr);
+    } else {
+        syntax_error();
+    }
+}
+
+/*
+ * Consume a postfix expression: a primary expression followed by any
+ * number of subscripts, function calls, member selections (. and ->) and
+ * postfix ++ / --.  A function call is rated as a potential side effect,
+ * postfix ++ and -- as certain ones.  The first character that cannot
+ * continue the postfix expression is left unread.
+ */
+static void chk_postfix(context_t *expr)
+{
+ chk_primary(expr);
+
+ for (;;) {
+ int ch = get_next_skip_ws(expr);
+
+ switch (ch) {
+ case '[':
+ chk_comma(expr);
+ skip_ws(expr);
+ match_hard(expr, ']');
+ continue;
+ case '(':
+ set_effect(expr, sfx_potential);
+ ch = get_next_skip_ws(expr);
+
+ if (ch != ')') {
+ put_back(expr, ch);
+ /* clever hack: parse non-empty argument list as comma expression */
+ chk_comma(expr);
+ ch = get_next_skip_ws(expr);
+ }
+
+ if (ch != ')')
+ syntax_error();
+
+ continue;
+ case '.':
+ skip_ws(expr);
+ skip_ident(expr);
+ continue;
+ case '-':
+ ch = get_next(expr);
+
+ /* a lone '-' is a binary operator: un-read both characters and stop */
+ if (ch != '-' && ch != '>') {
+ put_back(expr, ch);
+ put_back(expr, '-');
+ break;
+ }
+
+ if (ch == '>') {
+ skip_ws(expr);
+ skip_ident(expr);
+ continue;
+ }
+
+ /* what remains is postfix -- */
+ set_effect(expr, sfx_certain);
+ continue;
+ case '+':
+ ch = get_next(expr);
+ /* a lone '+' is a binary operator: un-read both characters and stop */
+ if (ch != '+') {
+ put_back(expr, ch);
+ put_back(expr, '+');
+ break;
+ }
+
+ set_effect(expr, sfx_certain);
+ continue;
+ default:
+ put_back(expr, ch);
+ break;
+ }
+ /* reached only through a break out of the switch: parsing is done */
+ break;
+ }
+}
+
+static void chk_cast(context_t *);
+
+/*
+ * Consume a unary expression: prefix ++ / --, one of the unary operators
+ * + - & * ~ !, a sizeof expression, or a postfix expression.  Prefix ++
+ * and -- are rated as certain side effects.  The operand of sizeof is
+ * parsed, but whatever rating it produces is thrown away by restoring the
+ * rating that was in force before the operand.
+ *
+ * The sizeof keyword is recognized only when it is followed by a
+ * character that cannot continue an identifier, or by the end of the
+ * input.  Upper-case letters are excluded just like lower-case letters,
+ * digits and '_', so that an identifier such as sizeofX is parsed as an
+ * identifier and not as sizeof applied to X.
+ */
+static void chk_unary(context_t *expr)
+{
+    for (;;) {
+        int nscan, ch = get_next_skip_ws(expr);
+
+        switch (ch) {
+        case '+':
+            ch = get_next(expr);
+            if (ch == '+')
+                set_effect(expr, sfx_certain);
+            else
+                put_back(expr, ch);
+            chk_cast(expr);
+            break;
+        case '-':
+            ch = get_next(expr);
+            if (ch == '-')
+                set_effect(expr, sfx_certain);
+            else
+                put_back(expr, ch);
+            chk_cast(expr);
+            break;
+        case '&': case '*': case '~': case '!':
+            chk_cast(expr);
+            break;
+        case 's':
+            put_back(expr, ch);
+            nscan = 0;
+            /* nscan becomes 7 only if "sizeof" is followed by one
+               character that cannot be part of an identifier */
+            sscanf((const char *) get_ptr(expr), "sizeof%*1[^a-zA-Z0-9_]%n", &nscan);
+
+            if (nscan == 7 || strcmp((const char *) get_ptr(expr), "sizeof") == 0) {
+                sfx_rating_t eff = get_effect(expr);
+
+                skip_n(expr, 6);
+
+                ch = get_next_skip_ws(expr);
+
+                if (ch == '(') {
+                    context_t comma, type;
+                    int iscomma = speculate(chk_comma, expr, &comma, ')');
+                    int istype = speculate(chk_typename, expr, &type, ')');
+
+                    if (!iscomma && !istype)
+                        syntax_error();
+
+                    /* a parenthesized expression wins over a type name
+                       if it parses as a complete unary expression */
+                    if (iscomma) {
+                        context_t unary;
+                        put_back(expr, ch);
+                        if (speculate(chk_unary, expr, &unary, 0)) {
+                            assign_context(expr, &unary);
+                            istype = 0;
+                        }
+                    }
+
+                    if (istype)
+                        assign_context(expr, &type);
+                } else {
+                    put_back(expr, ch);
+                    chk_unary(expr);
+                }
+
+                /* the operand of sizeof contributes no side effect */
+                reset_effect(expr);
+                set_effect(expr, eff);
+                break;
+            }
+            chk_postfix(expr);
+            break;
+        default:
+            put_back(expr, ch);
+            chk_postfix(expr);
+            break;
+        }
+
+        break;
+    }
+}
+
+/*
+ * Parse a cast expression: zero or more parenthesized type names followed
+ * by a unary expression. Because a parenthesized identifier can be either a
+ * type name or an expression, each parenthesized element is classified
+ * speculatively and the decision is taken one element later, backtracking
+ * up to two elements through cur_expr and old_expr.
+ */
+static void chk_cast(context_t *expr)
+{
+ enum {
+ parexpr, /* parenthesized expression */
+ partype, /* parenthesized type name */
+ parambig, /* ambiguity between paren expr and paren type name */
+ unary, /* unary expression */
+ plunary, /* unary expression with leading plus or minus */
+ other /* none of the above, or even end of input */
+ } curr = partype, old = partype, peek = partype;
+
+ /* history for backtracking: two cast expression elements back */
+ context_t old_expr = { 0 }, cur_expr = { 0 };
+
+ for (;;) {
+ context_t type, comma, unr;
+ int ch = get_next_skip_ws(expr);
+
+ /*
+ * Determine what the next bit of input is: parenthesized type name,
+ * expression, unary expression or what? Speculative parsing is used
+ * to test several hypotheses. For example, if something like
+ * (X)(Y) ^ 1 is seen, it will be turned, by subsequent iterations of
+ * this loop, into the codes: parambig, parambig, other.
+ */
+
+ if (ch == '(') {
+ int istype = speculate(chk_typename, expr, &type, ')');
+ int iscomma = speculate(chk_comma, expr, &comma, ')');
+
+ /* two-bit code: bit 1 = parses as a type, bit 0 = parses as an expression */
+ switch (istype << 1 | iscomma) {
+ case 0:
+ ch = get_next_skip_ws(expr);
+ if (ch == ')')
+ peek = other; /* empty parentheses */
+ else
+ syntax_error();
+ break;
+ case 1:
+ peek = parexpr;
+ break;
+ case 2:
+ peek = partype;
+ break;
+ case 3:
+ peek = parambig;
+ break;
+ }
+ put_back(expr, ch);
+ } else if (ch == 0) {
+ peek = other;
+ } else {
+ put_back(expr, ch);
+ if (speculate(chk_unary, expr, &unr, 0)) {
+ /* note that leading * and & are classified as plunary too */
+ peek = (ch == '+' || ch == '-' || ch == '*' || ch == '&') ? plunary : unary;
+ } else {
+ peek = other;
+ }
+ }
+
+ /*
+ * Okay, now we have an idea what is coming in the input. We make some
+ * sensible decision based on this and the thing we parsed previously.
+ * Either the parsing continues to grab more parenthesized things, or
+ * some decision is made to parse out the suffix material sensibly and
+ * terminate. Backtracking is used up to two elements back. For
+ * example in the case of (X)(Y) ^ 1 (parambig, parambig, other) it's
+ * necessary, upon seeing ^ 1 (other) to go back to second to last
+ * ambiguous parenthesized element (X) and terminate by parsing the
+ * (X)(Y) as a postfix expression. It cannot be a cast, because ^1
+ * isn't an expression. Unary expressions that start with + or -
+ * create an interesting ambiguity. Is (X)(Y) + 1 the addition of 1 to
+ * the result of the call to function X with parameter Y? Or is it the
+ * unary expression + 1 cast to type Y and X? The safer assumption is
+ * to go with the function call hypothesis, since that's the
+ * interpretation that may have side effects.
+ */
+
+ switch (curr) {
+ case parexpr: /* impossible cases */
+ case other:
+ case unary:
+ case plunary:
+ assert (0);
+ syntax_error();
+ /* notreached */
+ case partype:
+ switch (peek) {
+ case parexpr: /* cast in front of parenthesized expression */
+ chk_postfix(expr);
+ return;
+ case partype: /* compounding cast: keep looping */
+ break;
+ case parambig: /* type or expr: keep looping */
+ break;
+ case unary:
+ case plunary:
+ chk_unary(expr);
+ return;
+ case other: /* cast in front of non-expression! */
+ syntax_error();
+ /* notreached */
+ }
+ break;
+ case parambig:
+ switch (peek) {
+ case parexpr: /* function call */
+ assign_context(expr, &cur_expr);
+ chk_postfix(expr);
+ return;
+ case partype: /* compounding cast: keep looping */
+ break;
+ case parambig: /* type or expr: keep looping */
+ break;
+ case unary:
+ chk_unary(expr);
+ return;
+ case plunary: /* treat unary expr with + or - as additive */
+ case other:
+ if (old == parambig) {
+ /* reparse two expression-like things in a row as call */
+ assign_context(expr, &old_expr);
+ chk_postfix(expr);
+ return;
+ }
+ /* reparse expression followed by non-parenthesized
+ stuff as postfix expression */
+ assign_context(expr, &cur_expr);
+ chk_postfix(expr);
+ return; /* need more context */
+ }
+ break;
+ }
+
+ /*
+ * Still collecting parenthesized elements (only partype and parambig
+ * get here): shift the history down by one and continue after the
+ * element that was just parsed as a type name.
+ */
+ old = curr;
+ curr = peek;
+ assign_context(&old_expr, &cur_expr);
+ assign_context(&cur_expr, expr);
+ assign_context(expr, &type);
+ }
+}
+
+/*
+ * multiplicative-expression: cast expressions joined by *, / or %.
+ * An operator directly followed by '=' is not consumed here; it is put
+ * back for the caller.
+ */
+static void chk_multiplicative(context_t *expr)
+{
+    int op;
+
+    do {
+        chk_cast(expr);
+        op = get_next_skip_ws(expr);
+    } while ((op == '*' || op == '/' || op == '%') && peek_next(expr) != '=');
+
+    put_back(expr, op);
+}
+
+/*
+ * additive-expression: multiplicative expressions joined by + or -.
+ * An operator directly followed by '=' is put back for the caller.
+ */
+static void chk_additive(context_t *expr)
+{
+    int op;
+
+    do {
+        chk_multiplicative(expr);
+        op = get_next_skip_ws(expr);
+    } while ((op == '+' || op == '-') && peek_next(expr) != '=');
+
+    put_back(expr, op);
+}
+
+/*
+ * shift-expression: additive expressions joined by << or >>.
+ * A single < or > is put back for the caller, and so are both characters
+ * of a shift operator that is followed by '='.
+ */
+static void chk_shift(context_t *expr)
+{
+    for (;;) {
+        int op;
+
+        chk_additive(expr);
+        op = get_next_skip_ws(expr);
+
+        /* a shift operator is the same angle bracket twice in a row */
+        if ((op != '<' && op != '>') || peek_next(expr) != op) {
+            put_back(expr, op);
+            return;
+        }
+
+        get_next(expr);     /* consume the second bracket */
+
+        if (peek_next(expr) == '=') {
+            /* followed by '=': undo both brackets */
+            put_back(expr, op);
+            put_back(expr, op);
+            return;
+        }
+    }
+}
+
+/*
+ * relational-expression: shift expressions joined by <, >, <= or >=.
+ * A doubled bracket (<< or >>) that chk_shift left behind is put back
+ * untouched.
+ */
+static void chk_relational(context_t *expr)
+{
+    for (;;) {
+        int op;
+
+        chk_shift(expr);
+        op = get_next_skip_ws(expr);
+
+        /* stop on anything that is not a lone angle bracket */
+        if ((op != '<' && op != '>') || peek_next(expr) == op) {
+            put_back(expr, op);
+            return;
+        }
+
+        /* swallow the '=' of <= and >= */
+        if (peek_next(expr) == '=')
+            get_next(expr);
+    }
+}
+
+/*
+ * equality-expression: relational expressions joined by == or !=.
+ */
+static void chk_equality(context_t *expr)
+{
+    for (;;) {
+        int op;
+
+        chk_relational(expr);
+        op = get_next_skip_ws(expr);
+
+        if ((op == '!' || op == '=') && peek_next(expr) == '=') {
+            match_hard(expr, '=');      /* second character of the operator */
+            continue;
+        }
+
+        put_back(expr, op);
+        return;
+    }
+}
+
+/*
+ * AND-expression: equality expressions joined by a single &.
+ * An & followed by another & or by '=' is put back for the caller.
+ */
+static void chk_and(context_t *expr)
+{
+    int op;
+
+    do {
+        chk_equality(expr);
+        op = get_next_skip_ws(expr);
+    } while (op == '&' && peek_next(expr) != '&' && peek_next(expr) != '=');
+
+    put_back(expr, op);
+}
+
+/*
+ * exclusive-OR-expression: AND expressions joined by ^.
+ * A ^ followed by '=' is put back for the caller.
+ */
+static void chk_exclusive_or(context_t *expr)
+{
+    int op;
+
+    do {
+        chk_and(expr);
+        op = get_next_skip_ws(expr);
+    } while (op == '^' && peek_next(expr) != '=');
+
+    put_back(expr, op);
+}
+
+/*
+ * inclusive-OR-expression: exclusive-OR expressions joined by a single |.
+ * A | followed by another | or by '=' is put back for the caller.
+ */
+static void chk_inclusive_or(context_t *expr)
+{
+    int op;
+
+    do {
+        chk_exclusive_or(expr);
+        op = get_next_skip_ws(expr);
+    } while (op == '|' && peek_next(expr) != '|' && peek_next(expr) != '=');
+
+    put_back(expr, op);
+}
+
+/*
+ * logical-AND-expression: inclusive-OR expressions joined by &&.
+ */
+static void chk_logical_and(context_t *expr)
+{
+    int tok;
+
+    chk_inclusive_or(expr);
+
+    for (;;) {
+        tok = get_next_skip_ws(expr);
+
+        if (tok != '&' || peek_next(expr) != '&')
+            break;
+
+        match_hard(expr, '&');      /* second '&' of the operator */
+        chk_inclusive_or(expr);
+    }
+
+    put_back(expr, tok);
+}
+
+/*
+ * logical-OR-expression: logical-AND expressions joined by ||.
+ */
+static void chk_logical_or(context_t *expr)
+{
+    int tok;
+
+    chk_logical_and(expr);
+
+    for (;;) {
+        tok = get_next_skip_ws(expr);
+
+        if (tok != '|' || peek_next(expr) != '|')
+            break;
+
+        match_hard(expr, '|');      /* second '|' of the operator */
+        chk_logical_and(expr);
+    }
+
+    put_back(expr, tok);
+}
+
+/*
+ * conditional-expression: a logical-OR expression optionally followed by
+ * "? expression : conditional-expression". The right-hand nesting is
+ * handled by looping rather than by recursion.
+ */
+static void chk_conditional(context_t *expr)
+{
+    int tok;
+
+    chk_logical_or(expr);
+
+    while ((tok = get_next_skip_ws(expr)) == '?') {
+        chk_comma(expr);            /* the middle operand */
+        skip_ws(expr);
+        match_hard(expr, ':');
+        chk_logical_or(expr);       /* start of the third operand */
+    }
+
+    put_back(expr, tok);
+}
+
+/*
+ * assignment-expression: conditional expressions joined by = or by one of
+ * the compound assignment operators. Every assignment operator found rates
+ * the expression as sfx_certain.
+ */
+static void chk_assignment(context_t *expr)
+{
+    for (;;) {
+        int op;
+
+        chk_conditional(expr);
+        op = get_next_skip_ws(expr);
+
+        if (op == '=') {
+            /* plain assignment: nothing more to consume */
+        } else if (op == '<' || op == '>') {
+            /* <<= or >>=: the bracket is doubled before the '=' */
+            match_hard(expr, op);
+            match_hard(expr, '=');
+        } else if (op == '*' || op == '/' || op == '%' || op == '+' ||
+                   op == '-' || op == '&' || op == '^' || op == '|') {
+            match_hard(expr, '=');
+        } else {
+            /* not an assignment operator (this includes end of input) */
+            put_back(expr, op);
+            return;
+        }
+
+        set_effect(expr, sfx_certain);
+    }
+}
+
+/*
+ * expression: assignment expressions separated by the comma operator.
+ */
+static void chk_comma(context_t *expr)
+{
+    int tok;
+
+    do {
+        chk_assignment(expr);
+        tok = get_next_skip_ws(expr);
+    } while (tok == ',');
+
+    put_back(expr, tok);
+}
+
+/*
+ * This function returns 1 if the expression is successfully parsed,
+ * or 0 if there is a syntax error.
+ *
+ * The object pointed to by eff is set to indicate the side effect ranking of
+ * the parsed expression: sfx_none, sfx_potential and sfx_certain. These
+ * rankings mean, respectively, that there are no side effects, that there are
+ * potential side effects, or that there certainly are side effects.
+ *
+ * 0 is also returned, without writing to eff, when except_init() fails.
+ * After a syntax error, eff still receives whatever rating the context
+ * holds at that point.
+ */
+
+int sfx_determine(const char *expr, sfx_rating_t *eff)
+{
+ static const except_id_t catch[] = { { SFX_EX, XCEPT_CODE_ANY } };
+ except_t *ex;
+ context_t ctx;
+ volatile int retval = 1;
+
+ if (!except_init())
+ return 0;
+
+ init_context(&ctx, (const unsigned char *) expr);
+
+ except_try_push(catch, 1, &ex);
+
+ /*
+ * NOTE(review): except_try_push is defined elsewhere; judging by the two
+ * branches below, ex is null on the normal path and non-null when control
+ * comes back here after an exception -- confirm.
+ */
+ if (ex == 0) {
+ chk_comma(&ctx);
+ skip_ws(&ctx);
+ /* anything left over after a complete expression is an error */
+ if (peek_next(&ctx) != 0)
+ syntax_error();
+ } else {
+ /* exception caught */
+ retval = 0;
+ }
+
+ except_try_pop();
+
+ *eff = ctx.eff;
+
+ except_deinit();
+
+ return retval;
+}
+
+
+/*
+ * One-time initialization and locking for the result cache. With
+ * KAZLIB_POSIX_THREADS defined these map onto pthread_once and a mutex;
+ * otherwise a plain flag is used and the lock macros expand to nothing.
+ */
+#ifdef KAZLIB_POSIX_THREADS
+
+/* POSIX requires a pthread_once_t to be initialized with PTHREAD_ONCE_INIT */
+static pthread_once_t cache_init = PTHREAD_ONCE_INIT;
+static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+#define init_once(X, Y) pthread_once(X, Y)
+#define lock_cache() pthread_mutex_lock(&cache_mutex)
+#define unlock_cache() pthread_mutex_unlock(&cache_mutex)
+
+#else
+static int cache_init;
+
+/*
+ * Call func the first time this is reached with *once still zero, then
+ * set the flag so that later calls do nothing.
+ */
+static void init_once(int *once, void (*func)(void))
+{
+    if (*once == 0) {
+        func();
+        *once = 1;
+    }
+}
+
+#define lock_cache()
+#define unlock_cache()
+#endif
+
+static hash_t *cache;
+
+extern hash_t *hash_create(hashcount_t, hash_comp_t, hash_fun_t);
+
+/*
+ * Create the hash table that caches ratings, keyed by expression string.
+ * Zero is passed for both the comparison and the hash function.
+ * NOTE(review): presumably that selects hash_create's defaults -- confirm.
+ * The result is not checked here; cache_result tests for a null cache
+ * before using it.
+ */
+static void init_cache(void)
+{
+ cache = hash_create(HASHCOUNT_T_MAX, 0, 0);
+}
+
+/*
+ * Look up a previously cached rating for expr.
+ *
+ * Returns 1 and stores the rating through the rating pointer when an entry
+ * exists; returns 0 (leaving *rating untouched) when there is no entry or
+ * the cache could not be created.
+ */
+static int lookup_cache(const char *expr, sfx_rating_t *rating)
+{
+    hnode_t *cache_node;
+    int found = 0;
+
+    init_once(&cache_init, init_cache);
+
+    /* same guard as cache_result: the table may not have been created */
+    if (cache == 0)
+        return 0;
+
+    lock_cache();
+
+    cache_node = hash_lookup(cache, expr);
+
+    if (cache_node != 0) {
+        /*
+         * Read the entry while the lock is still held: cache_result may
+         * update the rating of an existing entry at any time.
+         */
+        sfx_entry_t *cache_entry = hnode_get(cache_node);
+        *rating = cache_entry->eff;
+        found = 1;
+    }
+
+    unlock_cache();
+
+    return found;
+}
+
+/*
+ * Record the rating for expr in the cache, overwriting the rating of an
+ * existing entry or inserting a new one.
+ *
+ * The expr pointer itself is stored as the key and in the entry; the string
+ * is not copied. Returns 1 on success, 0 if the cache does not exist or a
+ * new entry could not be allocated.
+ */
+static int cache_result(const char *expr, sfx_rating_t rating)
+{
+    hnode_t *found;
+    sfx_entry_t *entry;
+    int stored = 1;
+
+    init_once(&cache_init, init_cache);
+
+    if (cache == 0)
+        return 0;
+
+    lock_cache();
+
+    found = hash_lookup(cache, expr);
+
+    if (found != 0) {
+        /* already present: just replace the rating */
+        entry = hnode_get(found);
+        entry->eff = rating;
+    } else if ((entry = malloc(sizeof *entry)) != 0) {
+        hnode_init(&entry->node, entry);
+        entry->expr = expr;
+        entry->eff = rating;
+        hash_insert(cache, &entry->node, expr);
+    } else {
+        stored = 0;     /* out of memory: nothing was cached */
+    }
+
+    unlock_cache();
+
+    return stored;
+}
+
+
+/*
+ * Rate the expression text expr and report on stderr, tagged with the given
+ * file name and line number, if it fails to parse, may have side effects,
+ * or certainly has them. Nothing is printed for an expression rated
+ * sfx_none.
+ *
+ * The rating is looked up in the cache first and computed with
+ * sfx_determine only when no entry exists.
+ *
+ * NOTE(review): the rating is cached even when sfx_determine reports a
+ * syntax error, and any cache hit counts as success, so the syntax error
+ * is reported only on the first call for a given expression.
+ */
+void sfx_check(const char *expr, const char *file, unsigned long line)
+{
+    sfx_rating_t eff;
+    int success = lookup_cache(expr, &eff);
+
+    if (!success) {
+        success = sfx_determine(expr, &eff);
+        cache_result(expr, eff);
+    }
+
+    /* line is unsigned long, so it must be printed with %lu */
+    if (!success) {
+        fprintf(stderr, "%s:%lu: syntax error in expression \"%s\"\n",
+                file, line, expr);
+    } else if (eff == sfx_potential) {
+        fprintf(stderr, "%s:%lu: expression \"%s\" may have side effects\n",
+                file, line, expr);
+    } else if (eff == sfx_certain) {
+        fprintf(stderr, "%s:%lu: expression \"%s\" has side effects\n",
+                file, line, expr);
+    }
+}
+
+/*
+ * Store a rating for the expression text expr directly in the cache, so
+ * that a later sfx_check of the same text uses it instead of parsing.
+ * Returns what cache_result returns: 1 on success, 0 on failure.
+ * The cache keeps the expr pointer rather than a copy of the string.
+ */
+int sfx_declare(const char *expr, sfx_rating_t eff)
+{
+ return cache_result(expr, eff);
+}
+
diff --git a/libutil/kazlib/sfx.h b/libutil/kazlib/sfx.h
new file mode 100644
index 0000000..b2a485c
--- /dev/null
+++ b/libutil/kazlib/sfx.h
@@ -0,0 +1,46 @@
+/*
+ * SideChk---A utility which tries to determine whether a given C expression
+ * is free of side effects. This can be used for verifying that macros which
+ * expand their arguments more than once are not being accidentally misused.
+ *
+ * Copyright (C) 1999 Kaz Kylheku <kaz at ashi.footprints.net>
+ *
+ * Free Software License:
+ *
+ * All rights are reserved by the author, with the following exceptions:
+ * Permission is granted to freely reproduce and distribute this software,
+ * possibly in exchange for a fee, provided that this copyright notice appears
+ * intact. Permission is also granted to adapt this software to produce
+ * derivative works, as long as the modified versions carry this copyright
+ * notice and additional notices stating that the work has been modified.
+ * This source code may be translated into executable form and incorporated
+ * into proprietary software; there is no requirement for such software to
+ * contain a copyright notice related to this source.
+ *
+ */
+
+#ifndef SFX_H
+#define SFX_H
+
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Side effect rating of an expression: none, possible, or certain.
+ */
+typedef enum {
+ sfx_none, sfx_potential, sfx_certain
+} sfx_rating_t;
+
+/*
+ * sfx_determine: parse the expression text and store its rating through
+ * the second argument; returns 1 on success, 0 on a syntax error.
+ * sfx_declare: record a rating for the expression text so that sfx_check
+ * uses it; returns 1 on success, 0 on failure.
+ * sfx_check: rate the expression text and print a diagnostic on stderr,
+ * tagged with the given file name and line, unless it is free of side
+ * effects.
+ */
+int sfx_determine(const char *, sfx_rating_t *);
+int sfx_declare(const char *, sfx_rating_t);
+void sfx_check(const char *, const char *, unsigned long);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * SFX_CHECK(E) runs sfx_check on the source text of E and then yields the
+ * value of E itself. SFX_STRING(E) is just the source text of E.
+ */
+#define SFX_CHECK(E) (sfx_check(#E, __FILE__, __LINE__), (E))
+#define SFX_STRING(E) #E
+
+#endif
diff --git a/libutil/logMsg.H b/libutil/logMsg.H
new file mode 100644
index 0000000..4277ea1
--- /dev/null
+++ b/libutil/logMsg.H
@@ -0,0 +1,115 @@
+#ifndef LOGMSG_H
+#define LOGMSG_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <errno.h>
+
+#include "util.h"
+
+// Accumulates printf-style messages in a growable in-memory buffer so they
+// can be written out later in one piece, optionally echoing each message to
+// stderr as it is added.
+//
+class logMsg {
+public:
+  // toScreen -- also print every message to stderr as it is added.
+  // r        -- initial buffer size, and the minimum amount of free space
+  //             requested before each add().
+  //
+  logMsg(bool toScreen=false, uint32 r=8192) {
+    _logLen      = 0;
+    _logMax      = r;
+    _log         = new char [_logMax];
+    _resize      = r;
+    _toScreenToo = toScreen;
+  }
+
+
+  ~logMsg() {
+    delete [] _log;
+  }
+
+
+  // Change the minimum amount of free space requested before each add().
+  //
+  void setResize(uint32 r) {
+    _resize = r;
+  }
+
+
+  // Ensure that the string has at least 'moreSpace' available.
+  //
+  void resize(uint32 moreSpace) {
+    if (_logLen + moreSpace < _logMax)
+      return;
+
+    //  More than doubles the buffer, so the request always fits.
+    _logMax += _logMax + moreSpace + 1;
+    char *ll = new char [_logMax];
+    memcpy(ll, _log, sizeof(char) * _logLen);
+    delete [] _log;
+    _log = ll;
+  }
+
+
+  // Add a formatted message to the log.  The message is measured with
+  // vsnprintf() first and the buffer is grown to fit, so a message of any
+  // length is stored without writing past the end of the buffer.
+  //
+  void add(char const *fmt, ...) {
+    va_list ap;
+
+    if (_toScreenToo) {
+      va_start(ap, fmt);
+      vfprintf(stderr, fmt, ap);
+      va_end(ap);
+    }
+
+    // The ap is 'used up' by each v*printf call, so it is restarted every time.
+
+    va_start(ap, fmt);
+    int need = vsnprintf(NULL, 0, fmt, ap);
+    va_end(ap);
+
+    if (need < 0) {
+      fprintf(stderr, "logMsg::add()-- Couldn't format the message; nothing added to the log.\n");
+      return;
+    }
+
+    // Room for the message and its terminating NUL, but never less than
+    // the configured resize amount.
+    uint32 space = (uint32)need + 1;
+
+    resize((space > _resize) ? space : _resize);
+
+    va_start(ap, fmt);
+    vsnprintf(_log + _logLen, _logMax - _logLen, fmt, ap);
+    va_end(ap);
+
+    _logLen += (uint32)need;
+  }
+
+
+  // Dump the message to a file descriptor, taking care of errors and of
+  // short writes.  Exits the program if the write fails.
+  //
+  void write(int file, char const *name=0L) {
+    uint32 done = 0;
+
+    while (done < _logLen) {
+      errno = 0;
+      ssize_t n = ::write(file, _log + done, sizeof(char) * (_logLen - done));
+
+      if ((n < 0) && (errno == EINTR))
+        continue;
+
+      if (n <= 0) {
+        fprintf(stderr, "logMsg::write()-- Couldn't write to the log message file '%s': %s\n",
+                name ? name : "(unknown)",
+                strerror(errno));
+        exit(1);
+      }
+
+      done += (uint32)n;
+    }
+  }
+
+  // Dump the message to a stdio stream.  Exits the program if fewer bytes
+  // than requested were written.
+  //
+  void fwrite(FILE *file, char const *name=0L) {
+    errno = 0;
+    if (::fwrite(_log, sizeof(char), _logLen, file) != _logLen) {
+      fprintf(stderr, "logMsg::fwrite()-- Couldn't write to the log message file '%s': %s\n",
+              name ? name : "(unknown)",
+              strerror(errno));
+      exit(1);
+    }
+  }
+
+
+private:
+  uint32   _logLen;       //  bytes of message text currently stored
+  uint32   _logMax;       //  allocated size of _log
+  char    *_log;          //  the message text
+  uint32   _resize;       //  minimum free space requested before each add()
+  bool     _toScreenToo;  //  echo messages to stderr
+};
+
+
+#endif // LOGMSG_H
diff --git a/libutil/md5.c b/libutil/md5.c
new file mode 100644
index 0000000..6dd5ab6
--- /dev/null
+++ b/libutil/md5.c
@@ -0,0 +1,441 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "util.h"
+
+// The RSA MD5 implementation. Functions md5_* (at the end) are glue
+// to kmer libutil.
+
+// See RFC1321, "The MD5 Message-Digest Algorithm", R. Rivest.
+
+// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+// rights reserved.
+//
+// License to copy and use this software is granted provided that it
+// is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm" in all material mentioning or referencing this software
+// or this function.
+//
+// License is also granted to make and use derivative works provided
+// that such works are identified as "derived from the RSA Data
+// Security, Inc. MD5 Message-Digest Algorithm" in all material
+// mentioning or referencing the derived work.
+//
+// RSA Data Security, Inc. makes no representations concerning either
+// the merchantability of this software or the suitability of this
+// software for any particular purpose. It is provided "as is"
+// without express or implied warranty of any kind.
+//
+// These notices must be retained in any copies of any part of this
+// documentation and/or software.
+
+
+typedef struct {
+ uint32 state[4]; // state (ABCD)
+ uint32 count[2]; // number of bits, modulo 2^64 (lsb first)
+ unsigned char buffer[64]; // input buffer
+} MD5_CTX;
+
+
+static void MD5Init(MD5_CTX *);
+static void MD5Update(MD5_CTX *, unsigned char const *, size_t);
+static void MD5Final(unsigned char [16], MD5_CTX *);
+
+static void MD5Transform(uint32 [4], unsigned char const [64]);
+static void Encode(unsigned char *, uint32 *, unsigned int);
+static void Decode(uint32 *, unsigned char const *, unsigned int);
+
+// Constants for MD5Transform routine.
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// F, G, H and I are basic MD5 functions.  Every use of an argument is
+// parenthesized, including under the ~ operator, so that an argument which
+// is itself an expression is complemented as a whole.
+#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~(z))))
+
+// ROTATE_LEFT rotates x left n bits.
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+// Rotation is separate from addition to prevent recomputation.
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (uint32)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (uint32)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (uint32)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (uint32)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+// MD5 initialization.  Starts a new MD5 operation: clears the bit count
+// and loads the four state words with the magic initialization constants.
+//
+void MD5Init (MD5_CTX *context) {
+  static const uint32 seed[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
+  unsigned int        k;
+
+  for (k = 0; k < 4; k++)
+    context->state[k] = seed[k];
+
+  context->count[0] = 0;
+  context->count[1] = 0;
+}
+
+// MD5 block update operation.  Continues an MD5 message-digest
+// operation, processing another message block, and updating the
+// context.
+//
+//   context  -- digest state, previously set up by MD5Init
+//   input    -- the bytes to add to the digest
+//   inputLen -- number of bytes at input
+//
+// The running position in input is a size_t, and the high word of the bit
+// count is taken from the full inputLen, so lengths that do not fit in 32
+// bits are handled.
+//
+void MD5Update (MD5_CTX *context, unsigned char const *input, size_t inputLen) {
+  size_t        i = 0;
+  unsigned int  index, partLen;
+
+  // Compute number of bytes mod 64
+  index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+  // Update number of bits; carry into the high word if the low word wraps
+  if ((context->count[0] += ((uint32)inputLen << 3))
+      < ((uint32)inputLen << 3))
+    context->count[1]++;
+  context->count[1] += (uint32)(inputLen >> 29);
+
+  partLen = 64 - index;
+
+  // Transform as many times as possible.
+  if (inputLen >= partLen) {
+    // Complete the partially filled buffer first ...
+    memcpy(&context->buffer[index], input, partLen);
+    MD5Transform(context->state, context->buffer);
+
+    // ... then digest whole 64-byte blocks straight from the input.
+    for (i = partLen; i + 63 < inputLen; i += 64)
+      MD5Transform(context->state, &input[i]);
+
+    index = 0;
+  }
+
+  // Buffer remaining input
+  memcpy(&context->buffer[index], &input[i], inputLen-i);
+}
+
+// MD5 finalization.  Ends an MD5 message-digest operation: pads the
+// message, appends its length, writes the 16-byte message digest and
+// zeroizes the context.
+//
+void MD5Final (unsigned char digest[16], MD5_CTX *context) {
+  unsigned char bits[8];
+  unsigned int  used, padLen;
+
+  // Save number of bits, before the padding changes it
+  Encode (bits, context->count, 8);
+
+  // Pad out to 56 mod 64.
+  used = (unsigned int)((context->count[0] >> 3) & 0x3f);
+
+  if (used < 56)
+    padLen = 56 - used;
+  else
+    padLen = 120 - used;
+
+  MD5Update (context, PADDING, padLen);
+
+  // Append length (before padding)
+  MD5Update (context, bits, 8);
+
+  // Store state in digest
+  Encode (digest, context->state, 16);
+
+  // Zeroize sensitive information.
+  memset(context, 0, sizeof(*context));
+}
+
+// MD5 basic transformation.  Transforms state based on block.
+//
+//   state -- the four state words (ABCD); updated in place
+//   block -- one 64-byte block of input
+//
+// The block is decoded into sixteen words, run through four rounds of
+// sixteen steps each (numbered 1 to 64 below), and the result is added
+// into state.
+//
+static void MD5Transform(uint32 state[4], unsigned char const block[64]) {
+ uint32 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode(x, block, 64);
+
+ // Round 1
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); // 1
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); // 2
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); // 3
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); // 4
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); // 5
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); // 6
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); // 7
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); // 8
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); // 9
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); // 10
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); // 11
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); // 12
+ FF (a, b, c, d, x[12], S11, 0x6b901122); // 13
+ FF (d, a, b, c, x[13], S12, 0xfd987193); // 14
+ FF (c, d, a, b, x[14], S13, 0xa679438e); // 15
+ FF (b, c, d, a, x[15], S14, 0x49b40821); // 16
+
+ // Round 2
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); // 17
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); // 18
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); // 19
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); // 20
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); // 21
+ GG (d, a, b, c, x[10], S22, 0x2441453); // 22
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); // 23
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); // 24
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); // 25
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); // 26
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); // 27
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); // 28
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); // 29
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); // 30
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); // 31
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); // 32
+
+ // Round 3
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); // 33
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); // 34
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); // 35
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); // 36
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); // 37
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); // 38
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); // 39
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); // 40
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); // 41
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); // 42
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); // 43
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); // 44
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); // 45
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); // 46
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); // 47
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); // 48
+
+ // Round 4
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); // 49
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); // 50
+ II (c, d, a, b, x[14], S43, 0xab9423a7); // 51
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); // 52
+ II (a, b, c, d, x[12], S41, 0x655b59c3); // 53
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); // 54
+ II (c, d, a, b, x[10], S43, 0xffeff47d); // 55
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); // 56
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); // 57
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); // 58
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); // 59
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); // 60
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); // 61
+ II (d, a, b, c, x[11], S42, 0xbd3af235); // 62
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); // 63
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); // 64
+
+ // Add this block's result into the running state.
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ // Zeroize sensitive information.
+ memset (x, 0, sizeof(x));
+}
+
+// Encodes input (uint32) into output (unsigned char), least significant
+// byte of each word first.  len is the number of output bytes and is
+// assumed to be a multiple of 4.
+//
+static void Encode (unsigned char *output, uint32 *input, unsigned int len) {
+  unsigned int word, byte;
+
+  for (word = 0; word * 4 < len; word++)
+    for (byte = 0; byte < 4; byte++)
+      output[word * 4 + byte] = (unsigned char)((input[word] >> (8 * byte)) & 0xff);
+}
+
+// Decodes input (unsigned char) into output (uint32), taking the first
+// byte of each group of four as the least significant.  len is the number
+// of input bytes and is assumed to be a multiple of 4.
+//
+static void Decode (uint32 *output, unsigned char const *input, unsigned int len) {
+  unsigned int word, byte;
+
+  for (word = 0; word * 4 < len; word++) {
+    unsigned char const *src   = input + word * 4;
+    uint32               value = 0;
+
+    // Build the word from its most significant byte down.
+    for (byte = 4; byte-- > 0; )
+      value = (value << 8) | (uint32)src[byte];
+
+    output[word] = value;
+  }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// kmer glue functions
+//
+////////////////////////////////////////////////////////////////////////////////
+
+
+// Ordering function with a qsort()-style signature: compares two md5_s by
+// their 'a' member, then by their 'b' member.  Returns -1, 0 or 1.
+//
+int
+md5_compare(void const *a, void const *b) {
+  md5_s const *A = (md5_s const *)a;
+  md5_s const *B = (md5_s const *)b;
+
+  if (A->a != B->a)
+    return((A->a < B->a) ? -1 : 1);
+
+  if (A->b != B->b)
+    return((A->b < B->b) ? -1 : 1);
+
+  return(0);
+}
+
+static const char *md5_letters = "0123456789abcdef";
+
+// Writes the checksum as 32 lowercase hex digits ('a' first, then 'b',
+// most significant digit first) followed by a NUL.  s must have room for
+// 33 characters; s is returned.
+//
+char*
+md5_toascii(md5_s *m, char *s) {
+  int digit;
+
+  for (digit = 0; digit < 16; digit++) {
+    int shift = 4 * (15 - digit);
+
+    s[digit     ] = md5_letters[(m->a >> shift) & 0x0f];
+    s[digit + 16] = md5_letters[(m->b >> shift) & 0x0f];
+  }
+
+  s[32] = 0;
+
+  return(s);
+}
+
+// Computes the MD5 checksum of the l bytes at s.
+//
+//   m -- where to store the result; if NULL, a new md5_s is allocated with
+//        malloc() and the caller is responsible for freeing it.
+//
+// Returns m (or the newly allocated md5_s).  The first eight digest bytes
+// are packed into 'a' and the last eight into 'b', first byte most
+// significant.  Exits the program if the allocation fails.
+//
+md5_s*
+md5_string(md5_s *m, char *s, uint32 l) {
+  MD5_CTX        ctx;
+  unsigned char  dig[16];
+  int            i;
+
+  if (m == NULL) {
+    //  Test the returned pointer; malloc() is not required to set errno.
+    m = (md5_s *)malloc(sizeof(md5_s));
+    if (m == NULL) {
+      fprintf(stderr, "md5_string()-- Can't allocate a md5_s.\n%s\n", strerror(errno));
+      exit(1);
+    }
+  }
+
+  MD5Init(&ctx);
+  MD5Update(&ctx, (unsigned char*)s, l);
+  MD5Final(dig, &ctx);
+
+  m->a = 0;
+  for (i=0; i<8; i++) {
+    m->a <<= 8;
+    m->a |= dig[i];
+  }
+
+  m->b = 0;
+  for (i=8; i<16; i++) {
+    m->b <<= 8;
+    m->b |= dig[i];
+  }
+
+  return(m);
+}
+
+// Allocates a md5_increment_s together with its MD5 context, starts a new
+// MD5 operation and empties the character buffer.  Exits the program if
+// either allocation fails.
+//
+static
+md5_increment_s*
+md5_increment_initialize(void) {
+  md5_increment_s *m;
+
+  //  Test the returned pointers; malloc() is not required to set errno.
+
+  m = (md5_increment_s *)malloc(sizeof(md5_increment_s));
+  if (m == NULL) {
+    fprintf(stderr, "md5_increment_*()-- Can't allocate a md5_increment_s.\n%s\n", strerror(errno));
+    exit(1);
+  }
+
+  m->context = (MD5_CTX *)malloc(sizeof(MD5_CTX));
+  if (m->context == NULL) {
+    fprintf(stderr, "md5_increment_*()-- Can't allocate a md5 context.\n%s\n", strerror(errno));
+    exit(1);
+  }
+  MD5Init((MD5_CTX *)m->context);
+
+  m->bufferPos = 0;
+
+  return(m);
+}
+
+// Adds one character to an incremental checksum.  Pass NULL for m to start
+// a new checksum.  Characters are collected in the buffer and handed to
+// MD5Update only when MD5_BUFFER_SIZE of them have accumulated.  Returns
+// the (possibly newly allocated) state.
+//
+md5_increment_s*
+md5_increment_char(md5_increment_s *m, char s) {
+  md5_increment_s *state = (m != NULL) ? m : md5_increment_initialize();
+
+  state->buffer[state->bufferPos] = s;
+  state->bufferPos++;
+
+  if (state->bufferPos == MD5_BUFFER_SIZE) {
+    MD5Update((MD5_CTX *)state->context, state->buffer, state->bufferPos);
+    state->bufferPos = 0;
+  }
+
+  return(state);
+}
+
+// Adds the l bytes at s to an incremental checksum.  Pass NULL for m to
+// start a new checksum.  Returns the (possibly newly allocated) state.
+//
+// Characters still waiting in the md5_increment_char() buffer are digested
+// first, so mixing the two functions hashes the bytes in the order they
+// were supplied.
+//
+md5_increment_s*
+md5_increment_block(md5_increment_s *m, char *s, uint32 l) {
+
+  if (m == NULL)
+    m = md5_increment_initialize();
+
+  if (m->bufferPos > 0) {
+    MD5Update((MD5_CTX *)m->context, m->buffer, m->bufferPos);
+    m->bufferPos = 0;
+  }
+
+  MD5Update((MD5_CTX *)m->context, (unsigned char*)s, l);
+
+  return(m);
+}
+
+// Finishes an incremental checksum: digests any characters still in the
+// buffer, stores the result in m->a and m->b (first digest byte most
+// significant), then frees the MD5 context and clears m->context.
+//
+void
+md5_increment_finalize(md5_increment_s *m) {
+  MD5_CTX       *ctx = (MD5_CTX *)m->context;
+  unsigned char  dig[16];
+  int            k;
+
+  if (m->bufferPos > 0) {
+    MD5Update(ctx, m->buffer, m->bufferPos);
+    m->bufferPos = 0;
+  }
+
+  MD5Final(dig, ctx);
+
+  //  Mirrors the original packing: seed with the first byte, then shift
+  //  in all eight; the seed falls off the top of 'a'.
+  m->a = dig[0];
+  for (k = 0; k < 8; k++) {
+    m->a <<= 8;
+    m->a |= dig[k];
+  }
+
+  m->b = dig[8];
+  for (k = 9; k < 16; k++) {
+    m->b <<= 8;
+    m->b |= dig[k];
+  }
+
+  m->context = 0L;
+
+  free(ctx);
+}
+
+// Releases an incremental checksum.  Also frees the MD5 context if
+// md5_increment_finalize() was never called; after finalize the context
+// pointer is already null, and freeing a null pointer does nothing.
+// Passing NULL for m is allowed.
+//
+void
+md5_increment_destroy(md5_increment_s *m) {
+  if (m == NULL)
+    return;
+
+  free(m->context);
+  free(m);
+}
diff --git a/libutil/mt19937ar/Make.include b/libutil/mt19937ar/Make.include
new file mode 100644
index 0000000..a9ead6d
--- /dev/null
+++ b/libutil/mt19937ar/Make.include
@@ -0,0 +1,23 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../../libutil/)/
+MTDIR/ :=${LIBUTL/}mt19937ar/
+
+$/.C_EXES := $/mt19937ar-test
+$/.C_SRCS := $/mt19937ar.c $/test.c
+$/.C_LIBS := $/libmt19937ar.a
+
+$/.CLEAN := $/*.o $/test.c $/diffs
+$/.REAL-CLEAN := $/*.o $/test.c $/diffs $/mt19937ar-test
+
+$/libmt19937ar.a: $/mt19937ar.o $/test.o
+
+$/mt19937ar-test: $/mt19937ar.o $/mt19937ar-test.o
+
+# Self-test: run the test program and save any differences from the
+# reference output mt19937ar.out in 'diffs', report pass or fail depending
+# on whether 'diffs' is non-empty, touch test.c, then run the comparison
+# once more with its output and exit status left visible to make.
+$/test.c: $/mt19937ar-test
+ ${MTDIR/}mt19937ar-test | diff - ${MTDIR/}mt19937ar.out > ${MTDIR/}diffs 2>&1
+ if test -s ${MTDIR/}diffs ; then echo 'MT19937: TEST FAILED'; else echo 'MT19937: Test Passed'; fi
+ touch ${MTDIR/}test.c
+ ${MTDIR/}mt19937ar-test | diff - ${MTDIR/}mt19937ar.out
+
+#$(eval $/%.d $/%.o: CFLAGS+= -I..)
diff --git a/libutil/mt19937ar/mt19937ar-test.c b/libutil/mt19937ar/mt19937ar-test.c
new file mode 100644
index 0000000..eef7b12
--- /dev/null
+++ b/libutil/mt19937ar/mt19937ar-test.c
@@ -0,0 +1,38 @@
+#include "mt19937ar.h"
+
+// The MD5 checksum of the correct output is
+// cb33e6acc162cbe20f7fcac26adddd02
+// and it is 22465 bytes long.
+//
+// but we cannot use md5, as it's in libbri, and
+// so is this...
+
+// Prints a fixed sequence of generator outputs so that it can be compared
+// against a reference file: 1000 32-bit integers, 1000 reals, then 999
+// 64-bit integers.  The generator is seeded from a four-word key.
+int main(void) {
+ int i;
+ uint32 init[4] = {0x123, 0x234, 0x345, 0x456};
+ uint32 length = 4;
+ mt_s *ctx = mtInitArray(init, length);
+
+ printf("1000 outputs of genrand_int32()\n");
+
+ // five values per line
+ for (i=0; i<1000; i++) {
+ printf(uint32FMTW(10)" ", mtRandom32(ctx));
+ if (i%5==4) printf("\n");
+ }
+
+ printf("\n1000 outputs of genrand_real2()\n");
+
+ for (i=0; i<1000; i++) {
+ printf("%10.8f ", mtRandomRealOpen(ctx));
+ if (i%5==4) printf("\n");
+ }
+
+
+
+ // 64-bit outputs, three per line, printed without a heading line
+ for (i=0; i<999; i++) {
+ printf(uint64HEX" ", mtRandom64(ctx));
+ if (i%3==2) printf("\n");
+ }
+
+ return 0;
+}
diff --git a/libutil/mt19937ar/mt19937ar.c b/libutil/mt19937ar/mt19937ar.c
new file mode 100644
index 0000000..e2f2fec
--- /dev/null
+++ b/libutil/mt19937ar/mt19937ar.c
@@ -0,0 +1,189 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+*/
+
+#include "mt19937ar.h"
+
+#include <stdlib.h>
+#include <math.h>
+
+
+// Buried in genrand_int32 was this:
+// if init_genrand() has not been called,
+// a default initial seed is used
+//
+// if (ctx->mti == N+1)
+// init_genrand(5489UL);
+//
+// But we don't need that anymore, as we require for
+// thread-safety that init_genrand be called.
+
+
+
+
+// Allocate a generator and seed it from the single 32-bit value s.
+// Returns NULL when malloc() fails; on success mti is left at MT_N.
+mt_s*
+mtInit(uint32 s) {
+  uint32  idx;
+  mt_s   *gen = (mt_s *)malloc(sizeof(mt_s));
+
+  if (gen == NULL)
+    return(NULL);
+
+  // Multiplier from Knuth, TAOCP Vol. 2, 3rd Ed., p. 106.  Earlier versions
+  // let the MSBs of the seed affect only the MSBs of mt[] (changed 2002/01/09).
+  gen->mt[0] = s;
+  for (idx = 1; idx < MT_N; idx++)
+    gen->mt[idx] = (1812433253UL * (gen->mt[idx-1] ^ (gen->mt[idx-1] >> 30)) + idx);
+
+  gen->mti      = idx;          // idx == MT_N once the loop has finished
+  gen->mag01[0] = uint32ZERO;
+  gen->mag01[1] = MT_MATRIX_A;
+
+  return(gen);
+}
+
+
+
+
+// Initialize from the key_length seed words in init_key.  Returns NULL if
+// the context cannot be allocated.  An empty key (NULL pointer or zero
+// length) mixes in no key words instead of reading init_key[0].
+// (slight change for C++, 2004/2/26)
+mt_s*
+mtInitArray(uint32 *init_key, uint32 key_length) {
+  mt_s   *ctx = mtInit(19650218UL);
+  uint32  len = (init_key != NULL) ? key_length : 0;
+  uint32  i = 1, j = 0, k = (MT_N > len) ? MT_N : len;
+
+  if (ctx == NULL)   // mtInit() could not allocate; nothing to seed
+    return(NULL);
+
+  for (; k; k--) {
+    ctx->mt[i] = (ctx->mt[i] ^ ((ctx->mt[i-1] ^ (ctx->mt[i-1] >> 30)) * 1664525UL)) + (len ? init_key[j] : 0) + j; /* non linear */
+    i++;
+    j++;
+    if (i >= MT_N) {
+      ctx->mt[0] = ctx->mt[MT_N-1];
+      i = 1;
+    }
+    if (j >= len)
+      j = 0;
+  }
+  for (k = MT_N-1; k; k--) {
+    ctx->mt[i] = (ctx->mt[i] ^ ((ctx->mt[i-1] ^ (ctx->mt[i-1] >> 30)) * 1566083941UL)) - i; /* non linear */
+    i++;
+    if (i >= MT_N) {
+      ctx->mt[0] = ctx->mt[MT_N-1];
+      i = 1;
+    }
+  }
+  ctx->mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
+  return(ctx);
+}
+
+
+
+// Return the next 32-bit output on [0,0xffffffff]; once all MT_N state
+// words have been handed out they are regenerated in one pass first.
+uint32
+mtRandom32(mt_s *ctx) {
+  uint32 *st  = ctx->mt;
+  uint32 *mag = ctx->mag01;
+  uint32  out;
+  int     w;
+
+  if (ctx->mti >= MT_N) {
+    // Words whose partner at w + MT_M lies inside the array.
+    for (w = 0; w < MT_N - MT_M; w++) {
+      out   = (st[w] & MT_UPPER_MASK) | (st[w+1] & MT_LOWER_MASK);
+      st[w] = st[w + MT_M] ^ (out >> 1) ^ mag[out & uint32ONE];
+    }
+    // Words whose partner index wraps around to the front of the array.
+    for (; w < MT_N-1; w++) {
+      out   = (st[w] & MT_UPPER_MASK) | (st[w + 1] & MT_LOWER_MASK);
+      st[w] = st[w + (MT_M - MT_N)] ^ (out >> 1) ^ mag[out & uint32ONE];
+    }
+    // The final word pairs with st[0].
+    out        = (st[MT_N-1] & MT_UPPER_MASK) | (st[0] & MT_LOWER_MASK);
+    st[MT_N-1] = st[MT_M-1] ^ (out >> 1) ^ mag[out & uint32ONE];
+    ctx->mti = 0;
+  }
+
+  // Take the next state word and temper it.
+  out  = st[ctx->mti++];
+  out ^= (out >> 11);
+  out ^= (out << 7) & 0x9d2c5680UL;
+  out ^= (out << 15) & 0xefc60000UL;
+  out ^= (out >> 18);
+  return out;
+}
+
+
+// Return a normally distributed random number with mean 0 and standard
+// deviation 1, using the polar method described at
+// http://www.taygeta.com/random/gaussian.html, supposedly equivalent to
+//
+//   y1 = sqrt(-2*ln(x1)) cos(2*pi*x2)
+//   y2 = sqrt(-2*ln(x1)) sin(2*pi*x2)
+//
+// but stable when x1 is close to zero.  Only the first deviate is returned.
+double
+mtRandomGaussian(mt_s *mt) {
+  double x1 = 0;
+  double x2 = 0;
+  double w  = 0;
+
+  // Redraw until the point (x1,x2) is inside the unit circle.  The origin
+  // is rejected as well: w == 0 would make log(w) infinite and the result
+  // 0 * infinity, i.e. NaN.
+  do {
+    x1 = 2.0 * mtRandomRealClosed(mt) - 1.0;
+    x2 = 2.0 * mtRandomRealClosed(mt) - 1.0;
+    w  = x1 * x1 + x2 * x2;
+  } while ((w >= 1.0) || (w == 0.0));
+
+  w = sqrt((-2.0 * log(w)) / w);
+
+  return(x1 * w);
+}
diff --git a/libutil/mt19937ar/mt19937ar.h b/libutil/mt19937ar/mt19937ar.h
new file mode 100644
index 0000000..504831f
--- /dev/null
+++ b/libutil/mt19937ar/mt19937ar.h
@@ -0,0 +1,47 @@
+#ifndef MT19937AR_H
+#define MT19937AR_H
+
+// Refactoring of
+//
+// A C-program for MT19937, with initialization improved 2002/1/26.
+// Coded by Takuji Nishimura and Makoto Matsumoto.
+//
+// to make it thread safe and (hopefully) more portable.
+//
+// 20040421, bpw
+
+// bri.h contains the function prototypes, but we hide the structure and
+// implementation here.
+//
+#include "../util.h"
+
+/* Period parameters */
+#define MT_N 624 /* number of 32-bit words in the state vector mt[] */
+#define MT_M 397 /* index offset of the state word mixed in when regenerating */
+#define MT_MATRIX_A 0x9908b0dfUL /* constant vector a */
+#define MT_UPPER_MASK 0x80000000UL /* most significant w-r bits */
+#define MT_LOWER_MASK 0x7fffffffUL /* least significant r bits */
+
+
+struct mtctx {
+ // The state vector: MT_N words, rewritten in place by mtRandom32()
+ // whenever all of them have been handed out.
+ uint32 mt[MT_N];
+
+ // Index of the next word of mt[] to be tempered and returned.
+ // mtInit() leaves it at MT_N, so the first mtRandom32() call
+ // regenerates the whole vector before returning anything.
+ uint32 mti;
+
+ // Lookup table used while regenerating mt[], filled in by mtInit():
+ // mag01[x] = x * MT_MATRIX_A for x=0,1
+ //
+ uint32 mag01[2];
+};
+
+// This is declared in util.h
+//
+//typedef struct mt mt_s;
+
+
+#endif // MT19937AR_H
diff --git a/libutil/mt19937ar/mt19937ar.out b/libutil/mt19937ar/mt19937ar.out
new file mode 100644
index 0000000..03834bd
--- /dev/null
+++ b/libutil/mt19937ar/mt19937ar.out
@@ -0,0 +1,736 @@
+1000 outputs of genrand_int32()
+1067595299 955945823 477289528 4107218783 4228976476
+3344332714 3355579695 227628506 810200273 2591290167
+2560260675 3242736208 646746669 1479517882 4245472273
+1143372638 3863670494 3221021970 1773610557 1138697238
+1421897700 1269916527 2859934041 1764463362 3874892047
+3965319921 72549643 2383988930 2600218693 3237492380
+2792901476 725331109 605841842 271258942 715137098
+3297999536 1322965544 4229579109 1395091102 3735697720
+2101727825 3730287744 2950434330 1661921839 2895579582
+2370511479 1004092106 2247096681 2111242379 3237345263
+4082424759 219785033 2454039889 3709582971 835606218
+2411949883 2735205030 756421180 2175209704 1873865952
+2762534237 4161807854 3351099340 181129879 3269891896
+ 776029799 2218161979 3001745796 1866825872 2133627728
+ 34862734 1191934573 3102311354 2916517763 1012402762
+2184831317 4257399449 2899497138 3818095062 3030756734
+1282161629 420003642 2326421477 2741455717 1278020671
+3744179621 271777016 2626330018 2560563991 3055977700
+4233527566 1228397661 3595579322 1077915006 2395931898
+1851927286 3013683506 1999971931 3006888962 1049781534
+1488758959 3491776230 104418065 2448267297 3075614115
+3872332600 891912190 3936547759 2269180963 2633455084
+1047636807 2604612377 2709305729 1952216715 207593580
+2849898034 670771757 2210471108 467711165 263046873
+3569667915 1042291111 3863517079 1464270005 2758321352
+3790799816 2301278724 3106281430 7974801 2792461636
+ 555991332 621766759 1322453093 853629228 686962251
+1455120532 957753161 1802033300 1021534190 3486047311
+1902128914 3701138056 4176424663 1795608698 560858864
+3737752754 3141170998 1553553385 3367807274 711546358
+2475125503 262969859 251416325 2980076994 1806565895
+ 969527843 3529327173 2736343040 2987196734 1649016367
+2206175811 3048174801 3662503553 3138851612 2660143804
+1663017612 1816683231 411916003 3887461314 2347044079
+1015311755 1203592432 2170947766 2569420716 813872093
+1105387678 1431142475 220570551 4243632715 4179591855
+2607469131 3090613241 282341803 1734241730 1391822177
+1001254810 827927915 1886687171 3935097347 2631788714
+3905163266 110554195 2447955646 3717202975 3304793075
+3739614479 3059127468 953919171 2590123714 1132511021
+3795593679 2788030429 982155079 3472349556 859942552
+2681007391 2299624053 647443547 233600422 608168955
+3689327453 1849778220 1608438222 3968158357 2692977776
+2851872572 246750393 3582818628 3329652309 4036366910
+1012970930 950780808 3959768744 2538550045 191422718
+2658142375 3276369011 2927737484 1234200027 1920815603
+3536074689 1535612501 2184142071 3276955054 428488088
+2378411984 4059769550 3913744741 2732139246 64369859
+3755670074 842839565 2819894466 2414718973 1010060670
+1839715346 2410311136 152774329 3485009480 4102101512
+2852724304 879944024 1785007662 2748284463 1354768064
+3267784736 2269127717 3001240761 3179796763 895723219
+ 865924942 4291570937 89355264 1471026971 4114180745
+3201939751 2867476999 2460866060 3603874571 2238880432
+3308416168 2072246611 2755653839 3773737248 1709066580
+4282731467 2746170170 2832568330 433439009 3175778732
+ 26248366 2551382801 183214346 3893339516 1928168445
+1337157619 3429096554 3275170900 1782047316 4264403756
+1876594403 4289659572 3223834894 1728705513 4068244734
+2867840287 1147798696 302879820 1730407747 1923824407
+1180597908 1569786639 198796327 560793173 2107345620
+2705990316 3448772106 3678374155 758635715 884524671
+ 486356516 1774865603 3881226226 2635213607 1181121587
+1508809820 3178988241 1594193633 1235154121 326117244
+2304031425 937054774 2687415945 3192389340 2003740439
+1823766188 2759543402 10067710 1533252662 4132494984
+ 82378136 420615890 3467563163 541562091 3535949864
+2277319197 3330822853 3215654174 4113831979 4204996991
+2162248333 3255093522 2219088909 2978279037 255818579
+2859348628 3097280311 2569721123 1861951120 2907080079
+2719467166 998319094 2521935127 2404125338 259456032
+2086860995 1839848496 1893547357 2527997525 1489393124
+2860855349 76448234 2264934035 744914583 2586791259
+1385380501 66529922 1819103258 1899300332 2098173828
+1793831094 276463159 360132945 4178212058 595015228
+ 177071838 2800080290 1573557746 1548998935 378454223
+1460534296 1116274283 3112385063 3709761796 827999348
+3580042847 1913901014 614021289 4278528023 1905177404
+ 45407939 3298183234 1184848810 3644926330 3923635459
+1627046213 3677876759 969772772 1160524753 1522441192
+ 452369933 1527502551 832490847 1003299676 1071381111
+2891255476 973747308 4086897108 1847554542 3895651598
+2227820339 1621250941 2881344691 3583565821 3510404498
+ 849362119 862871471 797858058 2867774932 2821282612
+3272403146 3997979905 209178708 1805135652 6783381
+2823361423 792580494 4263749770 776439581 3798193823
+2853444094 2729507474 1071873341 1329010206 1289336450
+3327680758 2011491779 80157208 922428856 1158943220
+1667230961 2461022820 2608845159 387516115 3345351910
+1495629111 4098154157 3156649613 3525698599 4134908037
+ 446713264 2137537399 3617403512 813966752 1157943946
+3734692965 1680301658 3180398473 3509854711 2228114612
+1008102291 486805123 863791847 3189125290 1050308116
+3777341526 4291726501 844061465 1347461791 2826481581
+ 745465012 2055805750 4260209475 2386693097 2980646741
+ 447229436 2077782664 1232942813 4023002732 1399011509
+3140569849 2579909222 3794857471 900758066 2887199683
+1720257997 3367494931 2668921229 955539029 3818726432
+1105704962 3889207255 2277369307 2746484505 1761846513
+2413916784 2685127085 4240257943 1166726899 4215215715
+3082092067 3960461946 1663304043 2087473241 4162589986
+2507310778 1579665506 767234210 970676017 492207530
+1441679602 1314785090 3262202570 3417091742 1561989210
+3011406780 1146609202 3262321040 1374872171 1634688712
+1280458888 2230023982 419323804 3262899800 39783310
+1641619040 1700368658 2207946628 2571300939 2424079766
+ 780290914 2715195096 3390957695 163151474 2309534542
+1860018424 555755123 280320104 1604831083 2713022383
+1728987441 3639955502 623065489 3828630947 4275479050
+3516347383 2343951195 2430677756 635534992 3868699749
+ 808442435 3070644069 4282166003 2093181383 2023555632
+1568662086 3422372620 4134522350 3016979543 3259320234
+2888030729 3185253876 4258779643 1267304371 1022517473
+ 815943045 929020012 2995251018 3371283296 3608029049
+2018485115 122123397 2810669150 1411365618 1238391329
+1186786476 3155969091 2242941310 1765554882 279121160
+4279838515 1641578514 3796324015 13351065 103516986
+1609694427 551411743 2493771609 1316337047 3932650856
+4189700203 463397996 2937735066 1855616529 2626847990
+ 55091862 3823351211 753448970 4045045500 1274127772
+1124182256 92039808 2126345552 425973257 386287896
+2589870191 1987762798 4084826973 2172456685 3366583455
+3602966653 2378803535 2901764433 3716929006 3710159000
+2653449155 3469742630 3096444476 3932564653 2595257433
+ 318974657 3146202484 853571438 144400272 3768408841
+ 782634401 2161109003 570039522 1886241521 14249488
+2230804228 1604941699 3928713335 3921942509 2155806892
+ 134366254 430507376 1924011722 276713377 196481886
+3614810992 1610021185 1785757066 851346168 3761148643
+2918835642 3364422385 3012284466 3735958851 2643153892
+3778608231 1164289832 205853021 2876112231 3503398282
+3078397001 3472037921 1748894853 2740861475 316056182
+1660426908 168885906 956005527 3984354789 566521563
+1001109523 1216710575 2952284757 3834433081 3842608301
+2467352408 3974441264 3256601745 1409353924 1329904859
+2307560293 3125217879 3622920184 3832785684 3882365951
+2308537115 2659155028 1450441945 3532257603 3186324194
+1225603425 1124246549 175808705 3009142319 2796710159
+3651990107 160762750 1902254979 1698648476 1134980669
+ 497144426 3302689335 4057485630 3603530763 4087252587
+ 427812652 286876201 823134128 1627554964 3745564327
+2589226092 4202024494 62878473 3275585894 3987124064
+2791777159 1916869511 2585861905 1375038919 1403421920
+ 60249114 3811870450 3021498009 2612993202 528933105
+2757361321 3341402964 2621861700 273128190 4015252178
+3094781002 1621621288 2337611177 1796718448 1258965619
+4241913140 2138560392 3022190223 4174180924 450094611
+3274724580 617150026 2704660665 1469700689 1341616587
+ 356715071 1188789960 2278869135 1766569160 2795896635
+ 57824704 2893496380 1235723989 1630694347 3927960522
+ 428891364 1814070806 2287999787 4125941184 3968103889
+3548724050 1025597707 1404281500 2002212197 92429143
+2313943944 2403086080 3006180634 3561981764 1671860914
+1768520622 1803542985 844848113 3006139921 1410888995
+1157749833 2125704913 1789979528 1799263423 741157179
+2405862309 767040434 2655241390 3663420179 2172009096
+2511931187 1680542666 231857466 1154981000 157168255
+1454112128 3505872099 1929775046 2309422350 2143329496
+2960716902 407610648 2938108129 2581749599 538837155
+2342628867 430543915 740188568 1937713272 3315215132
+2085587024 4030765687 766054429 3517641839 689721775
+1294158986 1753287754 4202601348 1974852792 33459103
+3568087535 3144677435 1686130825 4134943013 3005738435
+3599293386 426570142 754104406 3660892564 1964545167
+ 829466833 821587464 1746693036 1006492428 1595312919
+1256599985 1024482560 1897312280 2902903201 691790057
+1037515867 3176831208 1968401055 2173506824 1089055278
+1748401123 2941380082 968412354 1818753861 2973200866
+3875951774 1119354008 3988604139 1647155589 2232450826
+3486058011 3655784043 3759258462 847163678 1082052057
+ 989516446 2871541755 3196311070 3929963078 658187585
+3664944641 2175149170 2203709147 2756014689 2456473919
+3890267390 1293787864 2830347984 3059280931 4158802520
+1561677400 2586570938 783570352 1355506163 31495586
+3789437343 3340549429 2092501630 896419368 671715824
+3530450081 3603554138 1055991716 3442308219 1499434728
+3130288473 3639507000 17769680 2259741420 487032199
+4227143402 3693771256 1880482820 3924810796 381462353
+4017855991 2452034943 2736680833 2209866385 2128986379
+ 437874044 595759426 641721026 1636065708 3899136933
+ 629879088 3591174506 351984326 2638783544 2348444281
+2341604660 2123933692 143443325 1525942256 364660499
+ 599149312 939093251 1523003209 106601097 376589484
+1346282236 1297387043 764598052 3741218111 933457002
+1886424424 3219631016 525405256 3014235619 323149677
+2038881721 4100129043 2851715101 2984028078 1888574695
+2014194741 3515193880 4180573530 3461824363 2641995497
+3179230245 2902294983 2217320456 4040852155 1784656905
+3311906931 87498458 2752971818 2635474297 2831215366
+3682231106 2920043893 3772929704 2816374944 309949752
+2383758854 154870719 385111597 1191604312 1840700563
+ 872191186 2925548701 1310412747 2102066999 1504727249
+3574298750 1191230036 3330575266 3180292097 3539347721
+ 681369118 3305125752 3648233597 950049240 4173257693
+1760124957 512151405 681175196 580563018 1169662867
+4015033554 2687781101 699691603 2673494188 1137221356
+ 123599888 472658308 1053598179 1012713758 3481064843
+3759461013 3981457956 3830587662 1877191791 3650996736
+ 988064871 3515461600 4089077232 2225147448 1249609188
+2643151863 3896204135 2416995901 1397735321 3460025646
+
+1000 outputs of genrand_real2()
+0.76275443 0.99000644 0.98670464 0.10143112 0.27933125
+0.69867227 0.94218740 0.03427201 0.78842173 0.28180608
+0.92179002 0.20785655 0.54534773 0.69644020 0.38107718
+0.23978165 0.65286910 0.07514568 0.22765211 0.94872929
+0.74557914 0.62664415 0.54708246 0.90959343 0.42043116
+0.86334511 0.19189126 0.14718544 0.70259889 0.63426346
+0.77408121 0.04531601 0.04605807 0.88595519 0.69398270
+0.05377184 0.61711170 0.05565708 0.10133577 0.41500776
+0.91810699 0.22320679 0.23353705 0.92871862 0.98897234
+0.19786706 0.80558809 0.06961067 0.55840445 0.90479405
+0.63288060 0.95009721 0.54948447 0.20645042 0.45000959
+0.87050869 0.70806991 0.19406895 0.79286390 0.49332866
+0.78483914 0.75145146 0.12341941 0.42030252 0.16728160
+0.59906494 0.37575460 0.97815160 0.39815952 0.43595080
+0.04952478 0.33917805 0.76509902 0.61034321 0.90654701
+0.92915732 0.85365931 0.18812377 0.65913428 0.28814566
+0.59476081 0.27835931 0.60722542 0.68310435 0.69387186
+0.03699800 0.65897714 0.17527003 0.02889304 0.86777366
+0.12352068 0.91439461 0.32022990 0.44445731 0.34903686
+0.74639273 0.65918367 0.92492794 0.31872642 0.77749724
+0.85413832 0.76385624 0.32744211 0.91326300 0.27458185
+0.22190155 0.19865383 0.31227402 0.85321225 0.84243342
+0.78544200 0.71854080 0.92503892 0.82703064 0.88306297
+0.47284073 0.70059042 0.48003761 0.38671694 0.60465770
+0.41747204 0.47163243 0.72750808 0.65830223 0.10955369
+0.64215401 0.23456345 0.95944940 0.72822249 0.40888451
+0.69980355 0.26677428 0.57333635 0.39791582 0.85377858
+0.76962816 0.72004885 0.90903087 0.51376506 0.37732665
+0.12691640 0.71249738 0.81217908 0.37037313 0.32772374
+0.14238259 0.05614811 0.74363008 0.39773267 0.94859135
+0.31452454 0.11730313 0.62962618 0.33334237 0.45547255
+0.10089665 0.56550662 0.60539371 0.16027624 0.13245301
+0.60959939 0.04671662 0.99356286 0.57660859 0.40269560
+0.45274629 0.06699735 0.85064246 0.87742744 0.54508392
+0.87242982 0.29321385 0.67660627 0.68230715 0.79052073
+0.48592054 0.25186266 0.93769755 0.28565487 0.47219067
+0.99054882 0.13155240 0.47110470 0.98556600 0.84397623
+0.12875246 0.90953202 0.49129015 0.23792727 0.79481194
+0.44337770 0.96564297 0.67749118 0.55684872 0.27286897
+0.79538393 0.61965356 0.22487929 0.02226018 0.49248200
+0.42247006 0.91797788 0.99250134 0.23449967 0.52531508
+0.10246337 0.78685622 0.34310922 0.89892996 0.40454552
+0.68608407 0.30752487 0.83601319 0.54956031 0.63777550
+0.82199797 0.24890696 0.48801123 0.48661910 0.51223987
+0.32969635 0.31075073 0.21393155 0.73453207 0.15565705
+0.58584522 0.28976728 0.97621478 0.61498701 0.23891470
+0.28518540 0.46809591 0.18371914 0.37597910 0.13492176
+0.66849449 0.82811466 0.56240330 0.37548956 0.27562998
+0.27521910 0.74096121 0.77176757 0.13748143 0.99747138
+0.92504502 0.09175241 0.21389176 0.21766512 0.31183245
+0.23271221 0.21207367 0.57903312 0.77523344 0.13242613
+0.31037988 0.01204835 0.71652949 0.84487594 0.14982178
+0.57423142 0.45677888 0.48420169 0.53465428 0.52667473
+0.46880526 0.49849733 0.05670710 0.79022476 0.03872047
+0.21697212 0.20443086 0.28949326 0.81678186 0.87629474
+0.92297064 0.27373097 0.84625273 0.51505586 0.00582792
+0.33295971 0.91848412 0.92537226 0.91760033 0.07541125
+0.71745848 0.61158698 0.00941650 0.03135554 0.71527471
+0.24821915 0.63636652 0.86159918 0.26450229 0.60160194
+0.35557725 0.24477500 0.07186456 0.51757096 0.62120362
+0.97981062 0.69954667 0.21065616 0.13382753 0.27693186
+0.59644095 0.71500764 0.04110751 0.95730081 0.91600724
+0.47704678 0.26183479 0.34706971 0.07545431 0.29398385
+0.93236070 0.60486023 0.48015011 0.08870451 0.45548581
+0.91872718 0.38142712 0.10668643 0.01397541 0.04520355
+0.93822273 0.18011940 0.57577277 0.91427606 0.30911399
+0.95853475 0.23611214 0.69619891 0.69601980 0.76765372
+0.58515930 0.49479057 0.11288752 0.97187699 0.32095365
+0.57563608 0.40760618 0.78703383 0.43261152 0.90877651
+0.84686346 0.10599030 0.72872803 0.19315490 0.66152912
+0.10210518 0.06257876 0.47950688 0.47062066 0.72701157
+0.48915116 0.66110261 0.60170685 0.24516994 0.12726050
+0.03451185 0.90864994 0.83494878 0.94800035 0.91035206
+0.14480751 0.88458997 0.53498312 0.15963215 0.55378627
+0.35171349 0.28719791 0.09097957 0.00667896 0.32309622
+0.87561479 0.42534520 0.91748977 0.73908457 0.41793223
+0.99279792 0.87908370 0.28458072 0.59132853 0.98672190
+0.28547393 0.09452165 0.89910674 0.53681109 0.37931425
+0.62683489 0.56609740 0.24801549 0.52948179 0.98328855
+0.66403523 0.55523786 0.75886666 0.84784685 0.86829981
+0.71448906 0.84670080 0.43922919 0.20771016 0.64157936
+0.25664246 0.73055695 0.86395782 0.65852932 0.99061803
+0.40280575 0.39146298 0.07291005 0.97200603 0.20555729
+0.59616495 0.08138254 0.45796388 0.33681125 0.33989127
+0.18717090 0.53545811 0.60550838 0.86520709 0.34290701
+0.72743276 0.73023855 0.34195926 0.65019733 0.02765254
+0.72575740 0.32709576 0.03420866 0.26061893 0.56997511
+0.28439072 0.84422744 0.77637570 0.55982168 0.06720327
+0.58449067 0.71657369 0.15819609 0.58042821 0.07947911
+0.40193792 0.11376012 0.88762938 0.67532159 0.71223735
+0.27829114 0.04806073 0.21144026 0.58830274 0.04140071
+0.43215628 0.12952729 0.94668759 0.87391019 0.98382450
+0.27750768 0.90849647 0.90962737 0.59269720 0.96102026
+0.49544979 0.32007095 0.62585546 0.03119821 0.85953001
+0.22017528 0.05834068 0.80731217 0.53799961 0.74166948
+0.77426600 0.43938444 0.54862081 0.58575513 0.15886492
+0.73214332 0.11649057 0.77463977 0.85788827 0.17061997
+0.66838056 0.96076133 0.07949296 0.68521946 0.89986254
+0.05667410 0.12741385 0.83470977 0.63969104 0.46612929
+0.10200126 0.01194925 0.10476340 0.90285217 0.31221221
+0.32980614 0.46041971 0.52024973 0.05425470 0.28330912
+0.60426543 0.00598243 0.97244013 0.21135841 0.78561597
+0.78428734 0.63422849 0.32909934 0.44771136 0.27380750
+0.14966697 0.18156268 0.65686758 0.28726350 0.97074787
+0.63676171 0.96649494 0.24526295 0.08297372 0.54257548
+0.03166785 0.33735355 0.15946671 0.02102971 0.46228045
+0.11892296 0.33408336 0.29875681 0.29847692 0.73767569
+0.02080745 0.62980060 0.08082293 0.22993106 0.25031439
+0.87787525 0.45150053 0.13673441 0.63407612 0.97907688
+0.52241942 0.50580158 0.06273902 0.05270283 0.77031811
+0.05113352 0.24393329 0.75036441 0.37436336 0.22877652
+0.59975358 0.85707591 0.88691457 0.85547165 0.36641027
+0.58720133 0.45462835 0.09243817 0.32981586 0.07820411
+0.25421519 0.36004706 0.60092307 0.46192412 0.36758683
+0.98424170 0.08019934 0.68594024 0.45826386 0.29962317
+0.79365413 0.89231296 0.49478547 0.87645944 0.23590734
+0.28106737 0.75026285 0.08136314 0.79582424 0.76010628
+0.82792971 0.27947652 0.72482861 0.82191216 0.46171689
+0.79189752 0.96043686 0.51609668 0.88995725 0.28998963
+0.55191845 0.03934737 0.83033700 0.49553013 0.98009549
+0.19017594 0.98347750 0.33452066 0.87144372 0.72106301
+0.71272114 0.71465963 0.88361677 0.85571283 0.73782329
+0.20920458 0.34855153 0.46766817 0.02780062 0.74898344
+0.03680650 0.44866557 0.77426312 0.91025891 0.25195236
+0.87319953 0.63265037 0.25552148 0.27422476 0.95217406
+0.39281839 0.66441573 0.09158900 0.94515992 0.07800798
+0.02507888 0.39901462 0.17382573 0.12141278 0.85502334
+0.19902911 0.02160210 0.44460522 0.14688742 0.68020336
+0.71323733 0.60922473 0.95400380 0.99611159 0.90897777
+0.41073520 0.66206647 0.32064685 0.62805003 0.50677209
+0.52690101 0.87473387 0.73918362 0.39826974 0.43683919
+0.80459118 0.32422684 0.01958019 0.95319576 0.98326137
+0.83931735 0.69060863 0.33671416 0.68062550 0.65152380
+0.33392969 0.03451730 0.95227244 0.68200635 0.85074171
+0.64721009 0.51234433 0.73402047 0.00969637 0.93835057
+0.80803854 0.31485260 0.20089527 0.01323282 0.59933780
+0.31584602 0.20209563 0.33754800 0.68604181 0.24443049
+0.19952227 0.78162632 0.10336988 0.11360736 0.23536740
+0.23262256 0.67803776 0.48749791 0.74658435 0.92156640
+0.56706407 0.36683221 0.99157136 0.23421374 0.45183767
+0.91609720 0.85573315 0.37706276 0.77042618 0.30891908
+0.40709595 0.06944866 0.61342849 0.88817388 0.58734506
+0.98711323 0.14744128 0.63242656 0.87704136 0.68347125
+0.84446569 0.43265239 0.25146321 0.04130111 0.34259839
+0.92697368 0.40878778 0.56990338 0.76204273 0.19820348
+0.66314909 0.02482844 0.06669207 0.50205581 0.26084093
+0.65139159 0.41650223 0.09733904 0.56344203 0.62651696
+0.67332139 0.58037374 0.47258086 0.21010758 0.05713135
+0.89390629 0.10781246 0.32037450 0.07628388 0.34227964
+0.42190597 0.58201860 0.77363549 0.49595133 0.86031236
+0.83906769 0.81098161 0.26694195 0.14215941 0.88210306
+0.53634237 0.12090720 0.82480459 0.75930318 0.31847147
+0.92768077 0.01037616 0.56201727 0.88107122 0.35925856
+0.85860762 0.61109408 0.70408301 0.58434977 0.92192494
+0.62667915 0.75988365 0.06858761 0.36156496 0.58057195
+0.13636150 0.57719713 0.59340255 0.63530602 0.22976282
+0.71915530 0.41162531 0.63979565 0.09931342 0.79344045
+0.10893790 0.84450224 0.23122236 0.99485593 0.73637397
+0.17276368 0.13357764 0.74965804 0.64991737 0.61990341
+0.41523170 0.05878239 0.05687301 0.05497131 0.42868366
+0.42571090 0.25810502 0.89642955 0.30439758 0.39310223
+0.11357431 0.04288255 0.23397550 0.11200634 0.85621396
+0.89733974 0.37508865 0.42077265 0.68597384 0.72781399
+0.19296476 0.61699087 0.31667128 0.67756410 0.00177323
+0.05725176 0.79474693 0.18885238 0.06724856 0.68193156
+0.42202167 0.22082041 0.28554673 0.64995708 0.87851940
+0.29124547 0.61009521 0.87374537 0.05743712 0.69902994
+0.81925115 0.45653873 0.37236821 0.31118709 0.52734307
+0.39672836 0.38185294 0.30163915 0.17374510 0.04913278
+0.90404879 0.25742801 0.58266467 0.97663209 0.79823377
+0.36437958 0.15206043 0.26529938 0.22690047 0.05839021
+0.84721160 0.18622435 0.37809403 0.55706977 0.49828704
+0.47659049 0.24289680 0.88477595 0.07807463 0.56245739
+0.73490635 0.21099431 0.13164942 0.75840044 0.66877037
+0.28988183 0.44046090 0.24967434 0.80048356 0.26029740
+0.30416821 0.64151867 0.52067892 0.12880774 0.85465381
+0.02690525 0.19149288 0.49630295 0.79682619 0.43566145
+0.00288078 0.81484193 0.03763639 0.68529083 0.01339574
+0.38405386 0.30537067 0.22994703 0.44000045 0.27217985
+0.53831243 0.02870435 0.86282045 0.61831306 0.09164956
+0.25609707 0.07445781 0.72185784 0.90058883 0.30070608
+0.94476583 0.56822213 0.21933909 0.96772793 0.80063440
+0.26307906 0.31183306 0.16501252 0.55436179 0.68562285
+0.23829083 0.86511559 0.57868991 0.81888344 0.20126869
+0.93172350 0.66028129 0.21786948 0.78515828 0.10262106
+0.35390326 0.79303876 0.63427924 0.90479631 0.31024934
+0.60635447 0.56198079 0.63573813 0.91854197 0.99701497
+0.83085849 0.31692291 0.01925964 0.97446405 0.98751283
+0.60944293 0.13751018 0.69519957 0.68956636 0.56969015
+0.46440193 0.88341765 0.36754434 0.89223647 0.39786427
+0.85055280 0.12749961 0.79452122 0.89449784 0.14567830
+0.45716830 0.74822309 0.28200437 0.42546044 0.17464886
+0.68308746 0.65496587 0.52935411 0.12736159 0.61523955
+0.81590528 0.63107864 0.39786553 0.20102294 0.53292914
+0.75485590 0.59847044 0.32861691 0.12125866 0.58917183
+0.07638293 0.86845380 0.29192617 0.03989733 0.52180460
+0.32503407 0.64071852 0.69516575 0.74254998 0.54587026
+0.48713246 0.32920155 0.08719954 0.63497059 0.54328459
+0.64178757 0.45583809 0.70694291 0.85212760 0.86074305
+0.33163422 0.85739792 0.59908488 0.74566046 0.72157152
+0x67405e6c328fecdf 0x3c8b2c35482ec8c9 0x3250533bca1940c7
+0xf2d983e5b3262520 0xe5b759c591be1fda 0x8242a4458c0654ef
+0xb04d83e5cb5b6017 0xb6ad8ae702c9d964 0xbcf18ae96331a2da
+0x1cc1d152497d4674 0xc89cf1de59189442 0x398b33c171e4c16c
+0xe1ef8b20e9581f1f 0xbcf3922d01c3c4c3 0x3fb925af371e20d8
+0x3788696c8a091e68 0x98a8edcd8a199268 0x2b2bf18b86a1d357
+0x474017009e18d034 0x0f5914833849cde7 0x5f04574352379c0d
+0x8f5ca9b0d749b8d0 0x75b973eb6c039cde 0x69d4a24d0386aca4
+0xfe82fe22f8c3715d 0x007e2b70611c98e7 0xf45e29c72b9f1786
+0x7694fd07e82e529d 0x0d374894c5b55c9c 0xc8e6005052a38ac7
+0xdafb054cec6083d7 0x625a22c66cd3bf85 0xc0af2ff40f2b0074
+0xd6489630d188c4c3 0x7f034dbaf566f42b 0xfa47383d871e8dae
+0x1e9bc6524bbc99df 0xc97e66d2eef0793f 0x45760d27aade8dc5
+0xd9a5a1454582602e 0xb340cc9b522bb2b1 0xc449fec5c8359c3b
+0xdba9d20c2b809802 0xe414bdd2089cd4a9 0x714def09cabd0d3d
+0xf9755125bdca9539 0xa5d9bfb951aed29c 0x5c7e2d33c83ddf92
+0x8a59ea07850aa835 0x401f067db97b5427 0xefe4b17bb713b9c7
+0xee193cc8a0f16596 0x0c6b1f2a9ea778e5 0xbcacdc1567f07bef
+0x713ddb4b58672888 0xee8075374182b161 0x03ec2941a3da86d6
+0x675aebbee50e893c 0xd225931684eb5291 0xe477c127c0a105e7
+0xefcf3d05ef772a45 0xc4ef9941734f83b1 0x87e4942af85e11e8
+0xbf5fcadc377de765 0x079d9de562268b5f 0xea04faeba98f3e5c
+0xb930c6da6ddba5f7 0x478236b93821b9ba 0xa9cfc9be294b0ff7
+0x6079f977b05fb70e 0xe9e2dbe839ffb6e8 0xb3170798eddd9379
+0x0ef96e0ec793a92f 0x0524e13897de842d 0x465b56ac3b31cdda
+0x890bd07a90444c77 0xe234eff01af4dfd2 0xe84848232dbbfdd3
+0x3c4efcfca0456b81 0x670edacde7eeec34 0x2b36fde828c7b1ec
+0x9b6da65be00fdbd5 0xa3c761de25e1f4ba 0x7377fd171c85c139
+0x1b128eb2dad95dd2 0x3537b8e218bd4ec7 0xa0101eb1b3e29a86
+0x528f6b866eb20175 0x5473da9172d03fbd 0x9c77153ec299763c
+0x47ba00f901873acb 0x137df82e07c009b0 0x3c61915b065d0aa6
+0xe49e7299af8e6c5c 0x257f9436560d2208 0x78b7d974999c611a
+0x0d8adaad822c4965 0xa20c4d15c9573034 0x6f95460c6e9d822d
+0x2f853e287ea74d01 0xab53596fa5dc9c89 0x527171fc57868fc3
+0xab0a91e3a225f47f 0x5fe417b6624ce303 0x8ae42059c66c39dd
+0xb60de6ddff69c310 0xc83f24d03dbfe608 0xc5165efd988fd170
+0x12c3eb7ad4274929 0x1f8cee33476428d2 0x1ddea9364463c3ff
+0x1667501fcad4dedb 0x62c7365a9bb30ad9 0x8899b942b7573303
+0xc887f6d9377ee390 0x402af03474f37acc 0x9da72866ac0d1cc5
+0xa986abc55d6af4f4 0xe8a71cb1151dbf43 0x0ec056bba0bc3215
+0xda14a58e848fcf79 0x491f5bd20198aa9b 0xf42da0476fcd8832
+0x627c1873be8fb51e 0xa454d9b0317b861a 0x40fc96f80cd3bbc0
+0x9e2ff393daf483cb 0xfc758a65c59b083e 0xe859fd51d3a08556
+0x5efef07d19820724 0xe279ece74937813f 0x110eb92574a2084e
+0x3df42d82245bff2b 0x9ca473630575feb8 0x15c70084fb66d585
+0xbee7c870394194f7 0x9cc0a4f51f369867 0x6afbf2cb6ec861cf
+0xa5b91a04835cae3a 0x7ddc222f0834f7eb 0xa9dfb4a5b06e5b6b
+0xb5810c3cc04cd424 0xc868b4afb4e9dacc 0x42c02254bac2d4d8
+0xf0a9eece4466ef85 0x4871a4148a47ea38 0x12576ffdb6fdad9b
+0x0e9505d10fc16302 0x5631bf2c70323f0e 0x28b179d03e01feab
+0x9dc0dd12b00dac9b 0x8cf7134a0d2f5d0e 0xeb08c68420fa5985
+0x70c0a74d1d12661a 0xc66142602dd41863 0xb20965edb4a3adb9
+0x319530ddf3890df6 0x5b78a9a93cbc723c 0x4cb9b467b0b55068
+0x8542da469019ce95 0xc091429ae362663a 0x9ef9dc529dccab6e
+0x96da56d02c5e2eda 0xbd065629a65356b8 0x86ab31cf8072eca6
+0xc95dff79f3ee122c 0x9044db614b167618 0x2b53b3ada5a7095d
+0x039b2a4c565ecffa 0xebe357d7d10d5f2d 0x64b694ddcdffd60a
+0x2f1a1613ac6c9f3d 0xc79da221329d4ef9 0x2bdb226b6cd38813
+0xca9767b9d3a2b084 0x66f6512c1f7dc7fd 0x91a73f962a2f07e0
+0x885bbddc2ff3debf 0xbd67554077fd4c70 0x99013399c758bd15
+0x5cf5b0f828128900 0x210a832780dbd783 0x9e3a38661b53eda9
+0xaae6b882c58412c3 0x2082626df4ca895b 0x5cf5f76bb7b48682
+0x6c0cae2d715e434e 0xef16816391c305dc 0x87b979af394695e4
+0xcd2a272a1a805492 0x60f7e95280c303c4 0x321c98d44c5f4e05
+0xadf6dce5480a0aa2 0x72c9e5996c3339ad 0xbf1d29e6f0b41a0d
+0x274f3d2dd5f0c37e 0x914b57f6ef5243e9 0xed1817876768b412
+0x05b4432aa26c8866 0x87ca3f2c01af6814 0x03116c1030ac3ab8
+0xf5cacecb81781b7c 0x131f0c45877769e0 0x3eabeb0bd69e2872
+0x7b276bc3d7ecbd7f 0xccb9143a415b647d 0xe84d77e693e184c0
+0x780d77c885a12891 0xcbfefe34f9a928ce 0xd5c2487c43c47678
+0x2eb49b0cff9b2b0c 0xd31248757f950b80 0xac0f5e6b333b69f3
+0x3bc0db5d4cd64c13 0x6e7b83ef32960445 0xa503015d8f6deb6c
+0x5b8daa355e155964 0x2f734ee67c567191 0xedbf27328a640c45
+0xa7d67ea4920db6eb 0x581d6a00e15dd86f 0xc50a59f87c672f01
+0x6bbb37d607050b9c 0xbe1ed4bf962c3f9c 0xb9e18300cc6a0292
+0xbaf3a963ebf66bb1 0xcc4a11a37f3f042c 0x78c18be41d2173a5
+0xd624936215d43e75 0x123cf1fcbd9a720f 0xebd61ce176c98627
+0x78f872dffb1a12cc 0x58e45fdbe9434f31 0x1fb28683306e8c21
+0x994bdbccb69dbb36 0xca605fc87b2b873a 0xfdef6f060b8f4c19
+0xbf7b96a4d8039aa5 0xcd017cc08e0ef146 0x109f34ce10506ff9
+0x79a06caf3d6c91c8 0x2ed9f6c0e43a7a0a 0xda890a93647404d9
+0x8f495cdc9571b6db 0xbceda30aec45acf5 0xe2f8e38ec63c6eb3
+0x86637eb61e70775c 0x656a79f5182dcd27 0x0d6ac797d29627af
+0xcfa36f230d63e8f5 0x0fe19f20f9309ac4 0x069fb30d45ab4af4
+0xb5ba378ef4a205b2 0xe51b1280dc04e530 0x6efd94972a8be0a7
+0x334fc5b584a1bad7 0xbdd5a374520f4a9b 0x706b706b5bc348c8
+0x836dc7d17c04caac 0x356ddab9d49ddfe6 0xd159ed5840ddd0ae
+0xadac31b4fef8e091 0x8d89032b3845859f 0x3bd971e2a8cadd2c
+0x65304014021337bf 0xb7d974ac484afeaf 0xdbc7da0068d2f636
+0x16eb1156c1b04b4d 0x8616a5dea534ad0e 0x67f74216c5383c94
+0x66b2b1aa56ad4087 0x29c2484d55c3489d 0x150215c70d49f395
+0xf9d5c84babe4b95c 0x5190562945d45db4 0x422706f447c8cb26
+0xd836deaaa1c2de48 0x422d6a90be18aada 0x47be7c3ce1d2d478
+0xf67eca6f67bb3775 0x93b171e4234cfe76 0x284d0008372ef07c
+0x3698d967c82e6cf2 0xe2f6a86325a83871 0xa05b6bb8d5a8f19d
+0x8c3c8c0932b47315 0xcb17cef7ae59b502 0x03cfa20ca46fa2b2
+0xe6b973298c8e9cf9 0x3dc8b542ce244d7c 0x24e151f91a603c98
+0xbabc1fe6edf224d0 0x03057b07c7da7488 0x9893f09f3a05d9d4
+0x8d7c9ede4bbb4625 0x0a3e483d53be86ca 0xaeaa80fa118aa1be
+0x5db21e05baa52d1a 0x7e6646013c3c76bd 0x6e02586aab7c0b75
+0xc599811f3381e84e 0x8584e334f66cfd11 0x2ddab189180628de
+0x798e7628ad4a602b 0xfb4dd579277e2008 0x6600c85ce8ec6a2e
+0x0aa55f130589b854 0xe45ab335eac329a3 0xa5325407038e4e11
+0xcf996346624b521c 0x4d0274a34c9589bd 0x8d0cad0b87392d57
+0x2d9646a64bcd2d2f 0xad7a1ec0ae4a2151 0x3f376577014f8dca
+0x19deb4a855f350e3 0x4998df888b984424 0xceebb951fbc934ca
+0x506ef0f671ceb9c1 0xc5ef09726ea704bf 0x5189b998448a5e11
+0x675f92fbc1a4c11e 0x530c4265087a7c4a 0xe18e8069e8b707e1
+0x3c2779074e4a00a2 0x95370af830c9dc3a 0x416b062811f2175f
+0x1fa8fe3935e37c73 0xde540bcb853484b0 0xcb601ed072199738
+0x83c2ac01d2ca8f62 0x64f81e49fdcaf6a7 0xb7e5e765429cee4c
+0xcec327d993653b95 0xd21eb799eac60217 0x3d1f583dcea5cb99
+0x2f780a3c356d4f5d 0x0d25a447cc72a00c 0x333fcbb7b46e275c
+0xd30674469bf820aa 0x434434c3882a96aa 0x3c8b8d389fa5647b
+0xa1af5ecc309ff07c 0x39e85842caf91bcc 0x383eac8cbb8c7b53
+0xddc66b6b38b1f273 0x7b0e539b24202231 0xf65bd3d8c61484a8
+0x8ffa69be80f9b321 0x22b46595c09272e4 0x5393806eb788160e
+0x891be7c411c10b15 0x9278dccf0fe4813b 0x86b245936eaa77d7
+0x4a6b28b10796ab38 0x1ea2db227acc5ae6 0x7b54f85812e5900b
+0xe5f912551ddf84a8 0x0f4fea8cd744faf7 0x35aa34911651bab2
+0x3302457b0da9f478 0x7e8980bb045d7552 0xb17000de13ee8a17
+0x15eef5ca5c5209d4 0x776f92bce4b4bf98 0xb8176052fe1587e7
+0x9437869fefbbe7ea 0xeee20c003aea2c86 0xe2c48da4fdc5426a
+0x084429586e8a01ac 0x51156e0189a6be85 0x48fa8721bf77762c
+0x2b0b33a3c4fc2e60 0x57d068040d700132 0xdb1dbc6158eb9ea0
+0xc097e39e74ba26af 0x4a1c702bd0ab338b 0xc64fb41bb2e7242a
+0x397ce664cd1bd8b9 0x10998a8b432e89d5 0x6641eb4947f78de5
+0x90b623890c63938b 0xb70c700b9de39c7d 0x28bdd272e47f9e88
+0x9539798ec4e3ea34 0x3ef0763c6f6c3108 0x26ac5d6724e40a1b
+0x6195a673243088f6 0xdb5aba293e070ca4 0x98800aea4e5f0e4e
+0x82c4134187537bc8 0x4ba979d8911b8e0e 0x943e41ad3cd01884
+0x88e65d80db067bd7 0x7869b39765369333 0xaf50316f4c694b29
+0xc451835f04a402e0 0x60dcb558ee7e6e2b 0x65234e499ff85e2a
+0xe3dc8b2a11bd5edb 0x2a21e5d894243627 0x3ed8d69bd926c4cc
+0x15bf0f6060684012 0x50ce7ee65b3c64a6 0x11cf02e2af1452d3
+0xd8c84781d09f2c5c 0xd0bcaa5424f39d22 0xb251707659f54392
+0x2bce134f4cc2db80 0xac8d00764ecd9b1e 0xd39e3d7e7c50dcd9
+0x3bc9595caaa45365 0x4a8f4c6e9fb55bfe 0x3acabfc0bbaeab42
+0xe8dc0d62f62fce39 0x87db2dfde390bb6b 0x21733320e5683477
+0x382b5edf45b76468 0x0942a6f469259049 0x69d0104109910df2
+0xcd97e90a0b92882f 0xccbc893007f0ce9e 0xc7f15af4b343b733
+0x8452042dbf83c7b6 0x4cd332f6dcfcf88c 0x2af26987bfce6905
+0xe906963392926ca5 0xe497fc1e943fa9f2 0x98a7949ac41b7f42
+0x468f4c6bd9e09993 0xb7aa4263eacbfa1c 0x1046dafd21831081
+0xdce0a46e7f40131a 0x02fb82dfbc2bc81f 0x24fb0ae41a50b4ce
+0xb5f3c95c66a0eb55 0x4f6cb9ad702e23eb 0x53de1d08a184b063
+0x6ec2daf2a3b654ad 0x0e1dce6a1f2683d8 0xfde0cc4dc717cb77
+0x15ab25c3e644d3b3 0x4e4b23569f93d996 0xa081f0ade3ca6df5
+0x49427fdef4f200ee 0xcf635cbfae34ebe2 0xd898e8639bb74469
+0x4a86c97e8ee0654d 0x108f1bd0715a571f 0x47513cd35aecd66c
+0xb9c0324084cdcc91 0x53b651c7d391dd3c 0x3752b5626ec39e99
+0xb4b374a39bea99bb 0xa21b1a5d19d2a41c 0x8276063bed6548c5
+0x71bc6f337c67476c 0x9c40e276168f4a94 0x4b445b3a5789d7c2
+0xaf355b6b8f0c93cb 0xb4f15704ffb1c015 0x9aebd68bc4601ee9
+0x7274be877312d407 0x1a8eda0149fb3c1e 0xac96a19dbddb1f43
+0x289d606d06643c34 0x16dc1e37a3f591b5 0xe67f91eac997ba12
+0x545f62407cfe3487 0x9ad438a940a78009 0x5cbc0cff8ea73089
+0xe21a9f995af5be74 0x85802cfeb24a44d6 0xf31f35e62429f5cb
+0x47b870132f0f2527 0x3bef8c28c62b7933 0xf9862f1954334aaa
+0x6142043b9d7b8b81 0xc78a907965972287 0xe14b1d2a8d083d05
+0xb1a871510b4f3f75 0x59a00d2aac09a4ef 0x7c97edc49af98314
+0xdd4aab86f1152b2b 0x88bd0a6cd1b0307d 0xfdefc6d97ed5c95a
+0xe28c7cca7215417d 0x7d2120fe52be9b7f 0xa02e5ee452ffad28
+0x8dab828a23b62644 0x25b3328373eb47de 0xde0169b28c909cf4
+0xb9b3c51169a7a836 0x4541eb9d97eea0d6 0x1066c5b6d33156b5
+0xaf4e1758f7c6645f 0xf436e47c3b2de674 0xd8debdd7895e6d80
+0x615d5dded5a0383b 0x4834c108bc72e7b0 0x5ac8150a4d88bf44
+0xff249baacfc83e7a 0x1adbe96930658b73 0x0aa6f579acb30710
+0x30ec7277223022a2 0x39aa3d32743f07b4 0x8b822e40dfbcefa9
+0x8119c3984a6d6fc4 0x228746c8f461f4ca 0xa7588248e15d4fb6
+0xa900b1fe9351661d 0xfd6ee310a38cfc88 0x5cc2716dde3a6486
+0x2a430535315c47c6 0xec8f741aa91d097a 0x325170f292f4d4b7
+0xf71ad5c55a97aafa 0x3051eca81bdefbdc 0xc55f411439234e97
+0x10fe11b12e91e37a 0xf081df2576233eec 0x9eb8940ca99473c9
+0x5268bb96f025f00d 0x0281ee36f301d7ed 0xa0dfe16d6dafba1a
+0x2c1414da9a9b5d5a 0x9c44677222d40889 0xae4747a3134dd86d
+0x2d8892a98f291495 0xfb877e659c3ba71c 0xddf2cd76c64c3eae
+0x86998597390f3524 0xed28981f85bb3386 0x55b153bfec5beeac
+0x070f3b6614925bee 0x4eb1f4056f2ffa7f 0x78c6fd1608716443
+0x9a7662241ec5ee22 0x271ec60c7e87c370 0xf06fd9db9054c3d2
+0xa2723d1d26721f97 0xb078f617085673e5 0xdc78f1d0abc15908
+0x943849291f25e178 0xbae22ce64af1206e 0x11c78544941df857
+0x770ade0044d5f1fc 0xbc2878e716899306 0x94790cb9dea053b5
+0x6bbde24988ce4f85 0xfeb8786c8d39aa85 0x1f97f351d463dcf9
+0x7b4e291afc0e51d5 0x1faae3799dba6603 0x25815979ba54f03b
+0xb02a9b623c9400f0 0xc4a07da834cbf427 0x8f5cc1076ea09031
+0x2dea3ec81ab08515 0xe98683d1fb2b46e0 0xd03c1f8e1d3ab77d
+0xf2c2c6d3e5f609ab 0x2a4de6d7e059318d 0x5ade1e2f78a73769
+0x2662dcf712c0a5f9 0x14207cc1750e2a61 0xa02d796041e7f4c5
+0x996adc6d965c2445 0x5dd4cb5847526843 0x0612d104e4e52c29
+0x627c637c39b6587a 0xc04c4f4dc793d508 0xdd6cc43e981b9b46
+0xec8c58354fba1cfb 0xbc57a1e2281000bf 0x856fe5ec82652f66
+0x2350e203ab340e42 0xb74978ac55eb75fa 0x844067a42c59d22a
+0x5bb67e9025bcb14b 0xc6882f42a876775d 0x537deec5107af383
+0xa1b9e0408bb030ae 0x96f5422b40d40266 0xe6cd16792e1a8f4f
+0xd0994934ced6fb04 0x48924c4026066397 0xe6554877dbefffdc
+0x8acdae02c1b72b73 0x7002431784308714 0xec6113b8ebf9216d
+0xccd7d92eb94a654d 0x55bf89d61ea45655 0x40dfe822e6fc1ea4
+0xd871edcfa00e4eae 0x2b162c2c595b2ce0 0xfe78e58f4fa84c74
+0x8761656ac9573dc6 0xfb1c05d5c05ad042 0xcd6868216c342eae
+0x438466743ae36016 0x090c7ae0a96ce094 0x84c585a68e7a3fd3
+0x0f6b9eec0252f718 0x8144d103959c9cc8 0x9228d0f9530fe13e
+0x8182289a02b0f3b8 0xc71701f3dc02224e 0xd4fea5e16b0d5abc
+0x1bf89249443455f9 0x4d55abb33ea80876 0x30983ba4fc324bb0
+0xbdbe2ca26b4c48f5 0x110c8c2a0ce2403f 0x28fb71fe1421a58e
+0x897ca36224a0aeeb 0x873de78eae3414fa 0x5fc8368456d0a9a5
+0x730c72bf20e283f8 0x7582ea92afd96933 0x109dc798de62815f
+0x1334281c26f62ce5 0xc7351153d0631deb 0x0c7023b82da633bf
+0xab1f57a9402eb30f 0xbb43ef1a4abfed2f 0xbb13ff409efeb1fc
+0x16c77c66e400694e 0x5426e09c1deea6a8 0x79166753b5237b34
+0x4dac916d35ab84db 0xb2ddcede1bc5dccf 0x2ca000aed2faada6
+0x07b93902e0f10ca3 0xb181252e99021c2e 0x2a7cbcd596a23023
+0x1d135b910a22e3cd 0x50cac25266319226 0xd58ef5433e09ed3d
+0x2ba2a95166295246 0x104b0b90d7f54fcb 0x581e62e6e1effb88
+0x3b45c52fb3a61216 0x9b49976a6f98f4eb 0x5ecfaa723c68195f
+0x8bbec72e20caaf68 0xc6bcc3dd73e2ad16 0xeba0ca03b8cb6ce6
+0x77ea36fa204a7dbc 0x032d39bedbe56a62 0xf602ba75320469bb
+0x75ab379ae76f35d0 0xde954c6d2bc62abc 0xc67dc6587d5cb845
+0x6dd3c792d70f1d9f 0xda648f505baf81a4 0x6db4fb04bf05696b
+0x171a2898c06495a2 0x70f7573328116d96 0xa1b113a37b65e83b
+0xb7dc61716efbe386 0x44ff43b2d0f72216 0xb7572d3ef3679377
+0x01aa46678e35f96e 0x4793e9cefb9c00c5 0xb1b1cef3fa3e69da
+0xd6d5ce0496318fda 0x69d94c713ea5f8ff 0xb472156b44138e70
+0xa02166bb98773ebb 0xb91af940dff04058 0x9ceb3ab0aa02a266
+0xe2100434f0fc2e43 0xb80cdd4bbf69e7d3 0xe7c922fe9b3e7dfd
+0x3efad5419796a29b 0x87066640dc12ad77 0x395836ee2932bf23
+0x8e31b88ecced3d2e 0x4961ec2d6c7a75bf 0x6cd4f08d12c54e2b
+0x9f4dffcf1ddaa230 0xde123965919adba9 0x85e8c4d2146b9a09
+0x9e08deb3621df068 0xe0fd112de4b1269a 0xce27bb4d2a13cc8f
+0x32da8b6a6be75911 0x74e5dc1240d07647 0x99f0a90a8e1744c6
+0x51f94d8711994a47 0x690ca685dce349e9 0xda0c25b35ec3c056
+0x1ddea693f42c7d0f 0x23ab3c219c040475 0x3117b89a22075ecf
+0x8b729c188838cc11 0xf70c2e398725c02c 0xb5bf35fe12a37678
+0x1f0d26466bf81f25 0x20035f8d7e67489e 0x536e07502c93e26d
+0x0289b3b250176742 0xf0808a555109f37a 0x57ad69a0b71ab441
+0x1c81fa7d03800bdf 0xdf4febd9d519ccce 0x3b197c4de921087e
+0xda7c01969094f8f2 0x1bc17b9c04f6c5a5 0x89e4e1442e1d96d2
+0x05ae8ea263a5551a 0x6a01bac34ab09948 0x0b3e4f6e4a88c0f0
+0x4ecd511935dc55fe 0xa86534eb3311fa12 0x5b30a6c3a1e4141f
+0xa7cb6e8360649ab6 0xac9f884e6cfdacf4 0xf91bedc58f459fbc
+0x0fc06df57546cd8d 0xbe67f7c136080899 0x56abb03af17d6abf
+0x724d1581df36e444 0x6d348658a786cf13 0x15657b71facd5394
+0x870833f43ea4f44c 0x0c6b401a4ac5a9cf 0xac11311d9fdc1be0
+0x476db893090dbf50 0x6413363a40a0cc4d 0x656747030824b882
+0xc7763af198a3a915 0xc75b4c22db7f00e4 0xae0908746889482f
+0x105715af45acd194 0x484b10f834c570c8 0x29ac2c4625e5ebe7
+0x1ceda07f07739b02 0x25b8f899688c1e3e 0x9dffd257982a8139
+0x6ab245d75fd640ad 0xfd4a57561629ffa3 0xc6d726b0652e522f
+0xbcee4526af1030dd 0x43ea89479729db21 0xf0293df6e9672dec
+0xd4bc658e845060e1 0xd3a76977a87cc56a 0x3e9d83e4f81d87ba
+0x37f417d10048bc8d 0x14813d8b4553c5ae 0x574426b7bcdf149a
+0x9f791183f3df9a17 0xb155609334da4198 0xb4c3a5805d905e94
+0x12be857025a4a3eb 0x987445ae6fa8f849 0x428543effdb537ca
+0x05184d6f0c8af720 0x5f5a26958fc4f110 0x327feea9b8040719
+0x20f1368a9c267469 0x142bd97ddce3145f 0xb6dde259f3962e5b
+0xa5b7dfcf9c74d178 0x4a18aeb68cb7ee39 0xb7cb562ea710998c
+0x192c6a2440aa4799 0x262f549ff36ca087 0x00f1af86f7765419
+0x4fcaee4d2099f234 0xcb4823b5e460d68f 0xb8739346ee15f7c7
+0xe43f60dfcb722680 0xe588a07930826597 0xf11726b1fc2109e9
+0x2424847c52585435 0x594ca4d5eeaee507 0x960a8b2c5b3f2315
+0xf967389af1bc6066 0x9f5e1091a412689b 0x62c846564646bc24
+0x700fc081c9ad8622 0x054e53d585fed025 0x8393936ba2f979b0
+0xb7c891fc56a0e600 0x2055bd49358e42d3 0x10cc9952ec1e0244
+0x5dec53aadd0f64cf 0x5e2b688880e60374 0x26e5823522a3d1f7
+0x59263a46b0645a04 0xf8ea1b64930f2e4b 0x36e9fd75c4e84705
+0x4d196437dc1e9d07 0xc5c13dbebb428a60 0x0dcfbb9e0dd9357a
+0x7431271fe51ae20f 0x7578310058c9bc41 0xedab646a7a08e4e7
+0x1f098e77ac40fb45 0xdeb6b54c5f31ee6a 0x7d4953f9398fdf64
+0x9f28504dddbf678f 0xe3a56dee1a79e9f0 0x5cc098e3879b9e87
+0xc0b3e036a7160df6 0x345202b60f7d5fb9 0xde5f13f2f373b5d5
+0x09be9e199ea0e9d9 0x750064ba7fef6ab6 0x8393d8eed970a861
+0x90b992785db1e2ce 0x66a200d30926cb63 0x08d2a52a1be49bcf
+0xb1d396ed4bd1e92d 0x0067b3f585de736c 0x3e203ee543d7bc8e
+0x7f660ac01586461f 0x2ef0b714fe7da812 0xd02030c778ab097c
+0xbe9b971cd60bc342 0xb1cf6f572000cd53 0x7090e8f533244d92
+0x4b3d3eb42bc0616f 0x4abcb0dc9345cec3 0x3315735b2eaa1dc1
+0x1968b53c2948c9fe 0x6f832e2ab85716fb 0xf680b4690f452fe7
+0x35634e189261c27a 0xb1655320b6881a39 0x3aa4f712d8d74eb6
+0xe7f8db6934680789 0x31a395a5ef322e71 0x9e8108f1e7bfbf9b
+0x32dc1fc02c568ef1 0x1c7e39ba8e98e717 0x98bb5d9f1719de9c
+0x64fad9fd4e9e04ac 0xc43c4ef84c1a749a 0xf172bfdcb082ae79
+0x775944b64825cc94 0x6752a18cc5cbd881 0x0bcd3e25a4cd6344
+0xdc00a7d88f1be5a6 0xfdf241d4b98b15c7 0x6ffccf1d3347e63e
+0xb4985dd04e81f326 0x798f4cfb661bbd4b 0xa2013d7415eb3df7
+0x879a90fdc3315936 0xdf037725e8829def 0x3eee6d747de55fd1
+0x1950ae14c16199ba 0xf32d6b0bbb81943f 0x6b813e655734bbf8
+0x63f5368932cfe7b8 0xf4ebf4ca577ec930 0xe7a5ec5bab21afba
+0x804feff47c04c4b7 0x2e3da596c444bf41 0x21be1d62146c81da
+0xfaa880de93886ac0 0x88987a63ea1750e6 0xce299bb9e0c40dd8
+0x6d2d3a48162f4f0d 0xfb873ecc6261d540 0x665184a16cbfeca9
+0x99f08162522947e1 0xb678afe1ae26f80e 0x81438967c30cec52
+0x7b84c7088847f470 0xdf8e250fe4392e3d 0xde420f611e212a49
+0x247e7a9bc296553f 0xcda4431f6214d257 0x0fe8ce1a5e7264ac
+0x7e2e9d87db4e18fc 0x12e9b0a508d5e4e0 0xdef99c379a602fc6
+0x772b91fb215e3f6c 0x29bb880dcd669c11 0x604e2a3e80d1a980
+0x62db806e1dacccb7 0x5a9925e5d386b369 0x464807efe2c001cb
+0x0681067ba9e69114 0xc4d7c8c2a7123d6a 0x4a3285f39878a215
+0x7432ddf4653a9cb8 0xd007940d70c24b13 0x5608cb9f87571fc8
+0x7a2c5b232b41ebdd 0x99245a3f8c434fd8 0x8acbdd231982f5a2
+0xfa50e2c5460ba07f 0xb2b5383501d97388 0x91722d48b0a05a3e
+0xe92bd4a4ad9bd471 0xf8b226909751d1ac 0x3a84feeb4efe53ea
+0x1bf2c0769fe54fae 0x1f06a43bf2b2bb23 0x6d89b57008409736
+0x68d2563f7fc1319a 0x4cca7c28306c60d5 0x45532d245acdef1b
+0xda535f9dec96bbab 0x25451d82b9aa12b3 0xc2a354b5d8c63228
+0x1c1f97d0851becc0 0x9324fa1d5e1b4a44 0xdb312686295300fa
+0x92e8ee2945f76afc 0x102b3df2dea6e8b2 0x309d7a9e07174ebf
+0xa0d2445e0bebc266 0xdfb983a73c6afd26 0x52cfe364f3957e2d
+0x72b6f2ad68342515 0x1383fb2184f4ab50 0xfc50740a6dc0d7a0
+0x12c03f1b30b409a3 0x195f3359d8d3d697 0xb1a696de263a9206
+0xbfa9d96984833e72 0xbc6b844babe41595 0xd72507fee5d57c9b
+0x48ad095be6f55861 0x76942ad5903bb97c 0xd4b006bd9e4de5eb
+0x0ae6d222ed88e74c 0x37aa525e18213b95 0x3e87311c62589252
+0x9db5c12f91a6f728 0xe9dd60e4310fa419 0xe5ab03d0f0a1f978
+0xc699639ec932afb5 0xc2e90bc8cddee0ab 0xafa68bbcb8d7ddd0
+0x7a8c80def2c5c00f 0xb6caf5c351a3f9e2 0x149e219d648cff15
+0x55e3b95cfb7941c9 0x445df7728165470e 0x246e3b9dde51bb60
+0x7f47c1e9e9e105cc 0x2a78b27625fe4e3e 0xf7e647125a81184b
+0x35615c5ac93e3ee2 0xab3d5ee5f30c6542 0x66adaa81be80a255
+0xba3dee726484d20a 0x81fd51257a971385 0xb6a79592d5164e85
+0xd3333835d170e9bd 0x5debdffeff4d63b0 0xe405e71c9c7fe247
+0x64df8c009c73b84e 0x850fb06b2c70c144 0x1619917459f3cff2
+0x50738716b1e7b305 0xa361ad73caf95820 0x6cbee1e292279728
+0xd3a60974113b480f 0x1333cc2da08bd7d0 0x9e983992c88d0c05
+0xf3885e54ee8d8864 0xbb72960373ad2100 0x7fce723754efe9c5
+0x0d04185678b2aa93 0x2bd575cf4c798b50 0x03deee9bed8c238b
+0x4dd3e83aef99b423 0x49575eddb23bcd0c 0xa86925782bd40519
+0x61604398fc35ed8b 0x0030c4447d5843d7 0x97f557e742d44f66
+0xe478030253f33ed2 0xb5b8b264f2499aab 0xa66da3eef7c6a8b7
+0x48c5802dde4e9c48 0x892fe0c182a44518 0x40da78f7fc26965f
+0xef69f38439b57f01 0x41ce76b97047c1f5 0x11d6df8a7e933468
+0x0e68110350ac736e 0x4993e7a6fe133ef1 0x3ee9280e32bf67ad
+0x9dd5df301cd57953 0x1e541a2e250db81e 0x193c71118501bda9
+0xef943420618b4a08 0x88496f019fcee0bc 0xbea7b0911223aabe
diff --git a/libutil/mt19937ar/mt19937ar.readme b/libutil/mt19937ar/mt19937ar.readme
new file mode 100755
index 0000000..c3a9c41
--- /dev/null
+++ b/libutil/mt19937ar/mt19937ar.readme
@@ -0,0 +1,74 @@
+This is a Mersenne Twister pseudorandom number generator
+with period 2^19937-1 with improved initialization scheme,
+modified on 2002/1/26 by Takuji Nishimura and Makoto Matsumoto.
+
+Contents of this tar ball:
+readme-mt.txt this file
+mt19937ar.c the C source (ar: initialize by ARray)
+mt19937ar.out Test outputs of six types of generators, 1000 for each
+
+1. Initialization
+ The initialization scheme for the previous versions of MT
+(e.g. 1999/10/28 version or earlier) has a tiny problem, that
+the most significant bits of the seed are not well reflected
+to the state vector of MT.
+
+This version (2002/1/26) has two initialization schemes:
+init_genrand(seed) and init_by_array(init_key, key_length).
+
+init_genrand(seed) initializes the state vector by using
+one unsigned 32-bit integer "seed", which may be zero.
+
+init_by_array(init_key, key_length) initializes the state vector
+by using an array init_key[] of unsigned 32-bit integers
+of length key_length. If key_length is smaller than 624,
+then each array of 32-bit integers gives distinct initial
+state vector. This is useful if you want a larger seed space
+than 32-bit word.
+
+2. Generation
+After initialization, the following type of pseudorandom numbers
+are available.
+
+genrand_int32() generates unsigned 32-bit integers.
+genrand_int31() generates unsigned 31-bit integers.
+genrand_real1() generates uniform real in [0,1] (32-bit resolution).
+genrand_real2() generates uniform real in [0,1) (32-bit resolution).
+genrand_real3() generates uniform real in (0,1) (32-bit resolution).
+genrand_res53() generates uniform real in [0,1) with 53-bit resolution.
+
+Note: the last five functions call the first one.
+If you need more speed from these five functions, you may
+avoid the function call by copying the body of genrand_int32() into
+each of them and replacing its final return() accordingly.
+
+3. main()
+main() is an example to initialize with an array of length 4,
+then 1000 outputs of unsigned 32-bit integers,
+then 1000 outputs of real [0,1) numbers.
+
+4. The outputs
+The output of the mt19937ar.c is in the file mt19937ar.out.
+If you revise or translate the code, check the output
+by using this file.
+
+5. Cryptography
+This generator is not cryptographically secure.
+You need to use a one-way (or hash) function to obtain
+a secure random sequence.
+
+6. Correspondence
+See:
+URL http://www.math.keio.ac.jp/matumoto/emt.html
+email matumoto at math.keio.ac.jp, nisimura at sci.kj.yamagata-u.ac.jp
+
+7. Reference
+M. Matsumoto and T. Nishimura,
+"Mersenne Twister: A 623-Dimensionally Equidistributed Uniform
+Pseudo-Random Number Generator",
+ACM Transactions on Modeling and Computer Simulation,
+Vol. 8, No. 1, January 1998, pp 3--30.
+
+-------
+Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+All rights reserved.
diff --git a/libutil/mt19937ar/tt800.c b/libutil/mt19937ar/tt800.c
new file mode 100644
index 0000000..d4f110b
--- /dev/null
+++ b/libutil/mt19937ar/tt800.c
@@ -0,0 +1,64 @@
+/* http://random.mat.sbg.ac.at/ftp/pub/data/tt800.c */
+
+/* A C-program for TT800 : July 8th 1996 Version */
+/* by M. Matsumoto, email: matumoto at math.keio.ac.jp */
+/* genrand() generate one pseudorandom number with double precision */
+/* which is uniformly distributed on [0,1]-interval */
+/* for each call. One may choose any initial 25 seeds */
+/* except all zeros. */
+
+/* See: ACM Transactions on Modelling and Computer Simulation, */
+/* Vol. 4, No. 3, 1994, pages 254-266. */
+
+#include <stdio.h>
+#define N 25
+#define M 7
+
+/* Return the next TT800 output scaled to a double in [0,1]. */
+double
+genrand()
+{
+  static unsigned long x[N] = {   /* state: initial 25 seeds, any set except all zeros */
+    0x95f24dab, 0x0b685215, 0xe76ccae7, 0xaf3ec239, 0x715fad23,
+    0x24a590ad, 0x69e4b5ef, 0xbf456141, 0x96bc1b7b, 0xa7bdf825,
+    0xc1de75b7, 0x8858a9c9, 0x2da87693, 0xb657f9dd, 0xffdc8a9f,
+    0x8121da71, 0x8b823ecb, 0x885d05f5, 0x4e20cd47, 0x5a9ad5d9,
+    0x512c0c03, 0xea857ccd, 0x4cc1d30f, 0x8891a8a1, 0xa6b7aadb
+  };
+  static const unsigned long mag01[2] = { 0x0, 0x8ebfd028 };  /* {0, magic vector `a'}: don't change */
+  static int k = 0;               /* index of the next state word to output */
+  unsigned long y;
+
+  if (k == N) {
+    /* All N words are consumed: regenerate the whole state in place.  Word i
+       is combined with word (i+M) mod N, which for i >= N-M is a word that
+       was already refreshed earlier in this same pass. */
+    int i;
+    for (i = 0; i < N; i++) {
+      unsigned long w = x[i];
+      x[i] = x[(i + M) % N] ^ (w >> 1) ^ mag01[w & 1];
+    }
+    k = 0;
+  }
+
+  y  = x[k++];
+  y ^= (y << 7) & 0x2b5b2500;     /* s and b, magic vectors */
+  y ^= (y << 15) & 0xdb8b0000;    /* t and c, magic vectors */
+  y &= 0xffffffff;                /* keeps y to 32 bits when unsigned long is wider */
+
+  /* Added by Makoto Matsumoto in the 1996 version to improve the correlation
+     of the lower bits; delete this line to use the code published in 1994. */
+  y ^= (y >> 16);
+
+  return (double) y / (unsigned long) 0xffffffff;
+}
+
+/* this main() outputs the first 100000 generated numbers, eight per line */
+int main(void)
+{
+  int j;
+  for (j = 0; j < 100000; j++)
+    printf("%5f %s", genrand(), (j % 8 == 7) ? "\n" : "");  /* newline after every 8th value */
+  printf("\n");
+  return 0;
+}
diff --git a/libutil/palloc.c b/libutil/palloc.c
new file mode 100644
index 0000000..db848ad
--- /dev/null
+++ b/libutil/palloc.c
@@ -0,0 +1,236 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "util.h"
+
+typedef struct pallocroot pallocroot;
+typedef struct pallocnode pallocnode;
+
+// Allocator state ("handle").  _dbg levels: 0 -- print nothing,
+// 1 -- print block allocations and releases,
+// 2 -- also print every individual allocation.
+
+struct pallocroot {
+ size_t _bs; // size in bytes of each standard block
+ pallocnode *_nl; // nodeList: head of the linked list of all blocks
+ pallocnode *_cn; // currentNode: block that allocations are carved from
+ int _dbg; // debug level (see above); 0 disables all messages
+};
+
+struct pallocnode {
+ size_t _cp; // currentPosition: number of bytes of _dt already handed out
+ char *_dt; // data: start of the malloc'd memory for this block
+ pallocnode *_nx; // next pallocnode in the list, NULL at the end
+};
+
+extern pallocroot _palloc_stuff;
+// Default allocator used by palloc()/pfree(): 128MB blocks, empty list, debug off.
+pallocroot _palloc_stuff = { 128 * 1024 * 1024, NULL, NULL, 0 };
+
+// Allocate size bytes with malloc(); on failure print a diagnostic and exit(1).
+static void *
+really_allocate(size_t size) {
+  void *mem = malloc(size);
+  if (mem != NULL)
+    return(mem);
+
+  fprintf(stderr, "palloc()-- can't allocate "sizetFMT" bytes: %s.\n", size, strerror(errno));
+  exit(1);
+}
+
+// Set the block size of the default allocator; ignored once it owns a block.
+void psetblocksize(size_t size) {
+  if (_palloc_stuff._nl == NULL)
+    _palloc_stuff._bs = size;
+}
+
+// Return the block size currently configured for the default allocator.
+size_t pgetblocksize(void) {
+  return _palloc_stuff._bs;
+}
+
+// Set the debug level of the default allocator (0 none, 1 blocks, 2 all).
+void psetdebug(int on) {
+  _palloc_stuff._dbg = on;
+}
+
+// Create an allocator handle; size 0 selects the default 128MB block size.
+void*
+pallochandle(size_t size) {
+  pallocroot *h = (pallocroot *)malloc(sizeof(pallocroot));
+  if (h == NULL) {
+    fprintf(stderr, "pallochandle()-- can't allocate a handle!\n");
+    exit(1);
+  }
+  h->_bs  = (size != 0) ? size : 128 * 1024 * 1024;
+  h->_nl  = h->_cn = NULL;   // no blocks yet; the first one is made on first use
+  h->_dbg = 0;
+  return(h);
+}
+
+
+// Free only the handle structure itself.  Blocks still owned by the handle
+// are NOT released here -- call pfree2(handle) first to avoid leaking them.
+void pfreehandle(void *handle) {
+  free(handle);
+}
+
+// Release every block owned by the handle (NULL selects the default
+// allocator).  The handle itself stays valid and can be reused.
+void
+pfree2(void *handle) {
+  pallocroot *root   = (handle != NULL) ? (pallocroot *)handle : &_palloc_stuff;
+  pallocnode *node   = root->_nl;
+  size_t      nbytes = 0;   // sum of the used bytes (_cp) of the freed blocks
+  size_t      nblock = 0;   // number of blocks freed
+
+  while (node != NULL) {
+    pallocnode *next = node->_nx;   // remember the successor before freeing
+    nbytes += node->_cp;
+    nblock += 1;
+
+    free(node->_dt);
+    free(node);
+    node = next;
+  }
+
+  if (root->_dbg > 0)
+    fprintf(stderr, "palloc()-- "sizetFMT" bytes in "sizetFMT" blocks returned to free store.\n", nbytes, nblock);
+
+  root->_nl = NULL;
+  root->_cn = NULL;
+}
+
+// Release every block held by the default allocator.
+void pfree(void) {
+  pfree2(&_palloc_stuff);
+}
+
+// Allocate size bytes (rounded up to a multiple of 8) from handle; a NULL handle selects the default allocator.
+void *
+palloc2(size_t size, void *handle) {
+ pallocroot *root = (pallocroot *)handle;
+
+ if (root == NULL)
+ root = &_palloc_stuff;
+
+ // Round size up to the next multiple of 8.  A request of 0 bytes
+ // returns NULL without allocating anything.
+ if (size & 0x7) {
+ size >>= 3;
+ size++;
+ size <<= 3;
+ }
+ if (size == 0)
+ return(0L);
+
+ // First use of this handle: create the initial block and make its
+ // node both the head of the node list and the current node.
+ if (root->_nl == NULL) {
+ root->_nl = (pallocnode *)really_allocate(sizeof(pallocnode));
+ root->_cn = root->_nl;
+
+ if (root->_dbg > 0)
+ fprintf(stderr, "palloc()-- Inital block of "sizetFMT" bytes at %p.\n", root->_bs, root->_cn);
+
+ root->_cn->_cp = 0;
+ root->_cn->_dt = (char *)really_allocate(root->_bs);
+ root->_cn->_nx = NULL;
+ }
+
+
+ // A request gets a private node of exactly 'size' bytes when either:
+ //
+ // 1) it is bigger than the block size, or
+ // 2) it does not fit in the space remaining in the current block
+ // AND it is bigger than the space already used in the current
+ // block.
+ //
+ // In case 2 the current block is kept rather than being retired for
+ // a fresh block, so its remaining space stays available to later,
+ // smaller requests.
+ //
+ // The private node is marked fully used (_cp = size) and is pushed
+ // onto the head of the node list; the current node (_cn) does not
+ // change.  The three tests below correspond to: bigger than the
+ // block size, won't fit in the current block, bigger than the used
+ // part of the current block.
+ if ((size > root->_bs) ||
+ ((size > root->_bs - root->_cn->_cp) &&
+ (size > root->_cn->_cp))) {
+ pallocnode *n;
+
+ n = (pallocnode *)really_allocate(sizeof(pallocnode));
+ n->_cp = size;
+ n->_dt = (char *)really_allocate(size);
+ n->_nx = root->_nl;
+
+ if (root->_dbg > 0)
+ fprintf(stderr, "palloc()-- New needs "sizetFMT" bytes: custom new block at %p.\n",
+ size,
+ n);
+
+ root->_nl = n;
+ if (root->_cn == 0L) // defensive: _cn was already set by the initial-block code above
+ root->_cn = n;
+
+ return(n->_dt);
+ }
+
+
+ // The current block can't hold the request: append a fresh standard
+ // block after it and make that new block the current one.
+ if (size + root->_cn->_cp > root->_bs) {
+ root->_cn->_nx = (pallocnode *)really_allocate(sizeof(pallocnode));
+
+ if (root->_dbg > 0)
+ fprintf(stderr, "palloc()-- Old block %.3f%% used ("sizetFMT" bytes remaining), new needs "sizetFMT" bytes: new block of "sizetFMT" bytes at %p.\n",
+ 100.0 * root->_cn->_cp / root->_bs,
+ root->_bs - root->_cn->_cp,
+ size,
+ root->_bs,
+ root->_cn->_nx);
+
+ root->_cn = root->_cn->_nx;
+ root->_cn->_cp = 0;
+ root->_cn->_dt = (char *)really_allocate(root->_bs);
+ root->_cn->_nx = NULL;
+ }
+
+
+ // Carve 'size' bytes off the current block: advance _cp first, then
+ // return the address at which the carved region starts.
+ root->_cn->_cp += size;
+
+ if (root->_dbg > 1)
+ fprintf(stderr, "palloc()-- Old block %.3f%% used ("sizetFMT" bytes remaining): returning "sizetFMT" bytes at %p.\n",
+ 100.0 * root->_cn->_cp / root->_bs,
+ root->_bs - root->_cn->_cp,
+ size, root->_cn->_dt + root->_cn->_cp - size);
+
+ return(root->_cn->_dt + root->_cn->_cp - size);
+}
+
+
+
+// Allocate size bytes from the default allocator.
+void *palloc(size_t size) {
+  return palloc2(size, &_palloc_stuff);
+}
+
+
+// Print the block size and per-block usage of the handle to stderr; a NULL
+// handle selects the default allocator, as in palloc2() and pfree2().
+void
+pdumppalloc(void *handle) {
+  pallocroot *root = (handle != NULL) ? (pallocroot *)handle : &_palloc_stuff;
+  pallocnode *n;
+  fprintf(stderr, "palloc dump\n");
+  fprintf(stderr, ""sizetFMT" bytes per block\n", root->_bs);
+  for (n = root->_nl; n != NULL; n = n->_nx)   // walk the list from its head
+    fprintf(stderr, "%p: currentPosition: "sizetFMT" bytes used%s\n",
+            (void *)n, n->_cp, (n == root->_cn) ? ", current block" : "");
+}
+
diff --git a/libutil/qsort_mt.c b/libutil/qsort_mt.c
new file mode 100644
index 0000000..bb0b722
--- /dev/null
+++ b/libutil/qsort_mt.c
@@ -0,0 +1,406 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Multithread implementation Copyright (c) 2006, 2007 Diomidis Spinellis.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+//static char sccsid[] = "@(#)qsort.c 8.1 (Berkeley) 6/4/93";
+//__FBSDID("$FreeBSD: src/lib/libc/stdlib/qsort.c,v 1.12 2002/09/10 02:04:49 wollman Exp $");
+
+//#include <sys/cdefs.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+
+#include <sys/types.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#ifdef __FreeBSD__
+#include <pmc.h>
+#endif
+
+typedef int cmp_t(const void *, const void *);
+
+static inline char *med3(char *, char *, char *, cmp_t *);
+static inline void swapfunc(char *, char *, int, int);
+
+/* Smaller of a and b.  Fully parenthesized so it can be used inside a larger expression; an argument may still be evaluated twice. */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+#define swapcode(TYPE, parmi, parmj, n) { \
+ long i = (n) / sizeof (TYPE); \
+ TYPE *pi = (TYPE *) (parmi); \
+ TYPE *pj = (TYPE *) (parmj); \
+ do { \
+ TYPE t = *pi; \
+ *pi++ = *pj; \
+ *pj++ = t; \
+ } while (--i > 0); \
+ }
+
+
+/*
+ * Exchange the n bytes at a with the n bytes at b.  When swaptype
+ * is 0 or 1 the exchange is done a long at a time, otherwise a
+ * char at a time.
+ *
+ * Written as a prototype-style definition: the old identifier-list
+ * form is obsolescent and is rejected by C23 compilers.  The
+ * parameter types are the same as before.
+ */
+static inline void
+swapfunc(char *a, char *b, int n, int swaptype)
+{
+	if (swaptype <= 1) {
+		swapcode(long, a, b, n)
+	} else {
+		swapcode(char, a, b, n)
+	}
+}
+
+#define swap(a, b) \
+ if (swaptype == 0) { \
+ long t = *(long *)(a); \
+ *(long *)(a) = *(long *)(b); \
+ *(long *)(b) = t; \
+ } else \
+ swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+#define CMP(x, y) (cmp((x), (y)))
+
+/*
+ * Return whichever of the pointers a, b, c refers to the median of
+ * the three elements, as ordered by cmp.
+ */
+static inline char *
+med3(char *a, char *b, char *c, cmp_t *cmp)
+{
+	if (CMP(a, b) < 0) {
+		if (CMP(b, c) < 0)
+			return b;
+		return (CMP(a, c) < 0) ? c : a;
+	}
+
+	if (CMP(b, c) > 0)
+		return b;
+	return (CMP(a, c) < 0) ? a : c;
+}
+
+/*
+ * We use some elaborate condition variables and signalling
+ * to ensure a bound of the number of active threads at
+ * 2 * maxthreads and the size of the thread data structure
+ * to maxthreads.
+ */
+
+/*
+ * State of one pool slot (the st member of struct qsort).  A worker
+ * sleeps while its slot is ts_idle and proceeds once the slot has
+ * been switched to ts_work or ts_term.
+ */
+enum thread_state {
+ ts_idle, /* Idle, waiting for instructions. */
+ ts_work, /* Has work to do. */
+ ts_term /* Asked to terminate. */
+};
+
+/*
+ * Variant part passed to qsort invocations.  One of these exists
+ * per pool slot; the thread running in the slot receives a pointer
+ * to it as its start argument.  a and n describe the sub-array the
+ * slot should sort next and are filled in, with mtx_st held, by
+ * whoever hands the slot its work (qsort_mt and qsort_algo).
+ */
+struct qsort {
+ enum thread_state st; /* For coordinating work. */
+ struct common *common; /* Common shared elements. */
+ void *a; /* Array base. */
+ size_t n; /* Number of elements. */
+ pthread_t id; /* Thread id. */
+ pthread_mutex_t mtx_st; /* For signalling state change. */
+ pthread_cond_t cond_st; /* For signalling state change. */
+};
+
+/*
+ * Invariant common part, shared across invocations.  A single
+ * instance lives on qsort_mt's stack and every pool slot points at
+ * it.  idlethreads is the one member that changes while sorting:
+ * allocate_thread and qsort_thread update it with mtx_al held
+ * (qsort_mt also decrements it once when handing out the first
+ * batch).
+ */
+struct common {
+ int swaptype; /* Code to use for swapping */
+ size_t es; /* Element size. */
+ cmp_t *cmp; /* Comparison function */
+ int nthreads; /* Total number of pool threads. */
+ int idlethreads; /* Number of idle threads in pool. */
+ int forkelem; /* Minimum number of elements for a new thread. */
+ struct qsort *pool; /* Fixed pool of threads. */
+ pthread_mutex_t mtx_al; /* For allocating threads in the pool. */
+};
+
+static void *qsort_thread(void *p);
+
+/*
+ * The multithreaded qsort public interface.
+ *
+ * Sorts the n elements of size es that start at a, ordered by cmp,
+ * using a pool of maxthreads worker threads.  forkelem is stored in
+ * the shared state and used by qsort_algo to decide when a
+ * partition is large enough to hand to another thread.
+ *
+ * Falls back to plain qsort(3) when n < forkelem, when
+ * maxthreads <= 1, or when creating any mutex, condition variable,
+ * the pool array or a thread fails.
+ */
+
+void
+qsort_mt(void *a, size_t n, size_t es, cmp_t *cmp, int maxthreads, int forkelem)
+{
+ struct qsort *qs;
+ struct common c;
+ int i, islot;
+ int bailout = 1;
+
+ /*
+ * NOTE(review): n is size_t and forkelem is int, so forkelem is
+ * converted to unsigned for this comparison; a negative forkelem
+ * would compare as a very large value.
+ */
+ if (n < forkelem)
+ goto f1;
+ errno = 0;
+
+ if (maxthreads <= 1)
+ goto f1;
+
+ /* Try to initialize the resources we need. */
+ if (pthread_mutex_init(&c.mtx_al, NULL) != 0)
+ goto f1;
+ if ((c.pool = (struct qsort *)calloc(maxthreads, sizeof(struct qsort))) ==NULL)
+ goto f2;
+ /*
+ * islot counts the slots that are completely set up (mutex,
+ * condition variable and running thread).  A slot that fails
+ * part-way undoes its own partial setup before jumping to f3,
+ * so the cleanup loop only has to handle slots [0, islot).
+ */
+ for (islot = 0; islot < maxthreads; islot++) {
+ qs = &c.pool[islot];
+ if (pthread_mutex_init(&qs->mtx_st, NULL) != 0)
+ goto f3;
+ if (pthread_cond_init(&qs->cond_st, NULL) != 0) {
+ pthread_mutex_destroy(&qs->mtx_st);
+ goto f3;
+ }
+ qs->st = ts_idle;
+ qs->common = &c;
+ if (pthread_create(&qs->id, NULL, qsort_thread, qs) != 0) {
+ pthread_mutex_destroy(&qs->mtx_st);
+ pthread_cond_destroy(&qs->cond_st);
+ goto f3;
+ }
+ }
+
+ /* All systems go. */
+ bailout = 0;
+
+ /*
+ * Initialize common elements.  The workers created above stay
+ * blocked while their slot is ts_idle, and no slot leaves
+ * ts_idle until the first batch is handed out below.
+ *
+ * swaptype is 2 when the base address or es is not a multiple
+ * of sizeof(long), 0 when es is exactly sizeof(long), and 1
+ * otherwise.
+ */
+ c.swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+ c.es = es;
+ c.cmp = cmp;
+ c.forkelem = forkelem;
+ c.idlethreads = c.nthreads = maxthreads;
+
+ /* Hand out the first work batch. */
+ qs = &c.pool[0];
+ pthread_mutex_lock(&qs->mtx_st);
+ qs->a = a;
+ qs->n = n;
+ qs->st = ts_work;
+ c.idlethreads--;
+ pthread_cond_signal(&qs->cond_st);
+ pthread_mutex_unlock(&qs->mtx_st);
+
+ /*
+ * Wait for all threads to finish, and
+ * free acquired resources.
+ *
+ * This point is reached both on success (bailout == 0) and
+ * after a partial setup failure (bailout == 1).  On failure the
+ * already-running workers are told to terminate here; on
+ * success the workers switch each other to ts_term themselves
+ * (see qsort_thread) and pthread_join simply waits for them.
+ */
+ f3: for (i = 0; i < islot; i++) {
+ qs = &c.pool[i];
+ if (bailout) {
+ pthread_mutex_lock(&qs->mtx_st);
+ qs->st = ts_term;
+ pthread_cond_signal(&qs->cond_st);
+ pthread_mutex_unlock(&qs->mtx_st);
+ }
+ pthread_join(qs->id, NULL);
+ pthread_mutex_destroy(&qs->mtx_st);
+ pthread_cond_destroy(&qs->cond_st);
+ }
+ free(c.pool);
+ f2: pthread_mutex_destroy(&c.mtx_al);
+ if (bailout) {
+ /* XXX should include a syslog call here */
+ fprintf(stderr, "Resource initialization failed; bailing out.\n");
+ /*
+ * f1 sits inside this block so that the early fallbacks
+ * jump straight to qsort(3) without printing the message.
+ */
+ f1: qsort(a, n, es, cmp);
+ }
+}
+
+
+/*
+ * Allocate an idle thread from the pool, lock its
+ * mutex, change its state to work, decrease the number
+ * of idle threads, and return a
+ * pointer to its data area.
+ * Return NULL, if no thread is available.
+ *
+ * The caller must fill in the slot's a and n, signal cond_st and
+ * then unlock mtx_st.
+ *
+ * The slot's mtx_st is acquired BEFORE st is switched to ts_work.
+ * A worker that has just marked itself idle re-checks st with
+ * mtx_st held; if st were changed first, that worker could see
+ * ts_work, skip waiting, and start sorting with the a and n left
+ * over from its previous job.  Holding mtx_st across the state
+ * change keeps the worker blocked until the caller has published
+ * the new work.  The lock order (mtx_al, then mtx_st) is the same
+ * one qsort_thread uses.
+ */
+static struct qsort *
+allocate_thread(struct common *c)
+{
+	struct qsort *slot;
+	int i;
+
+	pthread_mutex_lock(&c->mtx_al);
+	for (i = 0; i < c->nthreads; i++) {
+		slot = &c->pool[i];
+		if (slot->st != ts_idle)
+			continue;
+
+		pthread_mutex_lock(&slot->mtx_st);
+		slot->st = ts_work;
+		c->idlethreads--;
+		pthread_mutex_unlock(&c->mtx_al);
+		return (slot);
+	}
+	pthread_mutex_unlock(&c->mtx_al);
+	return (NULL);
+}
+
+/*
+ * Thread-callable quicksort.
+ *
+ * Sorts the qs->n elements starting at qs->a using the element
+ * size, comparison function and swap type stored in qs->common.
+ * After partitioning, the left part is either handed to an idle
+ * pool thread or sorted by a recursive call; the right part is
+ * sorted by looping back to the top of this function.
+ */
+static void
+qsort_algo(struct qsort *qs)
+{
+ char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+ long d, r, swaptype, swap_cnt;
+ void *a; /* Array of elements. */
+ size_t n, es; /* Number of elements; size. */
+ cmp_t *cmp;
+ long nl, nr;
+ struct common *c;
+ struct qsort *qs2;
+ pthread_t id;
+
+ /*
+ * Initialize qsort arguments.  es, cmp and swaptype are read by
+ * the swap/vecswap/CMP macros below.  id is assigned but not
+ * otherwise used in this function.
+ */
+ id = qs->id;
+ c = qs->common;
+ es = c->es;
+ cmp = c->cmp;
+ swaptype = c->swaptype;
+ a = qs->a;
+ n = qs->n;
+ top:
+
+ /* From here on qsort(3) business as usual. */
+ swap_cnt = 0;
+ /* Fewer than 7 elements: plain insertion sort. */
+ if (n < 7) {
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm;
+ pl > (char *)a && CMP(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+ /*
+ * Pivot selection: the middle element for n == 7, the median of
+ * first/middle/last for larger n, and for n > 40 the median of
+ * three such medians taken at spacing d.
+ */
+ pm = (char *)a + (n / 2) * es;
+ if (n > 7) {
+ pl = a;
+ pn = (char *)a + (n - 1) * es;
+ if (n > 40) {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
+ pm = med3(pm - d, pm, pm + d, cmp);
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
+ }
+ pm = med3(pl, pm, pn, cmp);
+ }
+ /* Move the pivot to the front; it is compared against as 'a'. */
+ swap(a, pm);
+ pa = pb = (char *)a + es;
+
+ /*
+ * Partition.  pb scans up and pc scans down; elements equal to
+ * the pivot are parked at the two ends (below pa and above pd).
+ */
+ pc = pd = (char *)a + (n - 1) * es;
+ for (;;) {
+ while (pb <= pc && (r = CMP(pb, a)) <= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (r = CMP(pc, a)) >= 0) {
+ if (r == 0) {
+ swap_cnt = 1;
+ swap(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap(pb, pc);
+ swap_cnt = 1;
+ pb += es;
+ pc -= es;
+ }
+ /* No swap was needed during partitioning: finish with insertion sort. */
+ if (swap_cnt == 0) { /* Switch to insertion sort */
+ for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
+ for (pl = pm;
+ pl > (char *)a && CMP(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+
+ /* Move the pivot-equal runs parked at the ends into the middle. */
+ pn = (char *)a + n * es;
+ r = min(pa - (char *)a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = min(pd - pc, pn - pd - es);
+ vecswap(pb, pn - r, r);
+
+ /* Element counts of the left and right parts still to be sorted. */
+ nl = (pb - pa) / es;
+ nr = (pd - pc) / es;
+
+ /*
+ * Now try to launch subthreads.  The left part goes to another
+ * pool thread only when both parts exceed forkelem and
+ * allocate_thread finds an idle slot.  allocate_thread returns
+ * with the slot's mtx_st locked, so the work is published here
+ * and the mutex released afterwards.  Otherwise the left part is
+ * sorted by a recursive call that reuses qs.
+ */
+ if (nl > c->forkelem && nr > c->forkelem &&
+ (qs2 = allocate_thread(c)) != NULL) {
+ qs2->a = a;
+ qs2->n = nl;
+ pthread_cond_signal(&qs2->cond_st);
+ pthread_mutex_unlock(&qs2->mtx_st);
+ } else if (nl > 0) {
+ qs->a = a;
+ qs->n = nl;
+ qsort_algo(qs);
+ }
+ /* The right part is handled by iterating instead of recursing. */
+ if (nr > 0) {
+ a = pn - nr * es;
+ n = nr;
+ goto top;
+ }
+}
+
+/*
+ * Start routine of every pool thread; p is the thread's own
+ * struct qsort slot.
+ *
+ * The thread sleeps until its slot leaves ts_idle.  If it was told
+ * to terminate it returns; otherwise it sorts the range described
+ * by the slot, marks itself idle again and goes back to sleep.  The
+ * thread that finds every pool thread idle after finishing its job
+ * switches all other slots to ts_term, wakes them, and returns.
+ *
+ * Always returns NULL.
+ *
+ * Unlike the earlier version this does not read qs->id: the value
+ * was unused, and reading it here could race with pthread_create
+ * still storing the id in the creating thread.  The slot state is
+ * also sampled while mtx_st is held rather than after releasing it.
+ */
+static void *
+qsort_thread(void *p)
+{
+	struct qsort *qs = p;
+	struct common *c = qs->common;
+	struct qsort *other;
+	enum thread_state st;
+	int i;
+
+	for (;;) {
+		/* Wait for work to be allocated. */
+		pthread_mutex_lock(&qs->mtx_st);
+		while (qs->st == ts_idle)
+			pthread_cond_wait(&qs->cond_st, &qs->mtx_st);
+		st = qs->st;
+		pthread_mutex_unlock(&qs->mtx_st);
+
+		if (st == ts_term)
+			return (NULL);
+		assert(st == ts_work);
+
+		qsort_algo(qs);
+
+		pthread_mutex_lock(&c->mtx_al);
+		qs->st = ts_idle;
+		c->idlethreads++;
+		if (c->idlethreads == c->nthreads)
+			break;		/* Everyone is idle: the sort is done. */
+		pthread_mutex_unlock(&c->mtx_al);
+	}
+
+	/* Still holding mtx_al: tell every other pool thread to exit. */
+	for (i = 0; i < c->nthreads; i++) {
+		other = &c->pool[i];
+		if (other == qs)
+			continue;
+		pthread_mutex_lock(&other->mtx_st);
+		other->st = ts_term;
+		pthread_cond_signal(&other->cond_st);
+		pthread_mutex_unlock(&other->mtx_st);
+	}
+	pthread_mutex_unlock(&c->mtx_al);
+	return (NULL);
+}
diff --git a/libutil/readBuffer.C b/libutil/readBuffer.C
new file mode 100644
index 0000000..d49d795
--- /dev/null
+++ b/libutil/readBuffer.C
@@ -0,0 +1,284 @@
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+
+//  Open 'filename' for reading.
+//
+//  A bufferMax of zero selects memory mapped I/O (via mapFile());
+//  any other value is the size of the read buffer to allocate.
+//
+//  Standard input is used when filename is "-", or when filename is
+//  null and stdin is not a terminal; in that case a bufferMax of zero
+//  is replaced by 32KB.  A null filename with a terminal on stdin is
+//  a fatal error.
+//
+readBuffer::readBuffer(const char *filename, uint64 bufferMax) {
+
+  _filename    = 0L;
+  _file        = 0;
+  _filePos     = 0;
+  _mmap        = false;
+  _stdin       = false;
+  _eof         = false;
+  _bufferPos   = 0;
+  _bufferLen   = 0;
+  _bufferMax   = 0;
+  _buffer      = 0L;
+
+  bool  fromPipe = (filename == 0L) && (isatty(fileno(stdin)) == 0);
+  bool  fromDash = (filename != 0L) && (filename[0] == '-') && (filename[1] == 0);
+
+  if (fromPipe || fromDash) {
+    _stdin    = true;
+    _filename = new char [32];
+    strcpy(_filename, "(stdin)");
+
+    if (bufferMax == 0)
+      bufferMax = 32 * 1024;
+
+  } else if (filename == 0L) {
+    fprintf(stderr, "readBuffer()-- no filename supplied, and I will not use the terminal for input.\n");
+    exit(1);
+
+  } else {
+    _filename = new char [strlen(filename) + 1];
+    strcpy(_filename, filename);
+  }
+
+  if (bufferMax == 0) {
+    _mmap   = true;
+    _buffer = (char *)mapFile(_filename, &_bufferLen, 'r');
+
+  } else {
+    errno = 0;
+
+    if (_stdin)
+      _file = fileno(stdin);
+    else
+      _file = open(_filename, O_RDONLY | O_LARGEFILE);
+
+    if (errno) {
+      fprintf(stderr, "readBuffer()-- couldn't open the file '%s': %s\n",
+              _filename, strerror(errno));
+      exit(1);
+    }
+
+    _bufferMax = bufferMax;
+    _buffer    = new char [_bufferMax];
+  }
+
+  fillBuffer();
+
+  //  Nothing available right away means the input is empty.
+  if (_bufferLen == 0)
+    _eof = true;
+}
+
+
+//  Read from an already open stdio stream, using its file descriptor
+//  directly.  Memory mapped I/O is not available here: a bufferMax of
+//  zero produces a warning and a 32KB buffer is used instead.  The
+//  descriptor is rewound to offset zero; a descriptor that cannot
+//  seek (ESPIPE) is accepted as is.
+//
+readBuffer::readBuffer(FILE *file, uint64 bufferMax) {
+
+  if (bufferMax == 0)
+    fprintf(stderr, "readBuffer()-- WARNING: mmap() not supported in readBuffer(FILE *)\n");
+
+  _filename    = new char [32];
+  strcpy(_filename, "(hidden file)");
+
+  _file        = fileno(file);
+  _filePos     = 0;
+
+  _mmap        = false;
+  _stdin       = false;
+  _eof         = false;
+
+  _bufferPos   = 0;
+  _bufferLen   = 0;
+
+  if (bufferMax == 0)
+    _bufferMax = 32 * 1024;
+  else
+    _bufferMax = bufferMax;
+
+  _buffer      = new char [_bufferMax];
+
+  //  Just be sure that we are at the start of the file.
+  errno = 0;
+  lseek(_file, 0, SEEK_SET);
+
+  if ((errno != 0) && (errno != ESPIPE)) {
+    fprintf(stderr, "readBuffer()-- '%s' couldn't seek to position 0: %s\n",
+            _filename, strerror(errno));
+    exit(1);
+  }
+
+  fillBuffer();
+
+  if (_bufferLen == 0)
+    _eof = true;
+}
+
+
+//  Release the filename, the buffer (unmapping it if it is a memory
+//  mapped file) and the file descriptor.
+//
+//  The descriptor is closed only when it is one this object reads
+//  from and it is not stdin.  In memory mapped mode no descriptor is
+//  ever stored -- _file still holds its initial value of 0 -- so
+//  calling close() there would close the process's standard input.
+//
+readBuffer::~readBuffer() {
+
+  delete [] _filename;
+
+  if (_mmap)
+    unmapFile(_buffer, _bufferLen);
+  else
+    delete [] _buffer;
+
+  if ((_stdin == false) && (_mmap == false))
+    close(_file);
+}
+
+
+//  Refill the buffer from the file once everything in it has been
+//  consumed.  Does nothing while unread bytes remain.  For a memory
+//  mapped file there is nothing more to load, so an exhausted buffer
+//  means end of file.  Sets _eof when the read returns no data; a
+//  read error other than EAGAIN is fatal.
+//
+void
+readBuffer::fillBuffer(void) {
+
+  //  Unread data is still available.
+  if (_bufferPos < _bufferLen)
+    return;
+
+  //  The whole mapped file has been consumed.
+  if (_mmap) {
+    _eof = true;
+    return;
+  }
+
+  _bufferPos = 0;
+  _bufferLen = 0;
+
+  //  Retry for as long as the read reports EAGAIN.
+  do {
+    errno      = 0;
+    _bufferLen = (uint64)::read(_file, _buffer, _bufferMax);
+  } while (errno == EAGAIN);
+
+  if (errno) {
+    fprintf(stderr, "readBuffer::fillBuffer()-- only read " uint64FMT " bytes, couldn't read " uint64FMT " bytes from '%s': %s\n",
+            _bufferLen, _bufferMax, _filename, strerror(errno));
+    exit(1);
+  }
+
+  if (_bufferLen == 0)
+    _eof = true;
+}
+
+
+//  Reposition the reader at byte 'pos' of the file.
+//
+//  stdin cannot really seek.  While _filePos is still less than
+//  _bufferLen the reader is simply reset to the start of the buffer
+//  (position 0, whatever 'pos' was given); otherwise the request is
+//  a fatal error.
+//
+//  For a memory mapped file only the positions are updated.  For a
+//  buffered file the descriptor is moved with lseek() and the buffer
+//  is reloaded.  _eof is recomputed in both cases.
+//
+//  'pos' is unsigned, so it is reported with uint64FMT.
+//
+void
+readBuffer::seek(uint64 pos) {
+
+  if (_stdin == true) {
+    if (_filePos >= _bufferLen) {
+      fprintf(stderr, "readBuffer()-- seek() not available for file 'stdin'.\n");
+      exit(1);
+    }
+
+    _filePos   = 0;
+    _bufferPos = 0;
+    return;
+  }
+
+  assert(_stdin == false);
+
+  if (_mmap) {
+    _bufferPos = pos;
+    _filePos   = pos;
+  } else {
+    errno = 0;
+    lseek(_file, pos, SEEK_SET);
+    if (errno) {
+      fprintf(stderr, "readBuffer()-- '%s' couldn't seek to position " uint64FMT ": %s\n",
+              _filename, pos, strerror(errno));
+      exit(1);
+    }
+
+    //  Discard the buffered data so fillBuffer() reloads from 'pos'.
+    _bufferLen = 0;
+    _bufferPos = 0;
+    _filePos   = pos;
+
+    fillBuffer();
+  }
+
+  _eof = (_bufferPos >= _bufferLen);
+}
+
+
+//  Copy up to 'len' bytes from the current position into 'buf' and
+//  return the number of bytes copied.  Fewer than 'len' bytes are
+//  returned only when the end of the file is reached.  A failed
+//  read() is fatal.
+//
+//  The error message used to contain a stray 'n' in front of the
+//  strerror() text; that has been removed.
+//
+uint64
+readBuffer::read(void *buf, uint64 len) {
+  char  *bufchar = (char *)buf;
+
+  //  Handle the mmap'd file first.
+
+  if (_mmap) {
+    uint64  c = 0;
+
+    while ((_bufferPos < _bufferLen) && (c < len)) {
+      bufchar[c++] = _buffer[_bufferPos++];
+      _filePos++;
+    }
+
+    if (c == 0)
+      _eof = true;
+
+    return(c);
+  }
+
+  //  Easy case; the next len bytes are already in the buffer; just
+  //  copy and move the position.
+
+  if (_bufferLen - _bufferPos > len) {
+    memcpy(bufchar, _buffer + _bufferPos, len);
+    _bufferPos += len;
+
+    fillBuffer();
+
+    _filePos += len;
+
+    return(len);
+  }
+
+  //  Existing buffer not big enough.  Copy what's there, then finish
+  //  with reads straight into the caller's buffer.
+
+  uint64  bCopied = 0;   //  Number of bytes copied out of our buffer
+  uint64  bRead   = 0;   //  Number of bytes read directly into 'buf'
+  uint64  bAct    = 0;   //  Number of bytes returned by the last read()
+
+  memcpy(bufchar, _buffer + _bufferPos, _bufferLen - _bufferPos);
+  bCopied    = _bufferLen - _bufferPos;
+  _bufferPos = _bufferLen;
+
+  while (bCopied + bRead < len) {
+    errno = 0;
+    bAct = (uint64)::read(_file, bufchar + bCopied + bRead, len - bCopied - bRead);
+    if (errno) {
+      fprintf(stderr, "readBuffer()-- couldn't read " uint64FMT " bytes from '%s': %s\n",
+              len, _filename, strerror(errno));
+      exit(1);
+    }
+
+    //  If we hit EOF, return a short read; zeroing len ends the loop.
+    if (bAct == 0)
+      len = 0;
+
+    bRead += bAct;
+  }
+
+  //  Our buffer is empty now; reload it for whoever reads next.
+  fillBuffer();
+
+  _filePos += bCopied + bRead;
+
+  return(bCopied + bRead);
+}
+
+
+//  Copy bytes into 'buf' until 'stop' has been copied, maxlen-1
+//  bytes have been copied, or the end of the file is reached.  The
+//  result is always zero terminated, and the number of bytes copied
+//  (not counting the terminator) is returned.
+//
+//  Two fixes relative to the earlier version:
+//   - a maxlen of zero leaves no room even for the terminator, so
+//     nothing is written and zero is returned (previously maxlen-1
+//     wrapped around to a huge value);
+//   - _filePos is advanced by the number of bytes consumed, as the
+//     other read() methods do, so tell() stays correct.
+//
+uint64
+readBuffer::read(void *buf, uint64 maxlen, char stop) {
+  char  *bufchar = (char *)buf;
+  uint64 c       = 0;
+
+  if (maxlen == 0)
+    return(0);
+
+  //  We will copy up to 'maxlen'-1 bytes into 'buf', or stop at the first occurrence of 'stop'.
+  //  This will reserve space at the end of any string for a zero-terminating byte.
+  maxlen--;
+
+  if (_mmap) {
+    //  Handle the mmap'd file first.
+    while ((_bufferPos < _bufferLen) &&
+           (c < maxlen)) {
+      bufchar[c++] = _buffer[_bufferPos++];
+
+      if (bufchar[c-1] == stop)
+        break;
+    }
+
+    if (_bufferPos >= _bufferLen)
+      _eof = true;
+
+  } else {
+    //  And the usual case.
+    while ((_eof == false) && (c < maxlen)) {
+      bufchar[c++] = _buffer[_bufferPos++];
+
+      if (_bufferPos >= _bufferLen)
+        fillBuffer();
+
+      if (bufchar[c-1] == stop)
+        break;
+    }
+  }
+
+  _filePos += c;
+
+  bufchar[c] = 0;
+
+  return(c);
+}
diff --git a/libutil/readBuffer.H b/libutil/readBuffer.H
new file mode 100644
index 0000000..3621588
--- /dev/null
+++ b/libutil/readBuffer.H
@@ -0,0 +1,86 @@
+#ifndef READ_BUFFER_H
+#define READ_BUFFER_H
+
+
+//  A read-only, forward reading view of a file (or of stdin), backed
+//  either by a memory mapped file or by a fixed size buffer that is
+//  refilled with read().
+//
+class readBuffer {
+public:
+ //  Note that the default bufferMax is non-zero, so memory mapped
+ //  I/O is used only when a caller explicitly passes zero.
+ readBuffer(const char *filename, uint64 bufferMax = 32 * 1024);
+ readBuffer(FILE *F, uint64 bufferMax = 32 * 1024);
+ ~readBuffer();
+
+ bool eof(void) { return(_eof); };
+
+ //  Look at the next byte without consuming it.
+ char peek(void);
+
+ //  Read one byte; read up to len bytes; read up to maxlen-1 bytes
+ //  or through the first 'stop' byte.
+ char read(void);
+ uint64 read(void *buf, uint64 len);
+ uint64 read(void *buf, uint64 maxlen, char stop);
+
+ void seek(uint64 pos);
+ uint64 tell(void) { return(_filePos); };
+
+ const char *filename(void) { return(_filename); };
+
+private:
+ void fillBuffer(void);
+ //  NOTE(review): init() is declared here but no definition appears
+ //  in readBuffer.C -- confirm whether it is still needed.
+ void init(int fileptr, const char *filename, uint64 bufferMax);
+
+ char *_filename; //  Owned copy of the name, for messages
+
+ int _file; //  File descriptor (buffered mode)
+ uint64 _filePos; //  Position reported by tell()
+
+ bool _mmap; //  True if _buffer is a memory mapped file
+ bool _stdin; //  True if reading from stdin
+
+ bool _eof;
+
+ // If bufferMax is zero, then we are using the mmapped interface, otherwise,
+ // we are using a open()/read() and a small buffer.
+
+ uint64 _bufferPos; //  Next unread byte in _buffer
+ uint64 _bufferLen; //  Number of valid bytes in _buffer
+ uint64 _bufferMax; //  Allocated size of _buffer (buffered mode)
+ char *_buffer;
+};
+
+
+//  Return the next byte in the buffer WITHOUT consuming it, or 0 at
+//  end of file.  The buffer is refilled first if it is exhausted, so
+//  peeking at the end of the file can set the _eof flag.
+//
+inline
+char
+readBuffer::peek(void) {
+
+  if (_eof == false) {
+    if (_bufferPos >= _bufferLen)
+      fillBuffer();
+  }
+
+  return((_eof) ? 0 : _buffer[_bufferPos]);
+}
+
+
+//  Consume and return the next byte in the buffer, refilling the
+//  buffer first if it is exhausted.  Returns 0 when there is no next
+//  byte (end of file).
+//
+inline
+char
+readBuffer::read(void) {
+
+  if ((_eof == false) && (_bufferPos >= _bufferLen))
+    fillBuffer();
+
+  if (_eof)
+    return(0);
+
+  char  ch = _buffer[_bufferPos];
+
+  _bufferPos++;
+  _filePos++;
+
+  return(ch);
+}
+
+
+#endif // READ_BUFFER_H
diff --git a/libutil/recordFile.C b/libutil/recordFile.C
new file mode 100644
index 0000000..2b6ec81
--- /dev/null
+++ b/libutil/recordFile.C
@@ -0,0 +1,320 @@
+#include "util++.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+// N.B. any read() / write() pair (either order) must have a seek (or
+// a fflush) in between.
+
+//  The two magic words written at the very start of every
+//  recordFile.  Stored least significant byte first, their bytes
+//  spell the ASCII text "recordFi" and "le" (zero padded).
+//
+uint64 recordFileMagic1 = 0x694664726f636572llu;
+uint64 recordFileMagic2 = 0x000000000000656cllu;
+
+//  Open the record file 'name'.
+//
+//    mode 'w' -- create or truncate the file and write a fresh
+//                preamble.
+//    mode 'a' -- append to an existing file; if the file does not
+//                exist this behaves like 'w'.
+//    mode 'r' -- open an existing file read-only.
+//
+//  Any other mode is a fatal error, as is any failure to open, read
+//  or write the file.
+//
+//  The file starts with a 32 byte preamble (two 8-byte magic words,
+//  the 8-byte record count, the 4-byte record size and the 4-byte
+//  header size) followed by headerSize bytes of user header.
+//
+//  Fixes relative to the earlier version:
+//   - the mode check used 'mode |= 'a'' where 'mode != 'a'' was
+//     meant, so mode 'a' was always rejected as invalid;
+//   - the artificial EOF _limit is 64 bits wide but was initialized
+//     from a 32-bit all-ones constant, which capped reads at 2^32-1
+//     records; it now starts as all ones in 64 bits.
+//
+//  NOTE(review): when an existing file is opened, _recordSize and
+//  _headerSize are overwritten with the values stored in the file,
+//  but _header and _bfr were sized from the constructor arguments.
+//  Confirm callers always pass sizes that match the file.
+//
+recordFile::recordFile(char const *name,
+                       uint32      headerSize,
+                       uint32      recordSize,
+                       char        mode) {
+
+  _file       = 0;
+  _name       = new char [strlen(name) + 1];
+  strcpy(_name, name);
+
+  _numRecords = 0;
+  _recordSize = recordSize;
+
+  _headerSize = headerSize;
+  _header     = new char [_headerSize];
+
+  memset(_header, 0, sizeof(char) * _headerSize);
+
+  //  Buffer about 1MB worth of records, but never fewer than 16.
+  _bfrmax     = MAX(1048576 / _recordSize, 16);
+  _bfr        = new char [_bfrmax * _recordSize];
+
+  _limit      = ~uint64ZERO;
+
+  _pos        = uint64ZERO;
+  _rec        = 0;
+
+  memset(_bfr, 0, sizeof(char) * _bfrmax * _recordSize);
+
+  _bfrDirty   = false;
+  _isReadOnly = true;
+
+  if ((mode != 'r') && (mode != 'w') && (mode != 'a')) {
+    fprintf(stderr, "recordFile::recordFile()-- Invalid mode '%c'.\n", mode);
+    exit(1);
+  }
+
+  //  If the file doesn't exist, or we're opening for write, we're
+  //  basically done.  Do that first.
+  //    Write the magic.
+  //    Write the metadata.
+  //    Write the header.
+
+  if (((mode == 'w')) ||
+      ((mode == 'a') && (fileExists(_name) == false))) {
+    errno = 0;
+    _file = open(_name,
+                 O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE,
+                 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- failed to open '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+    _isReadOnly = false;
+
+    write(_file, &recordFileMagic1, sizeof(uint64));
+    write(_file, &recordFileMagic2, sizeof(uint64));
+    write(_file, &_numRecords,      sizeof(uint64));
+    write(_file, &_recordSize,      sizeof(uint32));
+    write(_file, &_headerSize,      sizeof(uint32));
+    write(_file,  _header,          sizeof(char) * _headerSize);
+
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- failed to write header to '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+
+    return;
+  }
+
+  //  File does exist.  If we're not appending, open it read-only.
+  //  Otherwise, open read-write.
+
+  if (mode == 'r') {
+    errno = 0;
+    _file = open(_name,
+                 O_RDONLY | O_LARGEFILE,
+                 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- failed to open '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+    _isReadOnly = true;
+  } else {
+    errno = 0;
+    _file = open(_name,
+                 O_RDWR | O_LARGEFILE,
+                 S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- failed to open for write '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+    _isReadOnly = false;
+  }
+
+  //  Read the magic, metadata and header.
+
+  {
+    uint64  m1, m2;
+
+    errno = 0;
+
+    read(_file, &m1,          sizeof(uint64));
+    read(_file, &m2,          sizeof(uint64));
+    read(_file, &_numRecords, sizeof(uint64));
+    read(_file, &_recordSize, sizeof(uint32));
+    read(_file, &_headerSize, sizeof(uint32));
+    read(_file,  _header,     sizeof(char) * _headerSize);
+
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- failed to read header from '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+
+    if ((m1 != recordFileMagic1) || (m2 != recordFileMagic2))
+      fprintf(stderr, "recordFile::recordFile()-- magic number disagreement; '%s' not a recordFile?\n",
+              _name), exit(1);
+  }
+
+  if (mode == 'a') {
+    //  Appending: the (empty) buffer starts right after the last
+    //  existing record.
+    _pos = _numRecords;
+    _rec = 0;
+
+    errno = 0;
+    lseek(_file, 0, SEEK_END);
+    if (errno)
+      fprintf(stderr, "recordFile::recordFile()-- seek to end of '%s' failed: %s\n", _name, strerror(errno)), exit(1);
+  } else {
+    //  Reading: load the first block of records.
+    seek(0, true);
+  }
+}
+
+
+//  Flush any buffered records, rewrite the preamble and header (so
+//  the stored record count is current) if the file is writable,
+//  close the file and release all memory.  Any failure is fatal.
+//
+//  errno is cleared immediately before close().  Previously, for a
+//  read-only file, nothing in this function reset errno, so a stale
+//  value left by an unrelated earlier call was reported as a
+//  failure to close.
+//
+recordFile::~recordFile() {
+  flushDirty();
+
+  if (_isReadOnly == false) {
+    errno = 0;
+    lseek(_file, 0, SEEK_SET);
+    if (errno)
+      fprintf(stderr, "recordFile::~recordFile()-- seek to start of '%s' failed: %s\n", _name, strerror(errno)), exit(1);
+
+    write(_file, &recordFileMagic1, sizeof(uint64));
+    write(_file, &recordFileMagic2, sizeof(uint64));
+    write(_file, &_numRecords,      sizeof(uint64));
+    write(_file, &_recordSize,      sizeof(uint32));
+    write(_file, &_headerSize,      sizeof(uint32));
+    write(_file,  _header,          sizeof(char) * _headerSize);
+
+    if (errno)
+      fprintf(stderr, "recordFile::~recordFile()-- failed to write header to '%s': %s\n",
+              _name, strerror(errno)), exit(1);
+  }
+
+  errno = 0;
+  close(_file);
+
+  if (errno)
+    fprintf(stderr, "recordFile::~recordFile()-- failed to close '%s': %s\n",
+            _name, strerror(errno)), exit(1);
+
+  delete [] _bfr;
+  delete [] _name;
+  delete [] _header;
+}
+
+
+
+//  Write the buffered records back to the file if the buffer has
+//  been modified; otherwise do nothing.  A dirty buffer on a
+//  read-only file, or a failed seek or write, is fatal.
+//
+void
+recordFile::flushDirty(void) {
+
+  if (_bfrDirty) {
+    if (_isReadOnly) {
+      fprintf(stderr, "recordFile::recordFile()-- '%s' is readonly, but is dirty!\n", _name);
+      exit(1);
+    }
+
+    //  The buffer holds the records that start at record _pos.
+    errno = 0;
+    lseek(_file, 32 + _headerSize + _pos * _recordSize, SEEK_SET);
+    if (errno) {
+      fprintf(stderr, "recordFile::seek()-- '%s' failed: %s\n", _name, strerror(errno));
+      exit(1);
+    }
+
+    //  Only the first _rec records in the buffer are written; the
+    //  record at index _rec itself is not included.
+    errno = 0;
+    write(_file, _bfr, _recordSize * _rec);
+    if (errno) {
+      fprintf(stderr, "recordFile::write()-- '%s' failed: %s\n", _name, strerror(errno));
+      exit(1);
+    }
+
+    _bfrDirty = false;
+  }
+}
+
+
+
+//  Position the file at record 'rec'.
+//
+//  Unless 'forced' is set, a target that lies inside the block
+//  currently held in the buffer only moves the position within the
+//  buffer.  Otherwise any modified records are flushed and a new
+//  block, starting at 'rec', is read from the file.
+//
+//  The byte offset of record r is 32 + _headerSize + r * _recordSize;
+//  the 32 covers the preamble written by the constructor (two 8-byte
+//  magic words, 8-byte record count, 4-byte record size, 4-byte
+//  header size).  The diagnostics now report that same offset; they
+//  used to leave out the 32.
+//
+void
+recordFile::seek(uint64 rec, bool forced) {
+
+  //  If we are seeking to somewhere in the current block, don't do a
+  //  real seek, just move our position within the block.
+  //
+  if ((forced == false) && (_pos <= rec) && (rec < _pos + _bfrmax)) {
+    _rec = rec - _pos;
+    return;
+  }
+
+  flushDirty();
+
+  _pos = rec;  //  Root of buffer is now here
+  _rec = 0;    //  See?
+
+  uint64  offset = 32 + _headerSize + _pos * _recordSize;
+
+  errno = 0;
+  lseek(_file, offset, SEEK_SET);
+  if (errno)
+    fprintf(stderr, "recordFile::seek() '%s' seek to record=" uint64FMT " at fileposition=" uint64FMT " failed: %s\n",
+            _name, _pos, offset, strerror(errno)), exit(1);
+
+  errno = 0;
+  read(_file, _bfr, _recordSize * _bfrmax);
+  if (errno)
+    fprintf(stderr, "recordFile::seek() '%s' read of " uint64FMT " bytes failed at record " uint64FMT ", fileposition " uint64FMT "': %s\n",
+            _name, _recordSize * _bfrmax, _pos, offset, strerror(errno)), exit(1);
+}
+
+
+
+//  Copy up to 'num' records, starting at the current position, into
+//  'record' and advance the position.  Returns the number of records
+//  actually copied, which is smaller than 'num' when the end of the
+//  file or the artificial limit is reached.
+//
+uint32
+recordFile::getRecord(void *record, uint32 num) {
+ //  A single pass handles at most half a buffer's worth of records.
+ uint32 maxnum = _bfrmax / 2;
+
+ // Reading large blocks -- bigger than the in-core size? Loop and
+ // recurse.
+ //
+ if (num > maxnum) {
+ uint32 numread = 0;
+ uint32 pos = 0;
+ uint32 len = 0;
+
+ while (num > 0) {
+ len = MIN(maxnum, num);
+ len = getRecord((char *)record + pos * _recordSize, len);
+
+ //  Nothing more could be read; return what we have so far.
+ if (len == 0)
+ return(numread);
+
+ num -= len;
+ pos += len;
+ numread += len;
+ }
+
+ return(numread);
+ }
+
+ // If asked to read too many records, read whatever is left.
+ //
+ //  (_pos + _rec is the absolute index of the current record.  If
+ //  it is already beyond _numRecords or _limit the subtraction
+ //  wraps, which is why the range check below must follow.)
+ //
+ if (_numRecords < _pos + _rec + num)
+ num = _numRecords - _pos - _rec;
+ if (_limit < _pos + _rec + num)
+ num = _limit - _pos - _rec;
+
+ // If the current position is already past eof, return without
+ // reading. The previous 'if' ensures we will never read a block
+ // past eof.
+ //
+ if ((_numRecords < _pos + _rec) || (_limit < _pos + _rec))
+ return(0);
+
+ //  Not enough room left in the buffered block for this request:
+ //  reload the buffer so that it starts at the current record.
+ if (_bfrmax < _rec + num + 1)
+ seek(_pos + _rec, true);
+
+ memcpy(record, _bfr + _rec * _recordSize, _recordSize * num);
+
+ _rec += num;
+
+ return(num);
+}
+
+
+
+//  Store 'num' records from 'record' at the current position and
+//  advance past them.  The record count of the file grows by 'num'
+//  and the buffer is marked as modified.  Requests larger than half
+//  the buffer are stored in pieces of at most that size.
+//
+void
+recordFile::putRecord(void *record, uint32 num) {
+  uint32  maxnum = _bfrmax / 2;
+
+  //  Small request: it goes into the buffer in one piece.
+  if (num <= maxnum) {
+    //  Make room by moving the buffer up to the current record.
+    if (_bfrmax < _rec + num + 1)
+      seek(_pos + _rec, true);
+
+    memcpy(_bfr + _rec * _recordSize, record, _recordSize * num);
+
+    _rec        += num;
+    _numRecords += num;
+
+    _bfrDirty    = true;
+
+    return;
+  }
+
+  //  Large request: hand it over piece by piece.
+  uint32  done  = 0;
+  uint32  piece = 0;
+
+  while (num > 0) {
+    piece = MIN(maxnum, num);
+
+    putRecord((char *)record + done * _recordSize, piece);
+
+    num  -= piece;
+    done += piece;
+  }
+}
diff --git a/libutil/recordFile.H b/libutil/recordFile.H
new file mode 100644
index 0000000..72c6aee
--- /dev/null
+++ b/libutil/recordFile.H
@@ -0,0 +1,65 @@
+#ifndef RECORDFILE_H
+#define RECORDFILE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "util.h"
+
+// A file of fixed size records, with an optional header at the
+// start. Derived from the bitPackedFile at SVN-1533, but heavily
+// modified. Records can only be added, not updated (probably
+// trivial to fix). Records must be dense (also probably trivial to
+// fix).
+
+class recordFile {
+public:
+ //  mode is 'r' (read), 'w' (create/truncate) or 'a' (append).
+ recordFile(char const *name,
+ uint32 headerSize,
+ uint32 recordSize,
+ char mode);
+ ~recordFile();
+
+ //  The in-core copy of the user header; it is written back to the
+ //  file by the destructor when the file is writable.
+ void *header(void) { return(_header); };
+
+ uint64 numRecords(void) { return(_numRecords); };
+
+ // Read/write records.
+ uint32 getRecord(void *record, uint32 num=1);
+ void putRecord(void *record, uint32 num=1);
+
+ // Seek to record rec, optionally repositioning the buffer to that
+ // record.
+ void seek(uint64 rec, bool forced=false);
+
+ // Set an artificial EOF at record rec.
+ void limit(uint64 rec) { _limit = rec; };
+
+private:
+ //  Write the buffer back to the file if it has been modified.
+ void flushDirty(void);
+
+ int _file; // File descriptor
+ char *_name; // Owned copy of the file name, for messages
+
+ uint64 _numRecords; // Number of records in the file
+ uint32 _recordSize; // Size of one record, in bytes
+
+ uint32 _headerSize; // Size of the user header, in bytes
+ char *_header; // In-core copy of the user header
+
+ uint64 _bfrmax; // Number of records in the buffer
+ char *_bfr; // A chunk of the file's records, in core
+
+ uint64 _limit; // An artificial EOF
+
+ uint64 _pos; // The location this chunk is from (in records)
+ uint64 _rec; // The record we're modifying relative to _pos
+
+ bool _bfrDirty; // Buffer holds records not yet written to the file
+ bool _isReadOnly; // File was opened with mode 'r'
+};
+
+#endif // RECORDFILE_H
+
+
+
diff --git a/libutil/speedCounter.C b/libutil/speedCounter.C
new file mode 100644
index 0000000..b9e4568
--- /dev/null
+++ b/libutil/speedCounter.C
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util++.H"
+
+const char*
+speedCounter::_spinr[4] = { "[|]", "[/]", "[-]", "[\\]" };
+
+const char*
+speedCounter::_liner[19] = { "[- ]",
+ "[-- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ --]",
+ "[ -]",
+ "[ --]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]",
+ "[ -- ]" };
+
+
+//  Set up a progress counter.  'fmt' is the format used for each
+//  report, 'unit' divides the raw count before it is reported, 'freq'
+//  controls how often tick() reports, and 'enabled' switches all
+//  output on or off.  The clock starts now.
+//
+speedCounter::speedCounter(char const   *fmt,
+                           double        unit,
+                           uint64        freq,
+                           bool          enabled) {
+
+  //  Turn freq into a mask by setting every bit below its highest
+  //  set bit -- this allows a very cheap test in tick().
+  //
+  uint64  mask = freq;
+
+  for (uint32 shift = 1; shift <= 32; shift *= 2)
+    mask |= mask >> shift;
+
+  //  _draws counts the reports made so far; it is kept separately
+  //  rather than derived by shifting _count, simply because that is
+  //  easier and a second variable would be needed either way.
+
+  _count     = 0;
+  _draws     = 0;
+  _unit      = unit;
+  _freq      = mask;
+  _startTime = getTime();
+  _fmt       = fmt;
+  _spin      = false;
+  _line      = false;
+  _enabled   = enabled;
+}
+
+//  Emit the final report (if one is due) by calling finish().
+//
+speedCounter::~speedCounter() {
+ finish();
+}
diff --git a/libutil/speedCounter.H b/libutil/speedCounter.H
new file mode 100644
index 0000000..b7e5ae9
--- /dev/null
+++ b/libutil/speedCounter.H
@@ -0,0 +1,77 @@
+#ifndef SPEEDCOUNTER_H
+#define SPEEDCOUNTER_H
+
+#include <stdio.h>
+
+//  A progress meter that writes a count and a rate to stderr.
+//
+class speedCounter {
+public:
+ // fmt specifies the status format. An example:
+ // " %8f [unit]things (%8.5f [unit]things/sec)\r"
+ //
+ //  fmt is passed to fprintf() with two double arguments: the
+ //  count divided by unit, and that value divided by the elapsed
+ //  time.
+ //
+ speedCounter(char const *fmt,
+ double unit,
+ uint64 freq,
+ bool enabled=true);
+ ~speedCounter();
+
+ //  Prefix each report with an animated spinner or moving bar.
+ void enableSpinner(void) { _spin = true; };
+ void enableLiner(void) { _line = true; };
+
+ //  Count one item.  A report is written, and true returned, when
+ //  the new count has no bits in common with the _freq mask.
+ bool tick(void) {
+ if (_enabled && ((++_count & _freq) == uint64ZERO)) {
+ double v = _count / _unit;
+ if (_spin) fputs(_spinr[_draws % 4], stderr);
+ if (_line) fputs(_liner[_draws % 19], stderr);
+ _draws++;
+ fprintf(stderr, _fmt, v, v / (getTime() - _startTime));
+ fflush(stderr);
+ return(true);
+ }
+ return(false);
+ };
+
+ //  Count 'increment' items at once.  The same mask test is applied
+ //  to the count after the addition only.
+ bool tick(uint64 increment) {
+ if (_enabled == false)
+ return(false);
+
+ _count += increment;
+ if ((_count & _freq) == uint64ZERO) {
+ double v = _count / _unit;
+ if (_spin) fputs(_spinr[_draws % 4], stderr);
+ if (_line) fputs(_liner[_draws % 19], stderr);
+ _draws++;
+ fprintf(stderr, _fmt, v, v / (getTime() - _startTime));
+ fflush(stderr);
+ return(true);
+ }
+ return(false);
+ };
+
+ //  Write a last report followed by a newline, provided the counter
+ //  is enabled and the count has reached _freq, then reset the
+ //  count to zero.
+ void finish(void) {
+ if (_enabled && (_count >= _freq)) {
+ double v = _count / _unit;
+ if (_spin) fputs(_spinr[_draws % 4], stderr);
+ if (_line) fputs(_liner[_draws % 19], stderr);
+ fprintf(stderr, _fmt, v, v / (getTime() - _startTime));
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ }
+ _count = 0;
+ };
+
+private:
+ static const char *_spinr[4]; //  Spinner animation frames
+ static const char *_liner[19]; //  Moving bar animation frames
+ uint64 _count; //  Items counted so far
+ uint64 _draws; //  Reports written so far; selects the frame
+ double _unit; //  Divisor applied to _count for display
+ uint64 _freq; //  Bit mask derived from the requested frequency
+ double _startTime; //  Value of getTime() at construction
+ char const *_fmt; //  Report format; not copied
+ bool _spin;
+ bool _line;
+ bool _enabled;
+};
+
+
+#endif // SPEEDCOUNTER_H
diff --git a/libutil/splitToWords.H b/libutil/splitToWords.H
new file mode 100644
index 0000000..32ebc49
--- /dev/null
+++ b/libutil/splitToWords.H
@@ -0,0 +1,117 @@
+#ifndef SPLITTOWORDS_H
+#define SPLITTOWORDS_H
+
+
+class splitToWords {
+public:
+ splitToWords() {
+ _argWords = 0;
+ _maxWords = 0;
+ _arg = 0L;
+ _maxChars = 0;
+ _cmd = 0L;
+ };
+ splitToWords(char *cmd) {
+ _argWords = 0;
+ _maxWords = 0;
+ _arg = 0L;
+ _maxChars = 0;
+ _cmd = 0L;
+
+ split(cmd);
+ };
+ ~splitToWords() {
+ delete [] _cmd;
+ delete [] _arg;
+ };
+ // Split cmd into words separated by spaces or tabs. cmd itself is only read; words point into
+ // a private copy that is reused (and possibly reallocated) by the next split(), invalidating them.
+ void split(char *cmd) {
+
+ // Step Zero:
+ //
+ // Count the length of the string, in words and in characters.
+ // For simplicity, we overcount words, by just counting white-space.
+ //
+ // Then, allocate space for a temporary copy of the string, and a
+ // set of pointers into the temporary copy (much like argv).
+ //
+ uint32 cmdChars = 1; // 1 == Space for terminating 0
+ uint32 cmdWords = 2; // 2 == Space for first word and terminating 0L
+
+ for (char *tmp=cmd; *tmp; tmp++) {
+ cmdWords += *tmp == ' ';
+ cmdWords += *tmp == '\t';
+ cmdChars++;
+ }
+
+ if (cmdChars > _maxChars) {
+ delete [] _cmd;
+ _cmd = new char [cmdChars];
+ _maxChars = cmdChars;
+ }
+ if (cmdWords > _maxWords) {
+ delete [] _arg;
+ _arg = new char * [cmdWords];
+ _maxWords = cmdWords;
+ }
+
+ _argWords = 0;
+
+ // Step One:
+ //
+ // Determine where the words are in the command string, copying the
+ // string to _cmd and storing words in _arg.
+ //
+ bool isFirst = true;
+ char *cmdI = cmd;
+ char *cmdO = _cmd;
+
+ while (*cmdI) {
+
+ // If we are at a non-space character, we are in a word. If
+ // this is the first character in the word, save the word in
+ // the args list.
+ //
+ // Otherwise we are at a space and thus not in a word. Make
+ // all spaces be string terminators, and declare that we are
+ // at the start of a word.
+ //
+ if ((*cmdI != ' ') && (*cmdI != '\t')) {
+ *cmdO = *cmdI;
+
+ if (isFirst) {
+ _arg[_argWords++] = cmdO;
+ isFirst = false;
+ }
+ } else {
+ *cmdO = 0;
+ isFirst = true;
+ }
+
+ cmdI++;
+ cmdO++;
+ }
+
+ // Finish off the list by terminating the last arg, and
+ // terminating the list of args.
+ //
+ *cmdO = 0;
+ _arg[_argWords] = 0L;
+ };
+
+ // Accessors do no bounds checking and need a prior split(); index numWords() holds the null terminator.
+ uint32 numWords(void) { return(_argWords); };
+ char *getWord(uint32 i) { return(_arg[i]); };
+ char *operator[](uint32 i) { return(_arg[i]); };
+ int64 operator()(uint32 i) { return(strtoull(_arg[i], NULL, 10)); }; // word i parsed as a base-10 integer
+private:
+ uint32 _argWords; // words found by the last split()
+ uint32 _maxWords; // allocated length of _arg
+ char **_arg; // null-terminated list of pointers into _cmd
+ uint32 _maxChars; // allocated length of _cmd
+ char *_cmd; // private copy of the input, separators replaced by 0
+};
+
+
+#endif // SPLITTOWORDS_H
diff --git a/libutil/sweatShop.C b/libutil/sweatShop.C
new file mode 100644
index 0000000..855744a
--- /dev/null
+++ b/libutil/sweatShop.C
@@ -0,0 +1,587 @@
+#include "sweatShop.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <sched.h> // pthread scheduling stuff
+
+
+
+// Per-worker record: owning shop, user thread data, thread handle, work counter and batch buffer.
+class sweatShopWorker {
+public:
+ sweatShopWorker() :
+ shop(0L),
+ threadUserData(0L),
+ numComputed(0),
+ workerQueue(0L),
+ workerQueueLen(0) {
+ };
+ sweatShop *shop;
+ void *threadUserData;
+ pthread_t threadID; // left unset until the thread is created
+ uint32 numComputed;
+ sweatShopState **workerQueue;
+ uint32 workerQueueLen;
+};
+
+
+// This gets created by the loader, passed to the worker, and printed
+// by the writer. userData is controlled by the user.
+//
+class sweatShopState {
+public:
+ // Wraps one user object as a list node; it starts out uncomputed and unlinked.
+ sweatShopState(void *userData) :
+ _user(userData),
+ _computed(false),
+ _next(0L) {
+ };
+ ~sweatShopState() {}; // the user object is not freed here
+
+ void *_user;
+ bool _computed;
+ sweatShopState *_next;
+};
+
+
+
+
+// Simply forwards control to the class
+void*
+_sweatshop_loaderThread(void *ss_) {
+ // pthread entry point: ss_ is the sweatShop whose loader() this thread runs
+ return(static_cast<sweatShop *>(ss_)->loader());
+}
+
+void*
+_sweatshop_workerThread(void *sw_) {
+ sweatShopWorker *self = static_cast<sweatShopWorker *>(sw_); // this thread's own record
+ return(self->shop->worker(self));
+}
+
+void*
+_sweatshop_writerThread(void *ss_) {
+ // pthread entry point: ss_ is the sweatShop whose writer() this thread runs
+ return(static_cast<sweatShop *>(ss_)->writer());
+}
+
+void*
+_sweatshop_statusThread(void *ss_) {
+ // pthread entry point: ss_ is the sweatShop whose status() this thread runs
+ return(static_cast<sweatShop *>(ss_)->status());
+}
+
+
+
+sweatShop::sweatShop(void*(*loaderfcn)(void *G),
+ void (*workerfcn)(void *G, void *T, void *S),
+ void (*writerfcn)(void *G, void *S)) {
+
+ // User callbacks, and the global pointer that run() fills in later.
+ _userLoader = loaderfcn;
+ _userWorker = workerfcn;
+ _userWriter = writerfcn;
+ _globalUserData = 0L;
+
+ // The state list starts out empty.
+ _loaderP = _workerP = _writerP = 0L;
+
+ // Nothing loaded, computed or written yet.
+ _numberLoaded = _numberComputed = _numberOutput = 0;
+
+ // Default workforce; the worker records are allocated on demand.
+ _numberOfWorkers = 2;
+ _workerData = 0L;
+
+ // Default queue and batch sizes.
+ _loaderQueueSize = 1024;
+ _loaderQueueMin = 4; // _numberOfWorkers * 2, reset when that changes
+ _loaderQueueMax = 10240;
+ _loaderBatchSize = 1;
+ _workerBatchSize = 1;
+ _writerQueueSize = 4096;
+ _writerQueueMax = 10240;
+
+ // Status output stays off until run() is asked to be verbose.
+ _showStatus = false;
+}
+
+
+sweatShop::~sweatShop() {
+ delete [] _workerData; // releases the worker records; safe when none were ever allocated (null)
+}
+
+
+
+void
+sweatShop::setThreadData(uint32 t, void *x) {
+ if (!_workerData) // records are created lazily, sized by the current _numberOfWorkers
+ _workerData = new sweatShopWorker [_numberOfWorkers];
+ if (_numberOfWorkers <= t) {
+ fprintf(stderr, "sweatShop::setThreadData()-- worker ID "uint32FMT" more than number of workers="uint32FMT"\n", t, _numberOfWorkers);
+ exit(1);
+ }
+ _workerData[t].threadUserData = x;
+}
+
+
+
+// Build a list of states to add in one swoop
+//
+void
+sweatShop::loaderSave(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState) {
+ // 'tail' is the oldest pending state and 'head' the newest; new states link on after 'head'.
+ thisState->_next = 0L;
+
+ if (tail == 0L)
+ tail = thisState; // first state of this batch
+ else
+ head->_next = thisState;
+
+ head = thisState;
+ _numberLoaded += 1;
+}
+
+
+// Add a bunch of new states to the queue.
+//
+void
+sweatShop::loaderAppend(sweatShopState *&tail, sweatShopState *&head) {
+
+ // Nothing pending, nothing to do (and no need to take the lock).
+ if (!tail || !head)
+ return;
+
+ int rc = pthread_mutex_lock(&_stateMutex);
+ if (rc)
+ fprintf(stderr, "sweatShop::loaderAppend()-- Failed to lock mutex (%d). Fail.\n", rc), exit(1);
+
+ // Splice the batch after the newest queued state; on the very first append, aim the writer and worker at it.
+ if (_loaderP) {
+ _loaderP->_next = tail;
+ } else {
+ _writerP = tail;
+ _workerP = tail;
+ }
+ _loaderP = head;
+
+ rc = pthread_mutex_unlock(&_stateMutex);
+ if (rc)
+ fprintf(stderr, "sweatShop::loaderAppend()-- Failed to unlock mutex (%d). Fail.\n", rc), exit(1);
+
+ tail = 0L;
+ head = 0L;
+}
+
+
+
+void*
+sweatShop::loader(void) {
+ // Loader thread: wraps each object the user loader returns in a state and queues it, until it returns null.
+ struct timespec naptime;
+ naptime.tv_sec = 0;
+ naptime.tv_nsec = 166666666ULL; // 1/6 second
+
+ // We can batch several loads together before we push them onto the
+ // queue, this should reduce the number of times the loader needs to
+ // lock the queue.
+ //
+ // But it also increases the latency, so it's disabled by default.
+ //
+ sweatShopState *tail = 0L; // The first thing loaded
+ sweatShopState *head = 0L; // The last thing loaded
+ uint32 numLoaded = 0; // States saved since the last append
+
+ bool moreToLoad = true;
+
+ while (moreToLoad) {
+
+ // Zzzzzzz.... Flush any partial batch first: workers cannot compute states they cannot see.
+ while (_numberLoaded > _numberComputed + _loaderQueueSize) {
+ loaderAppend(tail, head), numLoaded = 0;
+ nanosleep(&naptime, 0L);
+ }
+ sweatShopState *thisState = new sweatShopState((*_userLoader)(_globalUserData));
+
+ // If we actually loaded a new state, add it. The batch count restarts after every
+ // append; without that only the first batch would ever be batched.
+ if (thisState->_user) {
+ loaderSave(tail, head, thisState);
+ if (++numLoaded >= _loaderBatchSize)
+ loaderAppend(tail, head), numLoaded = 0;
+ } else {
+ // Didn't read, must be all done! Push on the end-of-input marker state.
+ //
+ loaderSave(tail, head, new sweatShopState(0L));
+ loaderAppend(tail, head);
+
+ moreToLoad = false;
+ delete thisState;
+ }
+ }
+
+ //fprintf(stderr, "sweatShop::reader exits.\n");
+ return(0L);
+}
+
+
+
+void*
+sweatShop::worker(sweatShopWorker *workerData) {
+ // Worker thread: claims up to _workerBatchSize states under the mutex, then runs the user worker on each.
+ struct timespec naptime;
+ naptime.tv_sec = 0;
+ naptime.tv_nsec = 50000000ULL;
+
+ bool moreToCompute = true;
+ int err;
+
+ while (moreToCompute) {
+
+ // Usually because some worker is taking a long time, and the
+ // output queue isn't big enough.
+ //
+ while (_numberOutput + _writerQueueSize < _numberComputed)
+ nanosleep(&naptime, 0L);
+
+ // Grab the next state. We don't grab it if it's the last in the
+ // queue (else we would fall off the end) UNLESS it really is the
+ // last one.
+ //
+ err = pthread_mutex_lock(&_stateMutex);
+ if (err != 0)
+ fprintf(stderr, "sweatShop::worker()-- Failed to lock mutex (%d). Fail.\n", err), exit(1);
+
+ for (workerData->workerQueueLen = 0; ((workerData->workerQueueLen < _workerBatchSize) &&
+ (_workerP) &&
+ ((_workerP->_next != 0L) || (_workerP->_user == 0L))); workerData->workerQueueLen++) {
+ workerData->workerQueue[workerData->workerQueueLen] = _workerP;
+ _workerP = _workerP->_next;
+ }
+
+ if (_workerP == 0L)
+ moreToCompute = false;
+
+ err = pthread_mutex_unlock(&_stateMutex);
+ if (err != 0)
+ fprintf(stderr, "sweatShop::worker()-- Failed to unlock mutex (%d). Fail.\n", err), exit(1);
+
+
+ if (workerData->workerQueueLen == 0) {
+ // No work, sleep a bit to prevent thrashing the mutex and resume.
+ nanosleep(&naptime, 0L);
+ continue;
+ }
+
+ // Execute
+ //
+ for (uint32 x=0; x<workerData->workerQueueLen; x++) {
+ sweatShopState *ts = workerData->workerQueue[x];
+
+ if (ts && ts->_user) {
+ (*_userWorker)(_globalUserData, workerData->threadUserData, ts->_user);
+ ts->_computed = true;
+ workerData->numComputed++;
+ } else {
+ // When we really do run out of stuff to do, we'll end up here
+ // (only one thread will end up in the other case, with
+ // something to do and moreToCompute=false). If it's actually
+ // the end, skip the sleep and just get outta here.
+ //
+ if (moreToCompute == true) {
+ fprintf(stderr, "WARNING! Worker is sleeping because the reader is slow!\n");
+ nanosleep(&naptime, 0L);
+ }
+ }
+ }
+ }
+
+ //fprintf(stderr, "sweatShop::worker exits.\n");
+ return(0L);
+}
+
+
+void*
+sweatShop::writer(void) {
+
+ // Both waits below use the same 5ms nap.
+ struct timespec naptime;
+ naptime.tv_sec = 0;
+ naptime.tv_nsec = 5000000ULL;
+
+ // Walk the list from the oldest state until a state with no user object is reached.
+ //
+ while (_writerP && _writerP->_user) {
+
+ // Not computed yet: wait for a slow computation.
+ //
+ if (_writerP->_computed == false) {
+ nanosleep(&naptime, 0L);
+ continue;
+ }
+
+ // Computed, but nothing is linked behind it yet: wait for the input.
+ //
+ if (_writerP->_next == 0L) {
+ nanosleep(&naptime, 0L);
+ continue;
+ }
+
+ // Output it, step to the next state, and release this one (the state only, not its user object).
+ //
+ sweatShopState *done = _writerP;
+
+ (*_userWriter)(_globalUserData, done->_user);
+ _numberOutput++;
+
+ _writerP = done->_next;
+ delete done;
+ }
+ _writerP = 0L; // Tell status to stop.
+
+ return(0L);
+}
+
+
+// This thread not only shows a status message, but it also updates the critical shared variable
+// _numberComputed. Worker threads use this to throttle themselves. Thus, even if _showStatus is
+// not set, and this thread doesn't _appear_ to be doing anything useful....it is.
+//
+void*
+sweatShop::status(void) {
+ // Polls four times a second for as long as _writerP is non-null (the writer clears it on exit).
+ struct timespec naptime;
+ naptime.tv_sec = 0;
+ naptime.tv_nsec = 250000000ULL;
+
+ double startTime = getTime() - 0.001; // presumably offset so the elapsed time below is never zero
+ double thisTime = 0;
+
+ uint64 deltaOut = 0;
+ uint64 deltaCPU = 0;
+
+ double cpuPerSec = 0;
+
+ uint64 readjustAt = 16384;
+
+ while (_writerP) {
+ uint32 nc = 0; // NOTE(review): 32-bit accumulator feeding the 64-bit _numberComputed
+ for (uint32 i=0; i<_numberOfWorkers; i++)
+ nc += _workerData[i].numComputed;
+ _numberComputed = nc;
+
+ deltaOut = deltaCPU = 0;
+
+ thisTime = getTime();
+
+ if (_numberComputed > _numberOutput)
+ deltaOut = _numberComputed - _numberOutput;
+ if (_numberLoaded > _numberComputed)
+ deltaCPU = _numberLoaded - _numberComputed;
+
+ cpuPerSec = _numberComputed / (thisTime - startTime);
+
+ if (_showStatus) {
+ fprintf(stderr, " %6.1f/s - "uint64FMTW(8)" loaded; "uint64FMTW(8)" queued for compute; "uint64FMTW(8)" finished; "uint64FMTW(8)" written; "uint64FMTW(8)" queued for output)\r",
+ cpuPerSec, _numberLoaded, deltaCPU, _numberComputed, _numberOutput, deltaOut);
+ fflush(stderr);
+ }
+
+ // Readjust queue sizes based on current performance, but don't let it get too big or small.
+ // In particular, don't let it get below 2*numberOfWorkers.
+ //
+ if (_numberComputed > readjustAt) {
+ readjustAt += (uint64)(2 * cpuPerSec);
+ _loaderQueueSize = (uint32)(5 * cpuPerSec);
+ }
+
+ if (_loaderQueueSize < _loaderQueueMin)
+ _loaderQueueSize = _loaderQueueMin;
+
+ if (_loaderQueueSize < 2 * _numberOfWorkers)
+ _loaderQueueSize = 2 * _numberOfWorkers;
+
+ if (_loaderQueueSize > _loaderQueueMax)
+ _loaderQueueSize = _loaderQueueMax;
+
+ nanosleep(&naptime, 0L);
+ }
+ // Final summary line; it reuses the _numberComputed total from the last pass of the loop above.
+ if (_showStatus) {
+ thisTime = getTime();
+
+ if (_numberComputed > _numberOutput)
+ deltaOut = _numberComputed - _numberOutput;
+ if (_numberLoaded > _numberComputed)
+ deltaCPU = _numberLoaded - _numberComputed;
+
+ cpuPerSec = _numberComputed / (thisTime - startTime);
+
+ fprintf(stderr, " %6.1f/s - "uint64FMTW(8)" queued for compute; "uint64FMTW(8)" finished; "uint64FMTW(8)" queued for output)\n",
+ cpuPerSec, deltaCPU, _numberComputed, deltaOut);
+ }
+
+ //fprintf(stderr, "sweatShop::status exits.\n");
+ return(0L);
+}
+
+
+
+
+
+void
+sweatShop::run(void *user, bool beVerbose) {
+ pthread_attr_t threadAttr;
+ pthread_t threadIDloader;
+ pthread_t threadIDwriter;
+ pthread_t threadIDstats;
+#if 0
+ int threadSchedPolicy = 0;
+ struct sched_param threadSchedParamDef;
+ struct sched_param threadSchedParamMax;
+#endif
+ int err = 0;
+ // Runs the pipeline to completion: starts loader, status, writer and worker threads, then joins them all.
+ _globalUserData = user;
+ _showStatus = beVerbose;
+
+ // Configure everything ahead of time.
+ if (_workerBatchSize < 1)
+ _workerBatchSize = 1;
+
+ if (_workerData == 0L)
+ _workerData = new sweatShopWorker [_numberOfWorkers];
+
+ for (uint32 i=0; i<_numberOfWorkers; i++) {
+ _workerData[i].shop = this;
+ _workerData[i].workerQueue = new sweatShopState * [_workerBatchSize];
+ }
+
+ // Open the doors.
+ errno = 0;
+ err = pthread_mutex_init(&_stateMutex, NULL);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (state mutex): %s.\n", strerror(err)), exit(1);
+
+ err = pthread_attr_init(&threadAttr);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (attr init): %s.\n", strerror(err)), exit(1);
+
+ err = pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (set scope): %s.\n", strerror(err)), exit(1);
+
+ err = pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_JOINABLE);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (joinable): %s.\n", strerror(err)), exit(1);
+
+#if 0
+ err = pthread_attr_getschedparam(&threadAttr, &threadSchedParamDef);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (get default param): %s.\n", strerror(err)), exit(1);
+
+ err = pthread_attr_getschedparam(&threadAttr, &threadSchedParamMax);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (get max param): %s.\n", strerror(err)), exit(1);
+#endif
+
+ // SCHED_RR needs root privs to run on FreeBSD.
+ //
+ //err = pthread_attr_setschedpolicy(&threadAttr, SCHED_RR);
+ //if (err)
+ // fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (sched policy): %s.\n", strerror(err)), exit(1);
+
+#if 0
+ err = pthread_attr_getschedpolicy(&threadAttr, &threadSchedPolicy);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to configure pthreads (sched policy): %s.\n", strerror(err)), exit(1);
+
+ errno = 0;
+ threadSchedParamMax.sched_priority = sched_get_priority_max(threadSchedPolicy);
+ if (errno)
+ fprintf(stderr, "sweatShop::run()-- WARNING: Failed to configure pthreads (set max param priority): %s.\n", strerror(errno));
+
+ // Fire off the loader
+
+ err = pthread_attr_setschedparam(&threadAttr, &threadSchedParamMax);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to set loader priority: %s.\n", strerror(err)), exit(1);
+#endif
+
+ err = pthread_create(&threadIDloader, &threadAttr, _sweatshop_loaderThread, this);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to launch loader thread: %s.\n", strerror(err)), exit(1);
+
+ // Wait for it to actually load something (otherwise all the
+ // workers immediately go home)
+
+ while (!_writerP && !_workerP && !_loaderP) {
+ struct timespec naptime;
+ naptime.tv_sec = 0;
+ naptime.tv_nsec = 250000ULL;
+ nanosleep(&naptime, 0L);
+ }
+
+ // Start the statistics and writer
+#if 0
+ err = pthread_attr_setschedparam(&threadAttr, &threadSchedParamMax);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to set status and writer priority: %s.\n", strerror(err)), exit(1);
+#endif
+
+ err = pthread_create(&threadIDstats, &threadAttr, _sweatshop_statusThread, this);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to launch status thread: %s.\n", strerror(err)), exit(1);
+
+ err = pthread_create(&threadIDwriter, &threadAttr, _sweatshop_writerThread, this);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to launch writer thread: %s.\n", strerror(err)), exit(1);
+
+ // And some labor
+#if 0
+ err = pthread_attr_setschedparam(&threadAttr, &threadSchedParamDef);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to set worker priority: %s.\n", strerror(err)), exit(1);
+#endif
+
+ for (uint32 i=0; i<_numberOfWorkers; i++) {
+ err = pthread_create(&_workerData[i].threadID, &threadAttr, _sweatshop_workerThread, _workerData + i);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to launch worker thread "uint32FMT": %s.\n", i, strerror(err)), exit(1);
+ }
+
+ // Now sit back and relax.
+ err = pthread_join(threadIDloader, 0L);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to join loader thread: %s.\n", strerror(err)), exit(1);
+
+ err = pthread_join(threadIDwriter, 0L);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to join writer thread: %s.\n", strerror(err)), exit(1);
+
+ err = pthread_join(threadIDstats, 0L);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to join status thread: %s.\n", strerror(err)), exit(1);
+
+ for (uint32 i=0; i<_numberOfWorkers; i++) {
+ err = pthread_join(_workerData[i].threadID, 0L);
+ if (err)
+ fprintf(stderr, "sweatShop::run()-- Failed to join worker thread "uint32FMT": %s.\n", i, strerror(err)), exit(1);
+ }
+
+ // Cleanup. Every thread has been joined, so nothing else can still be using these.
+ for (uint32 i=0; i<_numberOfWorkers; i++) {
+ delete [] _workerData[i].workerQueue; // allocated above, once per run()
+ _workerData[i].workerQueue = 0L;
+ }
+ pthread_attr_destroy(&threadAttr);
+ pthread_mutex_destroy(&_stateMutex);
+
+ delete _loaderP;
+ _loaderP = _workerP = _writerP = 0L;
+}
diff --git a/libutil/sweatShop.H b/libutil/sweatShop.H
new file mode 100644
index 0000000..ef1bf2c
--- /dev/null
+++ b/libutil/sweatShop.H
@@ -0,0 +1,81 @@
+#ifndef SWEATSHOP_H
+#define SWEATSHOP_H
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "util++.H"
+
+class sweatShopWorker;
+class sweatShopState;
+
+class sweatShop {
+public:
+ sweatShop(void*(*loaderfcn)(void *G), // returns the next object, or null when input is exhausted
+ void (*workerfcn)(void *G, void *T, void *S), // G=global data, T=per-thread data, S=object
+ void (*writerfcn)(void *G, void *S)); // G=global data, S=object
+ ~sweatShop();
+ // NOTE(review): setThreadData() sizes its array from the count at its first call -- presumably set the count first.
+ void setNumberOfWorkers(uint32 x) {
+ _numberOfWorkers = x;
+ _loaderQueueMin = x * 2;
+ };
+
+ void setThreadData(uint32 t, void *x);
+
+ void setLoaderBatchSize(uint32 batchSize) { _loaderBatchSize = batchSize; };
+ void setLoaderQueueSize(uint32 queueSize) { _loaderQueueSize = queueSize; _loaderQueueMax = queueSize; };
+
+ void setWorkerBatchSize(uint32 batchSize) { _workerBatchSize = batchSize; };
+
+ void setWriterQueueSize(uint32 queueSize) { _writerQueueSize = queueSize; _writerQueueMax = queueSize; };
+ // 'user' becomes the global data (G) handed to the callbacks.
+ void run(void *user=0L, bool beVerbose=false);
+private:
+
+ // Stubs that forward control from the c-based pthread to this class
+ friend void *_sweatshop_loaderThread(void *ss);
+ friend void *_sweatshop_workerThread(void *ss);
+ friend void *_sweatshop_writerThread(void *ss);
+ friend void *_sweatshop_statusThread(void *ss);
+
+ // The threaded routines
+ void *loader(void);
+ void *worker(sweatShopWorker *workerData);
+ void *writer(void);
+ void *status(void);
+
+ // Utilities for the loader thread
+ //void loaderAdd(sweatShopState *thisState);
+ void loaderSave(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState);
+ void loaderAppend(sweatShopState *&tail, sweatShopState *&head);
+
+ pthread_mutex_t _stateMutex;
+
+ void *(*_userLoader)(void *global);
+ void (*_userWorker)(void *global, void *thread, void *thing);
+ void (*_userWriter)(void *global, void *thing);
+
+ void *_globalUserData;
+
+ sweatShopState *_writerP; // Where output takes stuff from, the tail
+ sweatShopState *_workerP; // Where computes happen, the middle
+ sweatShopState *_loaderP; // Where input is put, the head
+
+ bool _showStatus;
+
+ uint32 _loaderQueueSize, _loaderQueueMin, _loaderQueueMax;
+ uint32 _loaderBatchSize;
+ uint32 _workerBatchSize;
+ uint32 _writerQueueSize, _writerQueueMax;
+
+ uint32 _numberOfWorkers;
+
+ sweatShopWorker *_workerData;
+
+ uint64 _numberLoaded;
+ uint64 _numberComputed;
+ uint64 _numberOutput;
+};
+
+#endif // SWEATSHOP_H
diff --git a/libutil/test/Makefile b/libutil/test/Makefile
new file mode 100644
index 0000000..c686ee1
--- /dev/null
+++ b/libutil/test/Makefile
@@ -0,0 +1,106 @@
+PROG = test-bigQueue \
+ test-bitPackedArray \
+ test-bitPackedFile \
+ test-bitPacking \
+ test-freeDiskSpace \
+ test-intervalList \
+ test-logMsg \
+ test-md5 \
+ test-mmap \
+ test-palloc \
+ test-readBuffer \
+ test-recordFile \
+ test-types \
+ tcat
+
+# Broken, don't test.
+#test-bzipBuffer
+
+INCLUDE = -I..
+LIBS = -L.. -lutil -lm
+OBJS =
+
+include ../../Make.compilers
+# Every target in PROG except tcat also runs its program, so the echo is reached only if each recipe succeeded.
+all: $(PROG)
+ @echo Tests passed!
+
+test-bigQueue: test-bigQueue.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bigQueue.o test-bigQueue.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bigQueue test-bigQueue.o $(LIBS)
+ time ./test-bigQueue
+ time sort -k2n < junk-bigQueue-out-2 > junk-bigQueue-out-2.s
+ diff junk-bigQueue-out-1 junk-bigQueue-out-2.s
+
+test-bitPackedArray: test-bitPackedArray.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bitPackedArray.o test-bitPackedArray.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bitPackedArray test-bitPackedArray.o $(LIBS)
+ ./test-bitPackedArray
+
+test-bitPackedFile: test-bitPackedFile.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bitPackedFile.o test-bitPackedFile.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bitPackedFile test-bitPackedFile.o $(LIBS) -lbz2
+ ./test-bitPackedFile
+
+test-bitPacking: test-bitPacking.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bitPacking.o test-bitPacking.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bitPacking test-bitPacking.o $(LIBS)
+ ./test-bitPacking
+
+test-freeDiskSpace: test-freeDiskSpace.c ../libutil.a
+ $(CC) $(CFLAGS_COMPILE) -c -o test-freeDiskSpace.o test-freeDiskSpace.c $(INCLUDE)
+ $(CC) $(CLDFLAGS) -o test-freeDiskSpace test-freeDiskSpace.o $(LIBS)
+ ./test-freeDiskSpace test-freeDiskSpace.c
+
+test-intervalList: test-intervalList.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-intervalList.o test-intervalList.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-intervalList test-intervalList.o $(LIBS)
+ ./test-intervalList
+
+test-logMsg: test-logMsg.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-logMsg.o test-logMsg.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-logMsg test-logMsg.o $(LIBS)
+ ./test-logMsg | wc
+
+test-md5: test-md5.c ../libutil.a ../util.h ../util++.H
+ $(CC) $(CFLAGS_COMPILE) -c -o test-md5.o test-md5.c $(INCLUDE)
+ $(CC) $(CLDFLAGS) -o test-md5 test-md5.o $(LIBS)
+ ./test-md5
+
+test-mmap: test-mmap.c ../libutil.a ../util.h ../util++.H
+ $(CC) $(CFLAGS_COMPILE) -c -o test-mmap.o test-mmap.c $(INCLUDE)
+ $(CC) $(CLDFLAGS) -o test-mmap test-mmap.o $(LIBS)
+ ./test-mmap 16
+
+test-palloc: test-palloc.c ../libutil.a ../util.h ../util++.H
+ $(CC) $(CFLAGS_COMPILE) -c -o test-palloc.o test-palloc.c $(INCLUDE)
+ $(CC) $(CLDFLAGS) -o test-palloc test-palloc.o $(LIBS)
+ ./test-palloc
+
+test-readBuffer: test-readBuffer.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-readBuffer.o test-readBuffer.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-readBuffer test-readBuffer.o $(LIBS)
+ ./test-readBuffer test-readBuffer
+
+test-recordFile: test-recordFile.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-recordFile.o test-recordFile.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-recordFile test-recordFile.o $(LIBS)
+ ./test-recordFile
+
+test-bzipBuffer: test-bzipBuffer.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o test-bzipBuffer.o test-bzipBuffer.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o test-bzipBuffer test-bzipBuffer.o -lbz2 $(LIBS)
+ bzip2 -9vc ./test-bzipBuffer > ./test-bzipBuffer.bz2
+ ./test-bzipBuffer ./test-bzipBuffer.bz2
+
+test-types: test-types.c ../libutil.a ../util.h ../util++.H
+ $(CC) $(CFLAGS_COMPILE) -c -o test-types.o test-types.c $(INCLUDE)
+ $(CC) $(CLDFLAGS) -o test-types test-types.o $(LIBS)
+ ./test-types
+
+tcat: tcat.C ../libutil.a ../util.h ../util++.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o tcat.o tcat.C $(INCLUDE) $(THREADS)
+ $(CXX) $(CXXLDFLAGS) -o tcat tcat.o $(LIBS) $(THREADL)
+
+clean:
+ rm -f $(PROG) *.o *junk*
diff --git a/libutil/test/atomic.C b/libutil/test/atomic.C
new file mode 100644
index 0000000..c9acd3e
--- /dev/null
+++ b/libutil/test/atomic.C
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <semaphore.h>
+
+// Tests if add is atomic.
+
+long int count = 0;
+long int counts[8] = { 0 };
+pthread_t threadID[8];
+
+double
+getTime(void) {
+ struct timeval now; // filled by gettimeofday(): whole seconds plus microseconds
+ gettimeofday(&now, NULL);
+ return((double)now.tv_usec / 1000000.0 + now.tv_sec);
+}
+
+void*
+workerThread(void *idx) {
+ long int &c = (*(long int *)idx); // this thread's private counter
+ double start = getTime();
+ while (getTime() - start < 5) { // for five seconds, bump the private and the shared counter in step
+ c++; count++;
+ c++; count++;
+ c++; count++;
+ c++; count++;
+ c++; count++;
+ c++; count++;
+ }
+ return(0L); // a thread start routine must return a value
+}
+
+int
+main(int argc, char **argv) {
+ pthread_attr_t threadAttr;
+ // Start the threads, wait for them, then compare the per-thread totals with the shared counter.
+ pthread_attr_init(&threadAttr);
+ pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
+ pthread_attr_setschedpolicy(&threadAttr, SCHED_OTHER);
+
+ int numThreads = 5;
+ long int sum = 0; // same width as the counters it totals, and what %ld below expects
+
+ for (int i=0; i<numThreads; i++)
+ pthread_create(threadID+i, &threadAttr, workerThread, (void *)(counts + i));
+
+ for (int i=0; i<numThreads; i++) {
+ pthread_join(threadID[i], 0L);
+ fprintf(stderr, "thread %2d %ld\n", i, counts[i]);
+ sum += counts[i];
+ }
+
+ fprintf(stderr, "total sum: %ld total global: %ld\n", sum, count);
+}
+
+
diff --git a/libutil/test/endianess.c b/libutil/test/endianess.c
new file mode 100644
index 0000000..a3e891b
--- /dev/null
+++ b/libutil/test/endianess.c
@@ -0,0 +1,124 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/param.h> // BYTE_ORDER
+
+#include "/home/work/src/genomics/libutil/util.h"
+
+// Reports the byte order, writes words to files for testing.
+
+#if 0
+
+begin 644 test-alpha
+.`@$$`P(!"`<&!00#`@$`
+`
+end
+begin 644 test-i386
+.`@$$`P(!"`<&!00#`@$`
+`
+end
+begin 644 test-opteron
+.`@$$`P(!"`<&!00#`@$`
+`
+end
+begin 644 test-power
+.`0(!`@,$`0(#!`4&!P@`
+`
+end
+
+#endif
+
+
+int
+isBig1(void) {
+ uint64 probe = uint64ONE;
+ char *first = (char *)(&probe);
+
+ // If the first byte in memory holds the 1, the low-order byte comes first: not big-endian.
+ return((*first == 1) ? 0 : 1);
+}
+
+
+// supposedly due to Harbison and Steele
+int
+isBig2(void) {
+ union {
+ uint64 word;
+ char byte[sizeof(uint64)];
+ } probe;
+
+ probe.word = uint64ONE;
+
+#if 0
+ fprintf(stderr, "%d%d%d%d%d%d%d%d\n",
+ probe.byte[0], probe.byte[1], probe.byte[2], probe.byte[3],
+ probe.byte[4], probe.byte[5], probe.byte[6], probe.byte[7]);
+#endif
+
+ // 0 when the LSB is stored first (little-endian),
+ // 1 when the MSB is stored first (big-endian).
+ return((probe.byte[0] == 1) ? 0 : 1);
+}
+
+
+
+int
+main(int argc, char **argv) {
+ uint16 u16 = 0x0102;
+ uint32 u32 = uint32NUMBER(0x01020304);
+ uint64 u64 = uint64NUMBER(0x0102030405060708);
+
+ fprintf(stderr, "BYTE_ORDER = %d\n", BYTE_ORDER);
+
+ fprintf(stderr, " BIG_ENDIAN = %d\n", BIG_ENDIAN);
+ fprintf(stderr, " LITTLE_ENDIAN = %d\n", LITTLE_ENDIAN);
+ fprintf(stderr, " PDP_ENDIAN = %d\n", PDP_ENDIAN);
+
+ fprintf(stderr, "isBig1() = %d\n", isBig1());
+ fprintf(stderr, "isBig2() = %d\n", isBig2());
+
+ if (argc == 1) {
+ fprintf(stderr, "usage: %s [ write | read ] < source > check\n", argv[0]);
+ exit(1);
+ }
+
+ if (strcmp(argv[1], "write") == 0) {
+ fwrite(&u16, sizeof(uint16), 1, stdout);
+ fwrite(&u32, sizeof(uint32), 1, stdout);
+ fwrite(&u64, sizeof(uint64), 1, stdout);
+ return(0);
+ }
+ // Any other mode reads the three words back. A short read must fail: the variables still hold the expected values.
+ if ((fread(&u16, sizeof(uint16), 1, stdin) != 1) ||
+ (fread(&u32, sizeof(uint32), 1, stdin) != 1) ||
+ (fread(&u64, sizeof(uint64), 1, stdin) != 1))
+ fprintf(stderr, "short read on stdin\n"), exit(1);
+#if 0
+ // swap bytes to convert u16
+ u16 = (((u16 >> 8) & 0x00ff) |
+ ((u16 << 8) & 0xff00));
+
+ // swap bytes, then swap words to convert u32
+ u32 = (((u32 >> 24) & 0x000000ff) |
+ ((u32 >> 8) & 0x0000ff00) |
+ ((u32 << 8) & 0x00ff0000) |
+ ((u32 << 24) & 0xff000000));
+
+ // swap bytes, then flip words [0<->3, 1<->2] to convert u64
+ u64 = (((u64 >> 24) & uint64NUMBER(0x000000ff000000ff)) |
+ ((u64 >> 8) & uint64NUMBER(0x0000ff000000ff00)) |
+ ((u64 << 8) & uint64NUMBER(0x00ff000000ff0000)) |
+ ((u64 << 24) & uint64NUMBER(0xff000000ff000000)));
+ u64 = (((u64 >> 32) & uint64NUMBER(0x00000000ffffffff)) |
+ ((u64 << 32) & uint64NUMBER(0xffffffff00000000)));
+#endif
+ // Compare with the values 'write' emits; data written with the other byte order will differ.
+ if (u16 != 0x0102)
+ fprintf(stderr, "u16 -- 0x%04x correct=0x%04x\n", u16, 0x0102);
+ if (u32 != uint32NUMBER(0x01020304))
+ fprintf(stderr, "u32 -- "uint32HEX" correct="uint32HEX"\n", u32, uint32NUMBER(0x01020304));
+ if (u64 != uint64NUMBER(0x0102030405060708))
+ fprintf(stderr, "u64 -- "uint64HEX" correct="uint64HEX"\n", u64, uint64NUMBER(0x0102030405060708));
+
+ return(0);
+}
+
diff --git a/libutil/test/order.C b/libutil/test/order.C
new file mode 100644
index 0000000..11b5c13
--- /dev/null
+++ b/libutil/test/order.C
@@ -0,0 +1,84 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../util.h"
+
+//#include <sys/param.h>
+
+union u64 { // overlays a 64-bit word with its 8 bytes in memory order
+ uint64 u;
+ unsigned char c[8];
+};
+
+union u32 { // overlays a 32-bit word with its 4 bytes in memory order
+ uint32 u;
+ unsigned char c[4];
+};
+
+union u16 { // overlays a 16-bit word with its 2 bytes in memory order
+ uint16 u;
+ unsigned char c[2];
+};
+
+
+uint64
+uint64Swap(uint64 x) {
+ // Reverse the byte order in three passes: swap adjacent bytes, then 16-bit pairs, then 32-bit halves.
+ uint64 bytes = ((x << 8) & uint64NUMBER(0xff00ff00ff00ff00)) | ((x >> 8) & uint64NUMBER(0x00ff00ff00ff00ff));
+ uint64 pairs = ((bytes << 16) & uint64NUMBER(0xffff0000ffff0000)) | ((bytes >> 16) & uint64NUMBER(0x0000ffff0000ffff));
+ return(((pairs << 32) & uint64NUMBER(0xffffffff00000000)) | ((pairs >> 32) & uint64NUMBER(0x00000000ffffffff)));
+}
+
+uint32
+uint32Swap(uint32 x) {
+ // Reverse the byte order: swap adjacent bytes, then the two 16-bit halves.
+ uint32 bytes = ((x << 8) & uint32NUMBER(0xff00ff00)) | ((x >> 8) & uint32NUMBER(0x00ff00ff));
+ return(((bytes << 16) & uint32NUMBER(0xffff0000)) | ((bytes >> 16) & uint32NUMBER(0x0000ffff)));
+}
+
+uint16
+uint16Swap(uint16 x) {
+ // Exchange the two bytes.
+ return(((x << 8) & 0xff00) | ((x >> 8) & 0x00ff));
+}
+
+
+
+// Print n bytes in memory order as two hex digits each, then a newline.
+static void
+dumpBytes(unsigned char *c, int n) {
+ for (int i=0; i<n; i++)
+ fprintf(stderr, "%02x", c[i]);
+ fprintf(stderr, "\n");
+}
+
+int
+main(int argc, char **argv) {
+ u64 u64v;
+ u32 u32v;
+ u16 u16v;
+
+ // Known patterns with a different value in every byte.
+ u64v.u = 0x1234567890abcdefLLU;
+ u32v.u = 0x12345678;
+ u16v.u = 0x1234;
+
+ // Show how this machine lays each word out in memory...
+ dumpBytes(u64v.c, 8);
+ dumpBytes(u32v.c, 4);
+ dumpBytes(u16v.c, 2);
+
+ // ...then reverse the byte order of each word...
+ u64v.u = uint64Swap(u64v.u);
+ u32v.u = uint32Swap(u32v.u);
+ u16v.u = uint16Swap(u16v.u);
+
+ // ...and show the layout again; each line should now be the
+ // byte-wise mirror image of the corresponding line above.
+ dumpBytes(u64v.c, 8);
+ dumpBytes(u32v.c, 4);
+ dumpBytes(u16v.c, 2);
+
+ // Same exit status as falling off the end of main.
+ return(0);
+}
diff --git a/libutil/test/tcat.C b/libutil/test/tcat.C
new file mode 100644
index 0000000..b2bcdc2
--- /dev/null
+++ b/libutil/test/tcat.C
@@ -0,0 +1,86 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <time.h>
+#include "sweatShop.H"
+
+// Reads stdin, writes stdout. Uses threads.
+
+int blockSize = 8192;  // bytes requested per read; main() overrides it with -b
+
+struct tcat_s {  // one block of data passed from tcatReader to tcatWriter
+ int dataLen;  // value returned by safeRead() for this block
+ char *data;  // buffer of blockSize chars allocated in tcatReader
+};
+
+// Reader callback given to sweatShop: read one block from stdin into a new
+// tcat_s.  Returns 0L, with nothing left allocated, when safeRead() returns 0.
+void *tcatReader(void *) {
+  char *buf = new char [blockSize];
+  int   len = safeRead(STDIN_FILENO, buf, "tcatReader", sizeof(char) * blockSize);
+  if (len == 0) {
+    delete [] buf;
+    return(0L);
+  }
+
+  tcat_s *blk  = new tcat_s;
+  blk->data    = buf;
+  blk->dataLen = len;
+  return(blk);
+}
+
+void
+tcatWorker(void *, void *, void *) {
+ // Noop!  Blocks go from tcatReader to tcatWriter unchanged, so all three arguments are ignored.
+}
+
+// Writer callback given to sweatShop: copy one block to stdout, then free it.
+void tcatWriter(void *, void *S) {
+  tcat_s *blk = static_cast<tcat_s *>(S);
+
+  safeWrite(STDOUT_FILENO, blk->data, "tcatWriter", sizeof(char) * blk->dataLen);
+
+  delete [] blk->data;
+  delete blk;
+}
+
+
+// Copy stdin to stdout through a sweatShop.  -b sets the block size in bytes,
+// -r and -w the read and write buffer sizes in MB; usage errors exit with 1.
+int main(int argc, char **argv) {
+  int readBuf = 64;
+  int writBuf = 64;
+  int err     = 0;
+  for (int arg=1; arg < argc; arg++) {
+    int *opt = 0L;
+    if      (strcmp(argv[arg], "-r") == 0)  opt = &readBuf;
+    else if (strcmp(argv[arg], "-w") == 0)  opt = &writBuf;
+    else if (strcmp(argv[arg], "-b") == 0)  opt = &blockSize;
+
+    if (opt == 0L) {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+      err++;
+    } else if (arg + 1 >= argc) {  // atoi() used to be handed argv[argc], a null pointer
+      fprintf(stderr, "option '%s' requires a value\n", argv[arg]);
+      err++;
+    } else {
+      *opt = atoi(argv[++arg]);
+    }
+  }
+
+  if (blockSize <= 0)  // it is an array length in tcatReader and a divisor below
+    fprintf(stderr, "block size must be positive\n"), err++;
+
+  if (err) {
+    fprintf(stderr, "usage: %s [-b blockSizeBytes] [-r readBufferSizeMB] [-w writeBufferSizeMB]\n", argv[0]);
+    exit(1);
+  }
+
+  // Widen before multiplying: readBuf * 1024 * 1024 overflows int from 2048MB up.
+  sweatShop *ss = new sweatShop(tcatReader, tcatWorker, tcatWriter);
+  ss->setLoaderQueueSize((int)((long long)readBuf * 1024 * 1024 / blockSize));
+  ss->setWriterQueueSize((int)((long long)writBuf * 1024 * 1024 / blockSize));
+  ss->run();
+  exit(0);
+}
diff --git a/libutil/test/test-bigQueue.C b/libutil/test/test-bigQueue.C
new file mode 100644
index 0000000..b62a1ae
--- /dev/null
+++ b/libutil/test/test-bigQueue.C
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "util++.H"
+
+//
+// mbri && CC -g -o test-bigQueue test-bigQueue.C -L. -lutil && ./test-bigQueue | & more
+//
+
+struct thing_s {  // record that main() adds to the bigQueue
+ int a;  // first key compared by sortthing(); main() sets it to mtRandom32(mtctx) / 4
+ int b;  // second key compared by sortthing(); main() sets it to the loop index i
+ double c;  // set to (double)i; not compared
+ int d;  // set to -i; not compared
+};
+
+
+// Comparator handed to bigQueue.  Each argument points at a thing_s pointer;
+// things are ordered by field a, and by field b when a is equal.
+int sortthing(const void *a, const void *b) {
+  const thing_s *lhs = *((thing_s **)a);
+  const thing_s *rhs = *((thing_s **)b);
+
+  if (lhs->a != rhs->a)
+    return((lhs->a < rhs->a) ? -1 : 1);
+
+  if (lhs->b != rhs->b)
+    return((lhs->b < rhs->b) ? -1 : 1);
+
+  // Both keys match.
+  return(0);
+}
+
+
+// Add 2000000 pseudo-random records to a bigQueue, logging them to
+// junk-bigQueue-out-1 as added and to junk-bigQueue-out-2 as read back after sort().
+int main(int argc, char **argv) {
+  bigQueue *queue = new bigQueue(sortthing, 0L, 0L, 0L, sizeof(thing_s), 1, 0L);
+  mt_s     *rng   = mtInit(3);
+  int       count = 2000000;
+
+  // Log of the records in the order they are added.
+  FILE *log = fopen("junk-bigQueue-out-1", "w");
+
+  for (int n=0; n<count; n++) {
+    thing_s *rec = new thing_s;
+
+    rec->a = mtRandom32(rng) / 4;
+    rec->b = n;
+    rec->c = (double)n;
+    rec->d = -n;
+
+    fprintf(log, "%012d %08d %12.3f %08d\n", rec->a, rec->b, rec->c, rec->d);
+    queue->add(rec);
+  }
+
+  // Log of the records in the order next()/get() return them after sort().
+  fclose(log);
+  log = fopen("junk-bigQueue-out-2", "w");
+
+  queue->sort();
+
+  while (queue->next()) {
+    thing_s *rec = (thing_s *)queue->get();
+
+    fprintf(log, "%012d %08d %12.3f %08d\n", rec->a, rec->b, rec->c, rec->d);
+  }
+
+  delete queue;
+  fclose(log);
+}
diff --git a/libutil/test/test-bitPackedArray.C b/libutil/test/test-bitPackedArray.C
new file mode 100644
index 0000000..e31417e
--- /dev/null
+++ b/libutil/test/test-bitPackedArray.C
@@ -0,0 +1,152 @@
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+
+#include "util++.H"
+
+uint32 wordSize = 41;  // width given to the bitPackedArray constructor and to uint64MASK()
+uint32 testSize = 1 * 1024 * 1024;  // number of set() calls; NOTE(review): equals arrySize, so main()'s random-overwrite loop runs zero times
+uint32 arrySize = 1 * 1024 * 1024;  // number of array elements written and then validated
+
+// qsort() comparator: orders two uint64 values ascending.
+int uint64compare(const void *a, const void *b) {
+  const uint64 lhs = *static_cast<const uint64 *>(a);
+  const uint64 rhs = *static_cast<const uint64 *>(b);
+  if (lhs == rhs)
+    return(0);
+  return((lhs < rhs) ? -1 : 1);
+}
+
+// Test bitPackedArray by writing random values and reading them back, then test
+// bitPackedHeap by draining 32 random heaps against a qsort()ed copy of their input.
+int main(int argc, char **argv) {
+  mt_s *mtctx = mtInit(time(NULL));
+
+  // Test the bitPackedArray by writing a bunch of random gibberish
+  // to it, and see if it's the same.
+
+  uint32 *pos = new uint32 [testSize];
+  uint64 *val = new uint64 [testSize];
+  uint64 *ans = new uint64 [arrySize];
+
+  bitPackedArray *ARR = new bitPackedArray(wordSize, 16);
+  uint32 fail = uint32ZERO;
+
+#if 1
+  fprintf(stderr, "Touching the end of the array and clearing.\n");
+  //ARR->set(arrySize, 0);
+  //ARR->clear();
+
+  fprintf(stderr, "Generating random test data.\n");
+
+  // Hit every element first, just to do it
+  for (uint32 i=0; i<arrySize; i++) {
+    pos[i] = i;
+    val[i] = mtRandom64(mtctx);
+    val[i] &= uint64MASK(wordSize);
+    ans[pos[i]] = val[i];
+  }
+
+  // Then hit random elements, with replacement, looking for bugs
+  for (uint32 i=arrySize; i<testSize; i++) {
+    pos[i] = mtRandom32(mtctx) % arrySize;
+    val[i] = mtRandom64(mtctx);
+    val[i] &= uint64MASK(wordSize);
+    ans[pos[i]] = val[i];
+  }
+
+  fprintf(stderr, "Filling array.\n");
+
+  for (uint32 i=0; i<testSize; i++)
+    ARR->set(pos[i], val[i]);
+
+  fprintf(stderr, "Validating array.\n");
+
+  for (uint32 i=0; i<arrySize; i++)
+    if (ARR->get(i) != ans[i]) {
+      fprintf(stderr, "FAIL at i=" uint32FMT "\n", i);
+      fail++;
+
+      if (fail > 1024) {
+        fprintf(stderr, "bitPackedArray has errors, aborting!\n");
+        return(1);
+      }
+    }
+
+  if (fail) {
+    fprintf(stderr, "bitPackedArray had " uint32FMT " errors.\n", fail);
+    return(1);
+  }
+
+  fprintf(stderr, "OK!\n");
+#endif
+
+  delete ARR;
+  delete [] pos;
+  delete [] val;
+  delete [] ans;
+
+  // Now the bitPackedHeap: add random values, then check that get() returns
+  // them in the same order as the qsort()ed copy in val[].  The val pointer is
+  // reused for a fresh array on every iteration.
+
+  for (uint32 testNum=0; testNum<32; testNum++) {
+    uint32 thisTestSize = 0;
+    uint32 thisWordSize = 0;
+
+    // Test a BIG heap the first iteration.
+    if (testNum == 0) {
+      thisTestSize = 857353; //23987153;
+      thisWordSize = 63;
+
+      fprintf(stderr, "Building heap " uint32FMT " (wordsize=" uint32FMT " testsize=" uint32FMT ").\n",
+              testNum, thisWordSize, thisTestSize);
+    } else {
+      thisTestSize = (mtRandom64(mtctx) % (2 * testNum)) * 1024 + 1024;
+      thisWordSize = (mtRandom64(mtctx) % 63) + 1;
+    }
+
+    uint32 blockSize = mtRandom64(mtctx) % 32 + 1;
+    bitPackedHeap *HEAP = new bitPackedHeap(thisWordSize, blockSize);
+
+    val = new uint64 [thisTestSize];
+    for (uint32 i=0; i<thisTestSize; i++) {
+      val[i] = mtRandom64(mtctx);
+      val[i] &= uint64MASK(thisWordSize);
+      HEAP->add(val[i]);
+    }
+
+    fprintf(stderr, "Testing heap " uint32FMT " (wordsize=" uint32FMT " testsize=" uint32FMT ").\n",
+            testNum, thisWordSize, thisTestSize);
+
+    qsort(val, thisTestSize, sizeof(uint64), uint64compare);
+
+    for (uint32 i=0; i<thisTestSize; i++) {
+      uint64 h = HEAP->get();
+
+      //fprintf(stderr, "val[" uint32FMT "]=" uint64FMT " -- HEAP=" uint64FMT "\n", i, val[i], h);
+
+      if (val[i] != h) {
+        fprintf(stderr, "val[" uint32FMT "]=" uint64FMT " !! HEAP=" uint64FMT "\n", i, val[i], h);
+        fail++;
+        if (fail > 25) {
+          fprintf(stderr, "bitPackedHeap has errors, aborting!\n");
+          return(1);
+        }
+      }
+    }
+
+    if (fail) {
+      fprintf(stderr, "bitPackedHeap had " uint32FMT " errors.!\n", fail);
+      return(1);
+    }
+
+    delete HEAP;
+    delete [] val;
+  }
+
+  fprintf(stderr, "OK!\n");
+
+  return(fail);
+}
+
diff --git a/libutil/test/test-bitPackedFile.C b/libutil/test/test-bitPackedFile.C
new file mode 100644
index 0000000..ea0c824
--- /dev/null
+++ b/libutil/test/test-bitPackedFile.C
@@ -0,0 +1,271 @@
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+
+#include "util++.H"
+
+// This will perform various tests on the bitPackedFile class,
+// returning 0 if OK and 1 if error.
+//
+// testSize -- the number of words to use in a write then read test
+// testIter -- the number of random access tests to do
+
+uint32 testSize = 2000000;
+uint32 testIter = 50;
+
+mt_s *mtctx;
+
+// Generate a list of random 64-bit numbers, remember the number and the size
+//
+void
+generateRandom(uint32 *siz, uint64 *val) {  // fills siz[0..testSize) and val[0..testSize) using the global mtctx
+
+ for (uint32 i=0; i<testSize; i++) {
+#if 0
+ // For debugging
+ siz[i] = 13;
+ val[i] = (i % 2) ? uint64ZERO : ~uint64ZERO;
+#else
+ siz[i] = (mtRandom32(mtctx) % 63) + 1;  // a size in 1..63
+ val[i] = mtRandom64(mtctx);
+#endif
+ val[i] &= uint64MASK(siz[i]);  // presumably keeps only the low siz[i] bits -- uint64MASK is defined elsewhere, confirm
+ }
+}
+
+
+// Stream test: putBits()/putNumber() random values, reopen the file, and verify them with getBits()/getNumber().
+void testStreaming(void) {
+  bitPackedFile *F = 0L;
+  uint32 i;
+  uint32 *siz = new uint32 [testSize];
+  uint64 *val = new uint64 [testSize];
+  uint32 errs = 0;
+
+  generateRandom(siz, val);
+
+  // Write those numbers to a bitPackedFile, both binary encoded and
+  // fibonacci encoded.
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++) {
+    F->putBits(val[i], siz[i]);
+    F->putNumber(val[i]);
+  }
+  delete F;
+
+  // Open the file and check what we just wrote.
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++) {
+    uint64 v;
+
+    v = F->getBits(siz[i]);
+    if (v != val[i]) {
+      fprintf(stderr, uint32FMT "] ERROR in getBits() -- retrieved " uint64HEX " != expected " uint64HEX " (" uint32FMT " bits).\n", i, v, val[i], siz[i]);
+      errs++;
+    }
+
+    v = F->getNumber();
+    if (v != val[i]) {
+      fprintf(stderr, uint32FMT "] ERROR in getNumber() -- retrieved " uint64HEX " != expected " uint64HEX ".\n", i, v, val[i]);
+      errs++;
+    }
+  }
+  delete F;
+
+  delete [] val;
+  delete [] siz;
+
+  if (errs > 0) {
+    fprintf(stderr, "There are " uint32FMT " errors in the stream test.\n", errs);
+    exit(1);
+  } else {
+    fprintf(stderr, "The stream test PASSED.\n");
+  }
+
+  unlink("bittest.junk");
+}
+
+
+// Seek test: write testSize values, then testIter times seek() to a random one and getBits() it back.
+void testRandomReading(bool inCore) {
+  bitPackedFile *F = 0L;
+  uint32 i;
+  uint32 *siz = new uint32 [testSize + 1];
+  uint64 *val = new uint64 [testSize];
+  uint32 errs = 0;
+
+  fprintf(stderr, "BUILDING random test set.\n");
+  generateRandom(siz, val);
+
+  // Create a new bitpacked file, writing just numbers as binary encoded.
+  //
+  fprintf(stderr, "SAVING random test set.\n");
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++)
+    F->putBits(val[i], siz[i]);
+  delete F;
+
+  // Convert the siz[] into offsets: siz[k] becomes the sum of the sizes before
+  // element k, and siz[testSize] the total.
+  uint32 t = siz[0];
+  siz[0] = 0;
+  for (uint32 i=1; i<testSize; i++) {
+    uint32 x = siz[i];
+    siz[i] = t;
+    t += x;
+  }
+  siz[testSize] = t;
+
+  // Attempt to flush memory
+  //
+  {
+    uint32 ll = 400 * 1024 * 1024 / 8;
+    uint64 *xx = new uint64 [ll];
+    xx[0] = 1;
+    xx[1] = 1;
+    for (uint32 i=2; i<ll; i++)
+      xx[i] = xx[i-1] + xx[i-2];
+    fprintf(stdout, "FLUSHED: " uint64FMT "\n", xx[ll-1]);  // xx[] is uint64; it was printed with uint32FMT
+    delete [] xx;
+  }
+
+  // Do several seek tests. Seek to a random element, and read it.
+  //
+  F = new bitPackedFile("bittest.junk");
+
+  if (inCore) {
+    F->loadInCore();
+    fprintf(stderr, "Begin INCORE seek test!\n");
+  } else {
+    fprintf(stderr, "Begin DISKBASED seek test!\n");
+  }
+
+  double startTime = getTime();
+
+  for (i=0; i<testIter; i++) {
+    uint32 idx = (uint32)lrand48() % testSize;
+
+    F->seek(siz[idx]);
+    uint64 r = F->getBits(siz[idx+1] - siz[idx]);
+
+    if (r != val[idx]) {  // report the element tested (idx), not the one at the iteration count i
+      fprintf(stderr, uint32FMT "] ERROR in seek()/getBits() -- retrieved " uint64HEX " != expected " uint64HEX " (" uint32FMT " bits).\n", i, r, val[idx], siz[idx+1] - siz[idx]);
+      errs++;
+    }
+  }
+  delete F;
+
+  if (errs > 0) {
+    fprintf(stderr, "There are " uint32FMT " errors in the %s random access.\n", errs, (inCore) ? "inCore" : "disk");
+    exit(1);
+  } else {
+    fprintf(stderr, "The %s seek test PASSED (%f seconds).\n",
+            (inCore) ? "inCore" : "disk",
+            getTime() - startTime);
+  }
+
+  delete [] val;
+  delete [] siz;
+
+  unlink("bittest.junk");
+}
+
+
+
+
+
+// Rewrite test: zero-fill the file, seek()+putBits() the odd elements forwards and the even ones backwards, then verify.
+void testReWrite(void) {
+  bitPackedFile *F = 0L;
+  uint32 i;
+  uint32 *siz = new uint32 [testSize];
+  uint64 *val = new uint64 [testSize];
+  uint32 errs = 0;
+  uint64 pos = uint64ZERO;
+
+  generateRandom(siz, val);
+
+  // First, write zeros to the file
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++)
+    F->putBits(uint64ZERO, siz[i]);
+  delete F;
+
+  fprintf(stderr, "WRITING FORWARDS!\n");
+
+  // Now, write every other number to the file
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++) {
+    if ((i % 2) == 1) {
+      F->seek(pos);
+      F->putBits(val[i], siz[i]);
+    }
+    pos += siz[i];
+  }
+  F->showStats(stderr);
+  delete F;
+
+  fprintf(stderr, "WRITING BACKWARDS!\n");
+
+  // And go backwards and write the other set of numbers to the file
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=testSize; i--; ) {
+    pos -= siz[i];
+    if ((i % 2) == 0) {
+      F->seek(pos);
+      F->putBits(val[i], siz[i]);
+    }
+  }
+  F->showStats(stderr);
+  delete F;
+
+  // Now, stream through the file and see if we wrote what we should have
+  //
+  F = new bitPackedFile("bittest.junk");
+  for (i=0; i<testSize; i++) {
+    uint64 v;
+
+    v = F->getBits(siz[i]);
+    if (v != val[i]) {
+      fprintf(stderr, uint32FMT "] ERROR in seekstream/getBits() -- retrieved " uint64HEX " != expected " uint64HEX " (" uint32FMT " bits).\n", i, v, val[i], siz[i]);
+      errs++;
+    }
+  }
+  F->showStats(stderr);
+  delete F;
+
+  delete [] val;
+  delete [] siz;
+
+  if (errs > 0) {
+    fprintf(stderr, "There are " uint32FMT " errors in the rewrite test.\n", errs);
+    exit(1);
+  } else {
+    fprintf(stderr, "The rewrite test PASSED.\n");
+  }
+
+  unlink("bittest.junk");
+}
+
+
+
+int
+main(int argc, char **argv) {
+
+ mtctx = mtInit(time(NULL));  // generator used by generateRandom()
+
+ testSize = 30000000;  // sizes for the two tests below, which are commented out
+ testIter = 2000;
+ //testStreaming();
+ //testReWrite();
+
+ testSize = 40000000;  // sizes in effect for the random-access tests
+ testIter = 10000;
+ testRandomReading(false);  // seek test without loadInCore()
+ testRandomReading(true);  // seek test after loadInCore()
+}
diff --git a/libutil/test/test-bitPacking.C b/libutil/test/test-bitPacking.C
new file mode 100644
index 0000000..82cff83
--- /dev/null
+++ b/libutil/test/test-bitPacking.C
@@ -0,0 +1,337 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "util++.H"
+
+// An integer multiplier on the test length. 1 is pretty quick, but
+// 10 is the default.
+//
+#define TEST_LENGTH (10 * 1024 * 1024)
+
+// We test
+//
+// 1) binary encoding/decoding
+//
+// 2) pre/post increment of binary encoding
+//
+// 3) Perform some testing on the fibonacci encoded bit-packed stream
+// -- encode a bunch of random 64-bit numbers, make sure we can
+// decode back to the same number.
+//
+// NOTES: pre/post increment/decrement work modulo whatever size they
+// are. So, if you have a 6-bit value of zero, and you decrement,
+// you end up with a 6-bit value of all 1's, or 63.
+
+
+// Pack random values of random width into bits[], decode them back singly and in batches, and exit(1) on any mismatch.
+void testBinaryEncoding(void) {
+  time_t mtseed = time(0L);
+  mt_s *mtctx = 0L;
+
+  uint32 iterations = TEST_LENGTH;
+
+  uint64 *bits = new uint64 [iterations + 2];
+  uint64 bpos = uint64ZERO;
+
+  uint64 *V = new uint64 [iterations];
+  uint64 *C = new uint64 [iterations];
+  uint64 *S = new uint64 [iterations];
+
+  uint32 failed = 0;
+  uint32 errors = 0;
+
+  fprintf(stderr, "Starting test of binary encoding\n");
+
+  bpos = uint64ZERO;
+  mtctx = mtInit(mtseed);
+
+  // Build some values to stuff into the bits
+
+  for (uint32 j=0; j < iterations; j++) {
+    S[j] = (mtRandom32(mtctx) % 63) + 1;
+    V[j] = mtRandom64(mtctx) & uint64MASK(S[j]);
+    //fprintf(stderr, "[%2d] S=" uint64FMT " V=" uint64HEX "\n", j, S[j], V[j]);
+  }
+
+  // Stuff them in, in blocks of some size. At the same time, decode
+  // (this has found bugs in the past).
+
+  failed = 0;
+  for (uint32 j=0; j < iterations; ) {
+    uint64 num = (mtRandom32(mtctx) % 8);
+
+    if (j + num > iterations)
+      num = iterations - j;
+
+    if (num == 0) {
+      setDecodedValue(bits, bpos, S[j], V[j]);
+      C[j] = getDecodedValue(bits, bpos, S[j]);
+      //fprintf(stderr, "[%2d] V=" uint64HEX " C=" uint64HEX " single\n", j, V[j], C[j]);
+      bpos += S[j];
+    } else {
+      uint64 newp1 = setDecodedValues(bits, bpos, num, S+j, V+j);
+      uint64 newp2 = getDecodedValues(bits, bpos, num, S+j, C+j);
+
+      if (newp1 != newp2) {
+        // not perfect; we should be checking the values too, but we do that later.
+        for (uint32 x=0; x<num; x++)  // num is a uint64, so it is printed with uint64FMT
+          fprintf(stderr, "[%2d] #1 V=" uint64HEX " C=" uint64HEX " multiple " uint64FMT " %s\n",
+                  j+x, V[j+x], C[j+x], num, (V[j+x] == C[j+x]) ? "" : "FAILED");
+        failed++;
+      }
+
+      bpos = newp2;
+    }
+
+    j += num;
+    if (num == 0)
+      j++;
+  }
+  if (failed) {
+    fprintf(stderr, "binEncoding #1 failed encoding " uint32FMT " times.\n", failed);
+    errors++;
+  }
+
+  // Check that V == C
+
+  failed = 0;
+  for (uint32 j=0; j<iterations; j++) {
+    if (V[j] != C[j]) {
+      fprintf(stderr, "[%2d] #2 V=" uint64HEX " C=" uint64HEX " S=" uint64FMT "\n",
+              j, V[j], C[j], S[j]);
+      failed++;
+    }
+  }
+  if (failed) {
+    fprintf(stderr, "binEncoding #2 failed encode/decode " uint32FMT " times.\n", failed);
+    errors++;
+  }
+
+
+  // Decode independently, with different nums
+
+  bpos = 0; // reset to start of bits
+
+  for (uint32 j=0; j < iterations; ) {
+    uint64 num = (mtRandom32(mtctx) % 8);
+
+    if (j + num > iterations)
+      num = iterations - j;
+
+    if (num == 0) {
+      C[j] = getDecodedValue(bits, bpos, S[j]);
+      bpos += S[j];
+    } else {
+      bpos = getDecodedValues(bits, bpos, num, S+j, C+j);
+    }
+
+    j += num;
+    if (num == 0)
+      j++;
+  }
+
+  // Check that V == C
+
+  failed = 0;
+  for (uint32 j=0; j<iterations; j++) {
+    if (V[j] != C[j]) {
+      fprintf(stderr, "[%2d] #3 V=" uint64HEX " C=" uint64HEX " S=" uint64FMT "\n",
+              j, V[j], C[j], S[j]);
+      failed++;
+    }
+  }
+  if (failed) {
+    fprintf(stderr, "binEncoding #3 failed decoding " uint32FMT " times.\n", failed);
+    errors++;
+  }
+
+  // Clean.
+
+  delete [] bits;
+  delete [] V;
+  delete [] C;
+  delete [] S;
+
+  if (errors)
+    exit(1);
+}
+
+
+
+
+// Check the pre/post increment/decrement functions on packed values against plain arithmetic masked to siz1 bits.
+void testBinaryEncodingPrePost(void) {
+  time_t mtseed = time(0L);
+  mt_s *mtctx = 0L;
+
+  uint32 iterations = TEST_LENGTH;
+
+  uint64 *bits = new uint64 [2 * iterations];
+  uint64 bpos = uint64ZERO;
+  uint32 siz1 = uint32ZERO;
+  uint64 val1 = uint64ZERO;
+  uint64 val2 = uint64ZERO;
+
+  fprintf(stderr, "Starting test of binary encoding pre/post increment\n");
+
+  bpos = uint64ZERO;
+  mtctx = mtInit(mtseed);
+
+  for (uint32 j=0; j < iterations; j++) {
+    siz1 = (mtRandom32(mtctx) % 63) + 1;
+    val1 = mtRandom64(mtctx) & uint64MASK(siz1);
+
+    setDecodedValue(bits, bpos, siz1, val1);
+
+    val2 = postDecrementDecodedValue(bits, bpos, siz1);
+    if (val2 != val1) {
+      fprintf(stderr, "postDec1 failed: got " uint64FMT " expected " uint64FMT " siz=" uint32FMT "\n",
+              val2, val1, siz1);
+      exit(1);
+    }
+    val2 = getDecodedValue(bits, bpos, siz1) + 1;
+    val2 &= uint64MASK(siz1);
+    if (val2 != val1) {
+      fprintf(stderr, "postDec2 failed: got " uint64FMT " expected " uint64FMT " siz=" uint32FMT "\n",
+              val2, val1, siz1);
+      exit(1);
+    }
+
+    val2 = preDecrementDecodedValue(bits, bpos, siz1) + 2;
+    val2 &= uint64MASK(siz1);
+    if (val2 != val1) {
+      fprintf(stderr, "preDec failed: got " uint64FMT " expected " uint64FMT " siz=" uint32FMT "\n",
+              val2, val1, siz1);
+      exit(1);
+    }
+
+    val2 = postIncrementDecodedValue(bits, bpos, siz1) + 2;
+    val2 &= uint64MASK(siz1);
+    if (val2 != val1) {  // print the two values compared; this used to print val2+2 and val1-2
+      fprintf(stderr, "postInc failed: got " uint64FMT " expected " uint64FMT "\n", val2, val1);
+      exit(1);
+    }
+    val2 = getDecodedValue(bits, bpos, siz1) + 1;
+    val2 &= uint64MASK(siz1);
+    if (val2 != val1) {
+      fprintf(stderr, "postInc2 failed: got " uint64FMT " expected " uint64FMT " siz=" uint32FMT "\n",
+              val2, val1, siz1);
+      exit(1);
+    }
+
+    val2 = preIncrementDecodedValue(bits, bpos, siz1);
+    // Should be back to original value, so no mask
+    if (val2 != val1) {
+      fprintf(stderr, "preInc failed: got " uint64FMT " expected " uint64FMT "\n", val2, val1);
+      exit(1);
+    }
+
+    switch (j % 4) {
+      case 0:
+        val2 = postDecrementDecodedValue(bits, bpos, siz1);
+        break;
+      case 1:
+        val2 = preDecrementDecodedValue(bits, bpos, siz1);
+        break;
+      case 2:
+        val2 = postIncrementDecodedValue(bits, bpos, siz1);
+        break;
+      case 3:
+        val2 = preIncrementDecodedValue(bits, bpos, siz1);
+        break;
+    }
+
+    bpos += siz1;
+  }
+
+  // A second pass over bits[] was started here but never written: bpos was
+  // reset and a second generator created from mtseed, followed only by an
+  // empty, commented-out loop.  The unused reset and allocation are dropped.
+  // NOTE(review): the values left by the switch above are never verified;
+  // confirm whether a verification pass was intended.
+
+  delete [] bits;
+}
+
+
+
+
+
+
+// Fibonacci-encode random values, decode each at once, then replay the same random sequence and decode all again.
+void testFibonacciEncoding(void) {
+  time_t mtseed = time(0L);
+  mt_s *mtctx = 0L;
+
+  uint32 iterations = TEST_LENGTH / 4;
+
+  uint64 *bits = new uint64 [3 * iterations];
+  uint64 bpos = uint64ZERO;
+
+  uint32 failed = 0;
+  uint32 errors = 0;
+
+  fprintf(stderr, "Starting test of fibonacci encoding\n");
+
+  bpos = uint64ZERO;
+  mtctx = mtInit(mtseed);
+  failed = 0;
+
+  for (uint32 j=0; j < iterations; j++) {
+    uint64 siz1 = (mtRandom32(mtctx) % 63) + 1;
+    uint64 val1 = mtRandom64(mtctx) & uint64MASK(siz1);
+    uint64 siz2 = siz1;
+
+    setFibonacciEncodedNumber(bits, bpos, &siz1, val1);
+
+    uint64 val2 = getFibonacciEncodedNumber(bits, bpos, &siz2);
+
+    if ((val1 != val2) || (siz1 != siz2)) {
+      fprintf(stderr, "fibEnc #1 failed on " uint32FMT ": got " uint64FMT " expected " uint64FMT "\n", j, val2, val1);
+      failed++;
+    }
+    bpos += siz1;
+  }
+  if (failed) {
+    fprintf(stderr, "fibEnc #1 failed " uint32FMT " times.\n", failed);
+    errors++;
+  }
+
+  bpos = uint64ZERO;
+  mtctx = mtInit(mtseed);  // same seed again, so val1 below repeats the values encoded above
+  failed = 0;
+
+  for (uint32 j=0; j < iterations; j++) {
+    uint64 siz1 = (mtRandom32(mtctx) % 63) + 1;
+    uint64 val1 = mtRandom64(mtctx) & uint64MASK(siz1);
+    uint64 val2 = getFibonacciEncodedNumber(bits, bpos, &siz1);
+
+    if (val1 != val2) {
+      fprintf(stderr, "fibEnc #2 failed on " uint32FMT ": got " uint64FMT " expected " uint64FMT "\n", j, val2, val1);
+      failed++;
+    }
+    bpos += siz1;
+  }
+  if (failed) {
+    fprintf(stderr, "fibEnc #2 failed " uint32FMT " times.\n", failed);
+    errors++;
+  }
+
+  delete [] bits;
+
+  if (errors)
+    exit(1);
+}
+
+
+
+
+
+// Run the three bit-packing tests in order; reaching the end means none of them called exit(1).
+int main(int argc, char **argv) {
+  testBinaryEncoding();
+  testBinaryEncodingPrePost();
+  testFibonacciEncoding();
+  return(0);
+}
diff --git a/libutil/test/test-bzipBuffer.C b/libutil/test/test-bzipBuffer.C
new file mode 100644
index 0000000..f3e8836
--- /dev/null
+++ b/libutil/test/test-bzipBuffer.C
@@ -0,0 +1,110 @@
+#include <stdio.h>
+
+#include "util++.H"
+
+char *filename = 0L;  // path of the file under test; main() sets it to argv[argc-1]
+md5_s *correct = 0L;  // result of md5_string() over the whole file; doTest()/doTestRead() compare against it
+
+// Read B one character at a time with getnext() and compare the running md5 to 'correct'.  Returns 1 on mismatch, else 0.
+int doTest(bzipBuffer *B, const char *description) {
+  int error = 0;
+  md5_increment_s *testing = 0L;
+
+  while (!B->eof())
+    testing = md5_increment_char(testing, B->getnext());
+
+  md5_increment_finalize(testing);
+
+  if ((testing->a != correct->a) || (testing->b != correct->b)) {
+    fprintf(stderr, "bzipBuffer test %s failed.\n", description);
+    fprintf(stderr, "Got correct md5 of " uint64HEX " " uint64HEX "\n", correct->a, correct->b);
+    fprintf(stderr, "Got testing md5 of " uint64HEX " " uint64HEX "\n", testing->a, testing->b);
+    error = 1;
+  }
+
+  md5_increment_destroy(testing);
+
+  return(error);
+}
+
+// Read B in chunks of up to bufferSize with read() and compare the running md5 to 'correct'.  Returns 1 on mismatch, else 0.
+int doTestRead(bzipBuffer *B, size_t bufferSize, const char *description) {
+  int error = 0;
+  char *buffer = new char [bufferSize];
+  size_t bufferLen = 0;
+
+  md5_increment_s *testing = 0L;
+
+  while (!B->eof()) {
+    bufferLen = B->read(buffer, bufferSize);
+    testing = md5_increment_block(testing, buffer, bufferLen);
+  }
+
+  md5_increment_finalize(testing);
+
+  if ((testing->a != correct->a) || (testing->b != correct->b)) {
+    fprintf(stderr, "bzipBuffer test %s failed.\n", description);
+    fprintf(stderr, "Got correct md5 of " uint64HEX " " uint64HEX "\n", correct->a, correct->b);
+    fprintf(stderr, "Got testing md5 of " uint64HEX " " uint64HEX "\n", testing->a, testing->b);
+    error = 1;
+  }
+
+  md5_increment_destroy(testing);
+  delete [] buffer;  // the chunk buffer was never released
+  return(error);
+}
+
+
+// Checksum the file named by the last argument, then read it through bzipBuffer and compare checksums.
+int main(int argc, char **argv) {
+  int error = 0;
+  bzipBuffer *B = 0L;
+
+  // If we are given a file, use that, otherwise, use ourself.
+  //
+  filename = argv[argc-1];
+  size_t fileSize = sizeOfFile(filename);
+
+  // Suck in the whole file, compute the correct md5 checksum on it
+  //
+  char *c = new char [fileSize];
+  FILE *F = fopen(filename, "r");
+  if ((F == 0L) || (fread(c, sizeof(char), fileSize, F) != fileSize))
+    fprintf(stderr, "Failed to read '%s'.\n", filename), exit(1);
+  fclose(F);
+  correct = md5_string(0L, c, fileSize);
+  delete [] c;
+
+  // Test just reading, with a small buffer
+  //
+  B = new bzipBuffer(filename, 999);
+  error += doTest(B, "#1 (read)");
+
+  // NOTE(review): this exit is unconditional, so the program always reports failure and nothing below runs -- confirm intent.
+  exit(1);
+
+
+  // Test read() with a small buffer, reading large chunks
+  //
+  B = new bzipBuffer(filename, 100);
+  error += doTestRead(B, 10000, "#4 (read)");
+  delete B;
+
+
+  // Test read() with a small buffer, reading small chunks that are a
+  // factor of the buffersize.
+  //
+  B = new bzipBuffer(filename, 2000);
+  error += doTestRead(B, 1000, "#4 (read)");
+  delete B;
+
+
+  // Test read() with a large buffer, reading even larger pieces
+  //
+  B = new bzipBuffer(filename, sizeOfFile(filename));
+  error += doTestRead(B, sizeOfFile(filename) + 100000, "#5 (read)");
+  delete B;
+
+  return(error);
+}
+
diff --git a/libutil/test/test-freeDiskSpace.c b/libutil/test/test-freeDiskSpace.c
new file mode 100644
index 0000000..9e4b9b9
--- /dev/null
+++ b/libutil/test/test-freeDiskSpace.c
@@ -0,0 +1,16 @@
+#include "util.h"
+
+/* For each path on the command line, print the path and (int)freeDiskSpace(path) to stderr. */
+int main(int argc, char **argv) {
+  int arg = 1;
+  if (argc == 1) {
+    fprintf(stderr, "usage: %s file [...]\n", argv[0]);
+    exit(1);
+  }
+
+  while (arg < argc) {
+    fprintf(stderr, "%s: %d\n", argv[arg], (int)freeDiskSpace(argv[arg]));
+    arg++;
+  }
+  return(0);
+}
diff --git a/libutil/test/test-intervalList.C b/libutil/test/test-intervalList.C
new file mode 100644
index 0000000..381b225
--- /dev/null
+++ b/libutil/test/test-intervalList.C
@@ -0,0 +1,322 @@
+#include <stdio.h>
+
+#include "util++.H"
+
+mt_s *mt = 0L;  // random number state; created in main() and used by testIntersect() and testMerge()
+
+void
+test(void) {  // fixed sort()/merge() case; calls exit(e) with the number of failed checks when any fail
+ int e = 0;
+ intervalList<uint32> I;
+
+ I.add(71, 3);  // the checks below imply the arguments are (start, length): this one spans 71..74
+ I.add( 5, 3);
+ I.add(32, 5);
+ I.add(73, 3);
+ I.add(55, 10);
+ I.add( 5, 3);
+ I.add(10, 5);
+ I.add(20, 10);
+ I.add(30, 10);
+ I.add(50, 10);
+ I.add(70, 3);
+ I.add(72, 3);
+ I.add( 5, 3);
+ I.add(15, 5);
+
+#if 0
+ for (uint32 i=0; i<I.numberOfIntervals(); i++)
+ fprintf(stderr, "%2d] %2d %2d\n", i, I.lo(i), I.hi(i));
+#endif
+
+ I.sort();
+ I.merge();
+
+ if (I.sumOfLengths() != 54)  // 3 + 30 + 15 + 6, the lengths of the four intervals checked below
+ fprintf(stderr, "Lengths don't add up.\n"), e++;
+
+ if (I.numberOfIntervals() != 4)
+ fprintf(stderr, "Wrong number of intervals.\n"), e++;
+
+ if ((I.lo(0) != 5) || (I.hi(0) != 8))
+ fprintf(stderr, "Interval 0 is wrong.\n"), e++;
+ if ((I.lo(1) != 10) || (I.hi(1) != 40))
+ fprintf(stderr, "Interval 1 is wrong.\n"), e++;
+ if ((I.lo(2) != 50) || (I.hi(2) != 65))
+ fprintf(stderr, "Interval 2 is wrong.\n"), e++;
+ if ((I.lo(3) != 70) || (I.hi(3) != 76))
+ fprintf(stderr, "Interval 3 is wrong.\n"), e++;
+
+ if (e)
+ exit(e);
+}
+
+
+
+// Build interval lists A and B from numTests random intervals, intersect them, and
+// compare each result with the overlap that was constructed.  Prints pass/error counts.
+void testIntersect(uint32 type) {
+  uint32 numTests = 1000000;
+  uint32 *beg = new uint32 [numTests];
+  uint32 *len = new uint32 [numTests];
+  uint32 *end = new uint32 [numTests];
+  uint32 *abegh = new uint32 [numTests];
+  uint32 *aendh = new uint32 [numTests];
+  uint32 *bbegh = new uint32 [numTests];
+  uint32 *bendh = new uint32 [numTests];
+  uint32 errors = 0;
+  uint32 passed = 0;
+
+  intervalList<uint32> A;
+  intervalList<uint32> B;
+
+  // Build two interval lists
+  //
+  // type == 0 --> all pairwise
+  // type == 1 --> A sequence is solid
+  // type == 2 --> B sequence is solid
+
+  if (type == 1)
+    A.add(1, 1500000000);
+  if (type == 2)
+    B.add(1, 1500000000);
+
+  for (uint32 i=0; i<numTests; i++) {
+    // Compute the result we want to get
+    //
+    len[i] = mtRandom32(mt) % 200;
+    if (len[i] < 100) {
+      beg[i] = end[i] = mtRandom32(mt) % 100 + 100;
+    } else {
+      beg[i] = mtRandom32(mt) % 100 + 100;
+      end[i] = beg[i] + len[i];
+    }
+
+    // Reset if the type is 1 or 2.
+    //
+    if ((type == 1) || (type == 2)) {
+      len[i] = mtRandom32(mt) % 100 + 100;
+      beg[i] = mtRandom32(mt) % 100 + 100;
+      end[i] = beg[i] + len[i];
+    }
+
+    // Extend it to an interval -- we can extend exactly one end, or
+    // two opposite ends.
+    //
+    abegh[i] = 0;
+    aendh[i] = 0;
+    bbegh[i] = 0;
+    bendh[i] = 0;
+
+    if (type == 0) {
+      switch (mtRandom32(mt) % 8) {
+        case 0:
+          abegh[i] = mtRandom32(mt) % 50;
+          break;
+        case 1:
+          aendh[i] = mtRandom32(mt) % 50;
+          break;
+        case 2:
+          bbegh[i] = mtRandom32(mt) % 50;
+          break;
+        case 3:
+          bendh[i] = mtRandom32(mt) % 50;
+          break;
+        case 4:
+          abegh[i] = mtRandom32(mt) % 50;
+          aendh[i] = mtRandom32(mt) % 50;
+          break;
+        case 5:
+          bbegh[i] = mtRandom32(mt) % 50;
+          bendh[i] = mtRandom32(mt) % 50;
+          break;
+        case 6:
+          abegh[i] = mtRandom32(mt) % 50;
+          bendh[i] = mtRandom32(mt) % 50;
+          break;
+        case 7:
+          aendh[i] = mtRandom32(mt) % 50;
+          bbegh[i] = mtRandom32(mt) % 50;
+          break;
+      }
+    }
+
+    // Add it to the lists -- if type == 1 or 2, these should then
+    // get merged into the one big thing.
+    //
+    A.add(1000 * i + beg[i] - abegh[i], abegh[i] + end[i] - beg[i] + aendh[i]);
+    B.add(1000 * i + beg[i] - bbegh[i], bbegh[i] + end[i] - beg[i] + bendh[i]);
+  }
+
+  intervalList<uint32> I;
+  I.intersect(A, B);
+
+  // Check the result.  Tests with len < 100 expect no result, so only the
+  // tests that should intersect consume an interval of I.
+  for (uint32 i=0, j=0; i<numTests; i++) {
+    if (len[i] < 100)
+      continue;
+
+    if (j >= I.numberOfIntervals()) {  // never index past the end of I
+      fprintf(stderr, "FAILED[%4d]: the intersection has too few intervals.\n", i);
+      errors++;
+      break;
+    }
+
+    uint32 b = I.lo(j) - 1000 * i;
+    uint32 e = I.hi(j) - 1000 * i;
+
+    if ((b != beg[i]) || (e != end[i])) {
+      fprintf(stderr, "FAILED[%4d]: " uint32FMT "-" uint32FMT " X " uint32FMT "-" uint32FMT " -> " uint32FMT "," uint32FMT " (" uint32FMT "," uint32FMT ") (should have been " uint32FMT "," uint32FMT ")\n",
+              i,
+              beg[i] - abegh[i], beg[i] - abegh[i] + abegh[i] + end[i] - beg[i] + aendh[i],
+              beg[i] - bbegh[i], beg[i] - bbegh[i] + bbegh[i] + end[i] - beg[i] + bendh[i],
+              b, e, (uint32)I.lo(j), (uint32)I.hi(j),
+              beg[i], end[i]);
+      errors++;
+    } else {
+      passed++;
+    }
+    j++;
+  }
+
+  fprintf(stderr, "intersection test had " uint32FMT " successes and " uint32FMT " errors.\n", passed, errors);
+
+  delete [] beg;    delete [] len;    delete [] end;
+  delete [] abegh;  delete [] aendh;  delete [] bbegh;  delete [] bendh;
+}
+
+
+
+
+// Run merge() and depth() on several hand-built and random interval lists and print the results to stderr.
+void testMerge(void) {
+  intervalList<uint32,double> IL;
+  intervalList<uint32,double> ID;
+
+  // Test 1: one long sequence containing lots of little non-overlapping sequences
+  // Test 2: three long overlapping sequences, containing lots of non-overlapping sequences
+  // Test 3: dense random
+  // Test 4: special cases
+
+  fprintf(stderr, "Merge test 1\n");
+  IL.clear();
+  IL.add(0, 100000);
+  for (uint32 i=0; i<999; i++)
+    IL.add(100 + 100 * i, 50);
+  IL.merge();
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u\n", i, IL.lo(i), IL.hi(i));
+
+  IL.clear();
+  for (uint32 i=0; i<999; i++)
+    IL.add(100 + 100 * i, 50);
+  IL.add(0, 100000);
+  IL.merge();
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u\n", i, IL.lo(i), IL.hi(i));
+
+  fprintf(stderr, "Merge test 2\n");
+  IL.clear();
+  IL.add(0, 25000);
+  IL.add(25000, 25000);
+  IL.add(50000, 50000);
+  for (uint32 i=0; i<999; i++)
+    IL.add(100 + 100 * i, 50);
+  IL.merge();
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u\n", i, IL.lo(i), IL.hi(i));
+
+  fprintf(stderr, "Merge test 3\n");
+  IL.clear();
+  uint32 lo = 200;
+  uint32 hi = 0;
+  for (uint32 i=0; i<999; i++) {
+    uint32 beg = mtRandom32(mt) % 100;
+    uint32 end = mtRandom32(mt) % 100 + 100;
+    if (beg < lo) lo = beg;
+    if (end > hi) hi = end;
+    IL.add(beg, end - beg);
+  }
+  IL.merge();
+  if ((IL.lo(0) != lo) || (IL.hi(0) != hi))
+    fprintf(stderr, "ERROR!\n");
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u\n", i, IL.lo(i), IL.hi(i));
+
+  fprintf(stderr, "Merge test 4a\n");
+  IL.clear();
+  IL.add(0, 25000);
+  IL.add(25000, 25000);
+  IL.add(50000, 50000);
+  IL.merge();
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u\n", i, IL.lo(i), IL.hi(i));
+
+  fprintf(stderr, "Merge test 4b\n");
+  IL.clear();
+  IL.add( 0, 25000, 1);
+  IL.add(25000, 25000, 2);
+  IL.add(50000, 50000, 4);
+  IL.add(20000, 5000, 8);
+  IL.add(45000, 5000, 16);
+  IL.add(95000, 5000, 32);
+  ID.depth(IL);
+  IL.merge();
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u count %6u value %f\n", i, IL.lo(i), IL.hi(i), IL.count(i), IL.value(i));
+  for (uint32 i=0; i<ID.numberOfIntervals(); i++)
+    fprintf(stderr, "ID[" uint32FMTW(3) "] %6u %6u depth %6u value %f\n", i, ID.lo(i), ID.hi(i), ID.count(i), ID.value(i));
+
+  fprintf(stderr, "Merge test 5\n");
+  IL.clear();
+  ID.clear();
+  IL.add( 0, 25000, 1);
+  IL.add(25000, 25000, 2);
+  IL.add(50000, 50000, 4);
+  IL.add(20000, 20000, 8);
+  IL.add(30000, 40000, 16);
+  IL.add(50000, 10000, 32);
+  //IL.add(10000, 90000, 32);
+  ID.depth(IL);
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u count %6u value %f\n", i, IL.lo(i), IL.hi(i), IL.count(i), IL.value(i));
+  for (uint32 i=0; i<ID.numberOfIntervals(); i++)
+    fprintf(stderr, "ID[" uint32FMTW(3) "] %6u %6u depth %6u value %f\n", i, ID.lo(i), ID.hi(i), ID.count(i), ID.value(i));
+
+  fprintf(stderr, "Merge test 6 (same as 5, but val = default)\n");
+  IL.clear();
+  ID.clear();
+  IL.add( 0, 25000);
+  IL.add(25000, 25000);
+  IL.add(50000, 50000);
+  IL.add(20000, 20000);
+  IL.add(30000, 40000);
+  IL.add(50000, 10000);
+  //IL.add(10000, 90000);
+  ID.depth(IL);
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++)
+    fprintf(stderr, "IL[" uint32FMTW(3) "] %6u %6u count %6u value %f\n", i, IL.lo(i), IL.hi(i), IL.count(i), IL.value(i));
+  for (uint32 i=0; i<ID.numberOfIntervals(); i++)
+    fprintf(stderr, "ID[" uint32FMTW(3) "] %6u %6u depth %6u value %f\n", i, ID.lo(i), ID.hi(i), ID.count(i), ID.value(i));
+
+
+}
+
+
+
+int
+main(int argc, char **argv) {
+
+ mt = mtInit(time(NULL));  // generator shared by testIntersect() and testMerge()
+
+ test();  // fixed sort()/merge() case; calls exit() itself when a check fails
+
+ testIntersect(0);  // all pairwise
+ testIntersect(1);  // A sequence is solid
+ testIntersect(2);  // B sequence is solid
+
+ testMerge();  // prints its results; only "Merge test 3" checks anything, and it just prints ERROR!
+
+ exit(0);
+}
diff --git a/libutil/test/test-logMsg.C b/libutil/test/test-logMsg.C
new file mode 100644
index 0000000..a20697c
--- /dev/null
+++ b/libutil/test/test-logMsg.C
@@ -0,0 +1,27 @@
+#include "util++.H"
+
+int
+main(int argc, char **argv) {
+ logMsg M;
+
+ M.add("this is a simple test\n");
+ M.add("%s %s %s %s %s\n", "1", "2", "3", "4", "5");
+
+ M.add("%s%s%s%s%s",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n",
+ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\n",
+ "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc\n",
+ "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd\n",
+ "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee\n");
+
+ for (int a=0; a<1024; a++) {
+ M.add("%s%s%s%s%s",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
+ "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb [...]
+ "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc [...]
+ "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd [...]
+ "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee [...]
+ }
+
+ M.fwrite(stdout);
+}
diff --git a/libutil/test/test-md5.c b/libutil/test/test-md5.c
new file mode 100644
index 0000000..5713b08
--- /dev/null
+++ b/libutil/test/test-md5.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util.h"
+
+// Performs the md5 test suite using libbri. MD5 itself is tested in
+// external/md5.
+//
+// Appendix A.5 of RFC 1321:
+//
+// MD5 test suite:
+// MD5 ("") = d41d8cd98f00b204e9800998ecf8427e
+// MD5 ("a") = 0cc175b9c0f1b6a831c399e269772661
+// MD5 ("abc") = 900150983cd24fb0d6963f7d28e17f72
+// MD5 ("message digest") = f96b697d7cb7938d525a2f31aaf161d0
+// MD5 ("abcdefghijklmnopqrstuvwxyz") = c3fcd3d76192e4007dfb496cca67e13b
+// MD5 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") = d174ab98d277d9f5a5611c2c9f419d9f
+// MD5 ("12345678901234567890123456789012345678901234567890123456789012345678901234567890") = 57edf4a22be3c955ac49da2e2107b67a
+//
+
+// Computes the md5 of 'str', renders it as text, and compares it to
+// the expected digest 'ans'.  Returns 1 when they match; otherwise
+// prints a diagnostic to stdout and returns 0.
+//
+int
+testit(char *str, char *ans) {
+  md5_s  digest;
+  char   hex[33];
+
+  md5_toascii(md5_string(&digest, str, strlen(str)), hex);
+
+  if (strcmp(hex, ans) == 0)
+    return(1);
+
+  printf("ERROR: expect %s, got %s for %s\n", ans, hex, str);
+  return(0);
+}
+
+// Runs the seven RFC 1321 test-suite strings through testit().  The
+// exit status is the number of strings whose digest did not match,
+// so 0 means success.
+//
+int
+main(int argc, char **argv) {
+  char *suite[7][2] = {
+    { "",                                                                                 "d41d8cd98f00b204e9800998ecf8427e" },
+    { "a",                                                                                "0cc175b9c0f1b6a831c399e269772661" },
+    { "abc",                                                                              "900150983cd24fb0d6963f7d28e17f72" },
+    { "message digest",                                                                   "f96b697d7cb7938d525a2f31aaf161d0" },
+    { "abcdefghijklmnopqrstuvwxyz",                                                       "c3fcd3d76192e4007dfb496cca67e13b" },
+    { "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",                   "d174ab98d277d9f5a5611c2c9f419d9f" },
+    { "12345678901234567890123456789012345678901234567890123456789012345678901234567890", "57edf4a22be3c955ac49da2e2107b67a" }
+  };
+  int   failures = 7;
+  int   t;
+
+  // testit() returns 1 on a match, so each pass removes one failure.
+  for (t=0; t<7; t++)
+    failures -= testit(suite[t][0], suite[t][1]);
+
+  exit(failures);
+}
diff --git a/libutil/test/test-mmap.c b/libutil/test/test-mmap.c
new file mode 100644
index 0000000..2642618
--- /dev/null
+++ b/libutil/test/test-mmap.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "util.h"
+
+// Does a quick test of memory mapped files. First, writes a small
+// file, then it reads it back, checking the data.
+//
+// Takes one optional argument, the size in MB of the file to map.
+
+// Creates 'mmap.test.junk' holding numBlocks blocks of blockSize
+// uint32 words, maps it for writing and stores each word's index in
+// it, maps it again for reading and checks every word, then removes
+// the file.
+//
+// argv[1], if present, is the number of blocks (default 32).
+// Returns 0 if every word verified, 1 if any did not; exits with 1
+// if the file can't be created or filled.
+//
+int
+main(int argc, char **argv) {
+  uint64  lw        = 0;      //  mapFile() and unmapFile() take a uint64 length
+  uint32 *ww        = NULL;
+  uint32  idx       = 0;
+  uint32  err       = 0;
+  FILE   *out       = NULL;
+  uint32  blockSize = 1048576;
+  uint32  numBlocks = 32;
+  uint32  numWords  = 0;
+
+  if (argc == 2)
+    numBlocks = strtouint32(argv[1], 0L);
+
+  // The word index is a uint32; refuse sizes whose word count would wrap.
+  //
+  if (numBlocks > (uint32)(uint32MAX / blockSize)) {
+    fprintf(stderr, "can't test "uint32FMT" blocks; at most "uint32FMT" are supported.\n",
+            numBlocks, (uint32)(uint32MAX / blockSize));
+    exit(1);
+  }
+  numWords = numBlocks * blockSize;
+
+  // The file must exist, and it must be large enough to contain all
+  // that we want to write.  So, we create the file and fill it with
+  // zeros.
+  //
+  ww = (uint32 *)malloc(sizeof(uint32) * blockSize);
+  if (ww == NULL) {
+    fprintf(stderr, "can't allocate "uint32FMT" uint32's for clearing the file.\n", blockSize);
+    exit(1);
+  }
+  memset(ww, 0, sizeof(uint32) * blockSize);
+
+  // fopen() reports failure through its return value; errno only
+  // says why.
+  //
+  errno = 0;
+  out = fopen("mmap.test.junk", "w");
+  if (out == NULL) {
+    fprintf(stderr, "can't open 'mmap.test.junk' to fill with junk: %s\n", strerror(errno));
+    exit(1);
+  }
+  for (idx=0; idx<numBlocks; idx++) {
+    fprintf(stderr, "Writing initial blocks: "uint32FMT"/"uint32FMT"\r", idx, numBlocks), fflush(stderr);
+    if (fwrite(ww, sizeof(uint32), blockSize, out) != blockSize) {
+      fprintf(stderr, "can't write to 'mmap.test.junk': %s\n", strerror(errno));
+      exit(1);
+    }
+  }
+  if (fclose(out) != 0) {
+    fprintf(stderr, "can't write to 'mmap.test.junk': %s\n", strerror(errno));
+    exit(1);
+  }
+  free(ww);
+  fprintf(stderr, "\n");
+
+  // Now, map it, and fill it with real data.
+  //
+  ww = (uint32 *)mapFile("mmap.test.junk", &lw, 'w');
+  for (idx=0; idx<numWords; idx++) {
+    if ((idx & 0xfff) == 0)
+      fprintf(stderr, "Writing: "uint32FMT"/"uint32FMT"\r", idx, numWords), fflush(stderr);
+    ww[idx] = idx;
+  }
+  unmapFile(ww, lw);
+  fprintf(stderr, "\n");
+
+  // Map again, and check the data.
+  //
+  ww = (uint32 *)mapFile("mmap.test.junk", &lw, 'r');
+  for (idx=0; idx<numWords; idx++) {
+    if ((idx & 0xfff) == 0)
+      fprintf(stderr, "Verifying: "uint32FMT"/"uint32FMT"\r", idx, numWords), fflush(stderr);
+    if (ww[idx] != idx)
+      err++;
+  }
+  unmapFile(ww, lw);
+  fprintf(stderr, "\n");
+
+  unlink("mmap.test.junk");
+
+  return (err != 0);
+}
diff --git a/libutil/test/test-palloc.c b/libutil/test/test-palloc.c
new file mode 100644
index 0000000..9e9e792
--- /dev/null
+++ b/libutil/test/test-palloc.c
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+// Exercises palloc() in three rounds: a traced round with a 1024
+// byte block size, a traced round after asking for 10240 bytes, and
+// an untraced round of 512 * 1024 random-sized requests.  Each round
+// ends with pfree().  Always returns 0; nothing is checked
+// automatically, the debug output is meant to be read.
+//
+// NOTE(review): util.h declares pdumppalloc(void *handle), but it is
+// called here with no argument -- confirm which is intended.
+// NOTE(review): time() is used but <time.h> is not among this file's
+// includes, nor among those of util.h -- confirm it is declared.
+//
+int
+main(int argc, char **argv) {
+ mt_s *mtctx;
+ int i;
+
+ // Round one: requests both smaller and larger than the block size.
+ psetdebug(2);
+ psetblocksize(1024);
+
+ palloc(2048);
+ palloc(128);
+ palloc(999);
+ palloc(1);
+ palloc(2);
+ palloc(3);
+ palloc(4);
+ palloc(2056);
+ palloc(8);
+ palloc(2064);
+ palloc(8);
+ palloc(2072);
+ palloc(8);
+
+ pdumppalloc();
+
+ pfree();
+
+ fprintf(stderr, "----------------------------------------\n");
+
+ // Round two: a similar request sequence, all smaller than the new
+ // block size.  NOTE(review): util.h says psetblocksize() has no
+ // effect after the first palloc(); presumably pfree() resets that --
+ // confirm.
+ psetblocksize(10240);
+
+ palloc(2048);
+ palloc(128);
+ palloc(999);
+ palloc(8);
+ palloc(8);
+ palloc(8);
+ palloc(8);
+ palloc(2056);
+ palloc(8);
+ palloc(2064);
+ palloc(8);
+ palloc(2072);
+ palloc(8);
+
+ pdumppalloc();
+
+ pfree();
+
+ // Round three: many random requests of 0 to 4095 bytes with
+ // reporting off; reporting is switched back on just for the pfree().
+ psetdebug(0);
+ psetblocksize(16 * 1024 * 1024);
+
+ mtctx = mtInit(time(NULL));
+ for (i=0; i<512 * 1024; i++)
+ palloc(mtRandom32(mtctx) & 0xfff);
+ psetdebug(1);
+ pfree();
+
+ return(0);
+}
+
diff --git a/libutil/test/test-readBuffer.C b/libutil/test/test-readBuffer.C
new file mode 100644
index 0000000..a3c5ec7
--- /dev/null
+++ b/libutil/test/test-readBuffer.C
@@ -0,0 +1,135 @@
+#include <stdio.h>
+
+#include "util++.H"
+
+char *filename = 0L;
+md5_s *full = 0L;
+md5_s *part = 0L;
+
+
+// Reads B one character at a time until eof, computing an md5 over
+// everything read, and compares the result to 'correct'.  Returns 0
+// if the checksums agree, 1 (after printing both) if they do not.
+//
+int
+doTest(readBuffer *B, md5_s *correct, const char *description) {
+  md5_increment_s *sum    = 0L;
+  int              nBytes = 0;
+
+  fprintf(stderr, "readBuffer test %s.\n", description);
+
+  // The character returned by the read() that hits eof is not
+  // added to the checksum.
+  char ch = B->read();
+  while (!B->eof()) {
+    sum = md5_increment_char(sum, ch);
+    nBytes++;
+    ch = B->read();
+  }
+
+  md5_increment_finalize(sum);
+
+  int matches = (sum->a == correct->a) && (sum->b == correct->b);
+
+  if (!matches) {
+    fprintf(stderr, "readBuffer test %s failed (read %d bytes).\n", description, nBytes);
+    fprintf(stderr, "Got correct md5 of "uint64HEX" "uint64HEX"\n", correct->a, correct->b);
+    fprintf(stderr, "Got testing md5 of "uint64HEX" "uint64HEX"\n", sum->a, sum->b);
+  }
+
+  md5_increment_destroy(sum);
+
+  return(matches ? 0 : 1);
+}
+
+
+// Reads B in chunks of up to bufferSize bytes until eof, computing
+// an md5 over everything read, and compares the result to 'correct'.
+// Returns 0 if the checksums agree, 1 (after printing both) if they
+// do not.
+//
+int
+doTestRead(readBuffer *B, md5_s *correct, size_t bufferSize, const char *description) {
+  int     error     = 0;
+  char   *buffer    = new char [bufferSize];
+  size_t  bufferLen = 0;
+
+  md5_increment_s *testing = 0L;
+
+  fprintf(stderr, "readBuffer test %s.\n", description);
+
+  while (!B->eof()) {
+    bufferLen = B->read(buffer, bufferSize);
+    testing   = md5_increment_block(testing, buffer, bufferLen);
+  }
+
+  md5_increment_finalize(testing);
+
+  if ((testing->a != correct->a) || (testing->b != correct->b)) {
+    fprintf(stderr, "readBuffer test %s failed.\n", description);
+    fprintf(stderr, "Got correct md5 of "uint64HEX" "uint64HEX"\n", correct->a, correct->b);
+    fprintf(stderr, "Got testing md5 of "uint64HEX" "uint64HEX"\n", testing->a, testing->b);
+    error = 1;
+  }
+
+  md5_increment_destroy(testing);
+
+  // The chunk buffer is ours; it was previously leaked on every call.
+  delete [] buffer;
+
+  return(error);
+}
+
+
+// Checksums a file (the last command line argument; with no
+// arguments that is argv[0], the test program itself) directly, then
+// re-reads it through readBuffer objects of various buffer sizes and
+// checks that each gives the same md5.  Returns the number of failed
+// tests.
+//
+int
+main(int argc, char **argv) {
+  int         error = 0;
+  readBuffer *B     = 0L;
+
+  filename = argv[argc-1];
+
+  // L is the whole file, H the offset of the second half, R the
+  // length of that second half.
+  //
+  size_t L = sizeOfFile(filename);
+  size_t H = L/2;
+  size_t R = L - H;
+
+  fprintf(stderr, "L=%zu H=%zu R=%zu\n", L, H, R);
+
+  // Suck in the whole file, compute the correct md5 checksum on it
+  //
+  char *c = new char [L];
+
+  FILE *F = fopen(filename, "r");
+  if (F == NULL) {
+    fprintf(stderr, "can't open '%s' for reading.\n", filename);
+    return(1);
+  }
+  if (fread(c, sizeof(char), L, F) != L) {
+    fprintf(stderr, "short read on '%s'.\n", filename);
+    return(1);
+  }
+  fclose(F);
+
+  full = md5_string(0L, c,   L);
+  part = md5_string(0L, c+H, R);
+
+  delete [] c;
+
+
+  B = new readBuffer(filename, 999);
+  error += doTest(B, full, "#1 (read)");
+  B->seek(0);
+  error += doTest(B, full, "#2 (seek)");
+  B->seek(H);
+  error += doTest(B, part, "#2 (seek half)");
+  delete B;
+
+  B = new readBuffer(filename, 0);
+  error += doTest(B, full, "#3 (mmap)");
+  B->seek(0);
+  error += doTest(B, full, "#2 (mmap seek)");
+  B->seek(H);
+  error += doTest(B, part, "#2 (mmap seek half)");
+  delete B;
+
+  B = new readBuffer(filename, 0);
+  error += doTestRead(B, full, 10000, "#4 (read buffer=mmap readsize=10000)");
+  delete B;
+
+  B = new readBuffer(filename, 100);
+  error += doTestRead(B, full, 10000, "#4 (read buffer=100 readsize=10000)");
+  delete B;
+
+  B = new readBuffer(filename, 2000);
+  error += doTestRead(B, full, 1000, "#4 (read buffer=2000 readsize=1000)");
+  delete B;
+
+  B = new readBuffer(filename, L);
+  error += doTestRead(B, full, L+1000, "#5 (read buffer=filesize readsize=filesize+1000)");
+  delete B;
+
+  return(error);
+}
+
diff --git a/libutil/test/test-recordFile.C b/libutil/test/test-recordFile.C
new file mode 100644
index 0000000..d425c2b
--- /dev/null
+++ b/libutil/test/test-recordFile.C
@@ -0,0 +1,64 @@
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+
+#include "util++.H"
+
+// Header stored in the test recordFile: a text field bracketed by
+// two 64-bit marker values.
+struct header_s {
+ uint64 t1;
+ char s1[570];
+ uint64 t2;
+};
+
+// One record of the test recordFile: a number and a text field.
+struct record_s {
+ uint64 t1;
+ char s1[123];
+};
+
+// Writes a header and five records to the recordFile "test", then
+// reopens it for reading and prints the header and records to
+// stderr.  Nothing is verified automatically; always returns 0.
+//
+int
+main(int argc, char **argv) {
+ header_s h;
+ record_s r;
+
+ // Text fields are filled with 0x66 and then overwritten with a
+ // NUL-terminated string, so the bytes past the string are known.
+ h.t1 = 0x0123456789abcdefllu;
+ memset(h.s1, 0x66, 570);
+ strcpy(h.s1, "this is the header");
+ h.t2 = 0xdeadbeefdeadbeefllu;
+
+ recordFile *RF = new recordFile("test", sizeof(header_s), sizeof(record_s), 'w');
+
+ memcpy(RF->header(), &h, sizeof(header_s));
+
+ r.t1 = 1; memset(r.s1, 0x66, 123); strcpy(r.s1, "record1");
+ RF->putRecord(&r);
+
+ r.t1 = 2; memset(r.s1, 0x66, 123); strcpy(r.s1, "record2");
+ RF->putRecord(&r);
+
+ r.t1 = 3; memset(r.s1, 0x66, 123); strcpy(r.s1, "record3");
+ RF->putRecord(&r);
+
+ r.t1 = 4; memset(r.s1, 0x66, 123); strcpy(r.s1, "record4");
+ RF->putRecord(&r);
+
+ r.t1 = 5; memset(r.s1, 0x66, 123); strcpy(r.s1, "record5");
+ RF->putRecord(&r);
+
+ delete RF;
+
+ RF = new recordFile("test", sizeof(header_s), sizeof(record_s), 'r');
+
+ header_s *hh = (header_s *)RF->header();
+
+ // Six reads follow although only five records were written, and
+ // the result of getRecord() is ignored, so the last line prints
+ // whatever r holds after the sixth call.
+ // NOTE(review): presumably a deliberate read-past-the-end check -- confirm.
+ fprintf(stderr, "header t1 "uint64HEX" '%s' t2 "uint64HEX"\n", hh->t1, hh->s1, hh->t2);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+ RF->getRecord(&r); fprintf(stderr, "record "uint64FMT" '%s'\n", r.t1, r.s1);
+
+ delete RF;
+
+ return(0);
+}
diff --git a/libutil/test/test-types.c b/libutil/test/test-types.c
new file mode 100644
index 0000000..15c3df0
--- /dev/null
+++ b/libutil/test/test-types.c
@@ -0,0 +1,34 @@
+#include <stdio.h>
+
+#include "util.h"
+
+// Sanity checks the fixed-width integer typedefs: uint32 and uint64
+// must be 4 and 8 bytes, the uint types must be unsigned and the int
+// types signed.  Each variable is set to -1, which is negative only
+// in a signed type.  Returns the number of checks that failed.
+//
+int
+main(void) {
+  uint32  errors = 0;
+  uint32  neg32u = -1;
+  int32   neg32s = -1;
+  uint64  neg64u = -1;
+  int64   neg64s = -1;
+
+  if (sizeof(uint32) != 4) {
+    fprintf(stderr, "uint32 has %d bytes (should be 4)!\n", (int)sizeof(uint32));
+    errors++;
+  }
+
+  if (sizeof(uint64) != 8) {
+    fprintf(stderr, "uint64 has %d bytes (should be 8)!\n", (int)sizeof(uint64));
+    errors++;
+  }
+
+  if (neg32u < 0) {
+    fprintf(stderr, "uint32 is signed (should be unsigned)!\n");
+    errors++;
+  }
+
+  if (neg32s > 0) {
+    fprintf(stderr, "int32 is unsigned (should be signed)!\n");
+    errors++;
+  }
+
+  if (neg64u < 0) {
+    fprintf(stderr, "uint64 is signed (should be unsigned)!\n");
+    errors++;
+  }
+
+  if (neg64s > 0) {
+    fprintf(stderr, "int64 is unsigned (should be signed)!\n");
+    errors++;
+  }
+
+  return(errors);
+}
+
+
diff --git a/libutil/uint32List.H b/libutil/uint32List.H
new file mode 100644
index 0000000..610a004
--- /dev/null
+++ b/libutil/uint32List.H
@@ -0,0 +1,62 @@
+#ifndef UINT32LIST_H
+#define UINT32LIST_H
+
+#include <string.h>
+
+// A very simple integer list. Hopefully lighter weight than a
+// vector.
+
+// It might be useful to extend this to have 'undef' values,
+// and to allow shift(), pop().
+
+// A growable array of uint32.  Elements are addressed by index;
+// touching an index past the end (via operator[]) extends the list
+// up to and including that index, with the new elements set to zero.
+//
+// The list owns its storage through a raw pointer and defines no
+// copy constructor or assignment operator, so it must not be copied.
+//
+class uint32List {
+public:
+  //  'max' is the initial capacity; a request for zero is treated as
+  //  one so that doubling can always grow the list.
+  uint32List(uint32 max=16) {
+    _len = 0;
+    _max = (max > 0) ? max : 1;
+    _lst = new uint32 [_max];
+  };
+  ~uint32List() {
+    delete [] _lst;
+  };
+
+private:
+  //  Makes index 'idx' valid: grows the storage (by doubling, as many
+  //  times as needed) until it holds idx, and extends the length to
+  //  idx+1, zeroing any elements that become visible.
+  void resize(uint32 idx) {
+    if (idx >= _max) {
+      uint32  newMax = _max;
+
+      //  Doubling can't overflow while idx is below 2^31; beyond that
+      //  just ask for the largest representable capacity.
+      if (idx < 0x80000000u)
+        while (newMax <= idx)
+          newMax *= 2;
+      else
+        newMax = 0xffffffffu;
+
+      uint32 *L = new uint32 [newMax];
+      memcpy(L, _lst, sizeof(uint32) * _len);
+      delete [] _lst;
+      _lst = L;
+      _max = newMax;
+    }
+    if (idx >= _len) {
+      memset(_lst + _len, 0, sizeof(uint32) * (idx + 1 - _len));
+      _len = idx + 1;
+    }
+  }
+
+public:
+  //  Returns a reference to element idx, extending the list if idx
+  //  is past the current end.
+  uint32 &operator[](uint32 idx) {
+    resize(idx);
+    return(_lst[idx]);
+  }
+
+  //  Appends val.  resize() already advances the length to cover the
+  //  new slot, so the value is stored at the old length.
+  void    push(uint32 val) {
+    uint32  at = _len;
+    resize(at);
+    _lst[at] = val;
+  }
+
+  //  Number of elements in the list.
+  uint32  length(void) {
+    return(_len);
+  };
+
+  //  Forgets all elements; the storage is kept for reuse.
+  void    clear(void) {
+    _len = 0;
+  }
+
+private:
+  uint32   _len;   //  elements in use
+  uint32   _max;   //  elements allocated
+  uint32  *_lst;   //  storage, owned by this object
+};
+
+
+#endif // UINT32LIST_H
diff --git a/libutil/unaryEncoding.h b/libutil/unaryEncoding.h
new file mode 100644
index 0000000..f856511
--- /dev/null
+++ b/libutil/unaryEncoding.h
@@ -0,0 +1,76 @@
+#ifndef UNARY_ENCODING_H
+#define UNARY_ENCODING_H
+
+#include "bitPacking.h"
+
+
+// Routines to store and retrieve a unary encoded number to/from a
+// bit packed word array based at 'ptr' and currently at location
+// 'pos'. Both routines return the size of the encoded number in
+// 'siz'.
+
+
+
+// The usual unary encoding. Store the number n as n 0 bits followed
+// by a single 1 bit.
+//
+// 0 -> 1
+// 1 -> 01
+// 2 -> 001
+// 3 -> 0001
+// 4 -> 00001
+//
+// See the decoder as to why we use 0 instead of 1 for the count.
+
+
+// Writes val in unary -- val 0 bits followed by a single 1 bit --
+// into the bit packed array 'ptr' starting at bit 'pos'.  The number
+// of bits written, val + 1, is returned in *siz.
+//
+inline
+void
+setUnaryEncodedNumber(uint64 *ptr,
+                      uint64  pos,
+                      uint64 *siz,
+                      uint64  val) {
+
+  // The encoded size is known up front.  (The original also did
+  // 'siz += 64' in the loop below, which moved the pointer rather
+  // than changing the size; it had no effect and has been removed.)
+  //
+  *siz = val + 1;
+
+  // Emit the zeros 64 at a time.
+  //
+  while (val >= 64) {
+    setDecodedValue(ptr, pos, 64, uint64ZERO);
+    pos += 64;
+    val -= 64;
+  }
+
+  // The remaining zeros and the terminating 1 fit in one write of
+  // val + 1 bits holding the value 1.
+  //
+  setDecodedValue(ptr, pos, val + 1, uint64ONE);
+}
+
+
+
+// Reads a unary encoded number from the bit packed array 'ptr'
+// starting at bit 'pos'.  Returns the decoded value, and its encoded
+// size in bits (value + 1) in *siz.
+//
+// The scan for the terminating 1 bit is unbounded: it keeps reading
+// 64 bits at a time until it finds a non-zero chunk.
+//
+inline
+uint64
+getUnaryEncodedNumber(uint64 *ptr,
+ uint64 pos,
+ uint64 *siz) {
+ uint64 val = uint64ZERO;
+ uint64 enc = uint64ZERO;
+
+ // How many whole words are zero?
+ //
+ enc = getDecodedValue(ptr, pos, 64);
+ while (enc == uint64ZERO) {
+ val += 64;
+ pos += 64;
+ enc = getDecodedValue(ptr, pos, 64);
+ }
+
+ // This word isn't zero. Count how many bits are zero (see, the
+ // choice of 0 or 1 for the encoding wasn't arbitrary!)
+ //
+ // NOTE(review): presumably logBaseTwo64() gives the 1-based position
+ // of the highest set bit, making 64 minus it the number of leading
+ // zeros -- confirm against bitOperations.h.
+ //
+ val += 64 - logBaseTwo64(enc);
+
+ *siz = val + 1;
+
+ return(val);
+}
+
+
+#endif // UNARY_ENCODING_H
diff --git a/libutil/unaryEncodingTester.C b/libutil/unaryEncodingTester.C
new file mode 100644
index 0000000..593a413
--- /dev/null
+++ b/libutil/unaryEncodingTester.C
@@ -0,0 +1,199 @@
+#include "util++.H"
+
+
+uint64 numLoops = 1;
+uint64 numNums = 4000000;
+uint64 numSize = 300;
+
+// The space in bits that we can play with, and the pointer to said space.
+//
+uint64 spa = 128 * 1024 * 1024 * 8;
+uint64 *ptr = 0L;
+uint64 *rnd = 0L;
+
+// Round trip test of the unary encoding: packs rnd[0..numNums) into
+// ptr back to back, then decodes them in order and checks each value.
+// Exits with status 1 on running out of space or on a mismatch.
+//
+void
+testUnary(void) {
+  uint64  bitPos = uint64ZERO;
+  uint64  bitLen = uint64ZERO;
+
+  // Encode.  A margin of 1000 bits is kept free at the end of the
+  // space.
+  //
+  for (uint64 n=0; n<numNums; n++) {
+    setUnaryEncodedNumber(ptr, bitPos, &bitLen, rnd[n]);
+    bitPos += bitLen;
+
+    if (bitPos + 1000 < spa)
+      continue;
+
+    fprintf(stderr, "ERROR: Ran out of space in testUnary at number "uint64FMT" out of "uint64FMT"\n", n, numNums);
+    exit(1);
+  }
+
+  // Decode from the start and compare.
+  //
+  bitPos = uint64ZERO;
+
+  for (uint64 n=0; n<numNums; n++) {
+    uint64  decoded = getUnaryEncodedNumber(ptr, bitPos, &bitLen);
+
+    if (decoded != rnd[n]) {
+      fprintf(stderr, "Number "uint64FMT" at bitpos "uint64FMT" failed.  Desired "uint64FMT" got "uint64FMT"\n", n, bitPos, rnd[n], decoded);
+      exit(1);
+    }
+
+    bitPos += bitLen;
+  }
+
+  fprintf(stderr, "unary encoded numbers OK!\n");
+}
+
+
+
+// Round trip test of the generalized unary encoding: packs
+// rnd[0..numNums) into ptr back to back, then decodes them in order
+// and checks each value.  Exits with status 1 on running out of
+// space or on a mismatch.
+//
+void
+testGeneralizedUnary(void) {
+  uint64  bitPos = uint64ZERO;
+  uint64  bitLen = uint64ZERO;
+
+  // Encode.  A margin of 1000 bits is kept free at the end of the
+  // space.
+  //
+  for (uint64 n=0; n<numNums; n++) {
+    setGeneralizedUnaryEncodedNumber(ptr, bitPos, &bitLen, rnd[n]);
+    bitPos += bitLen;
+
+    if (bitPos + 1000 < spa)
+      continue;
+
+    fprintf(stderr, "ERROR: Ran out of space in testGeneralizedUnary at number "uint64FMT" out of "uint64FMT"\n", n, numNums);
+    exit(1);
+  }
+
+  // Decode from the start and compare.
+  //
+  bitPos = uint64ZERO;
+
+  for (uint64 n=0; n<numNums; n++) {
+    uint64  decoded = getGeneralizedUnaryEncodedNumber(ptr, bitPos, &bitLen);
+
+    if (decoded != rnd[n]) {
+      fprintf(stderr, "Number "uint64FMT" at bitpos "uint64FMT" failed.  Desired "uint64FMT" got "uint64FMT"\n", n, bitPos, rnd[n], decoded);
+      exit(1);
+    }
+
+    bitPos += bitLen;
+  }
+
+  fprintf(stderr, "generalized unary encoded numbers OK!\n");
+}
+
+
+
+
+// Round trip test of the Elias gamma encoding: packs
+// rnd[0..numNums) into ptr back to back, then decodes them in order
+// and checks each value.  Exits with status 1 on running out of
+// space or on a mismatch.
+//
+void
+testEliasGamma(void) {
+  uint64 pos = uint64ZERO;
+  uint64 siz = uint64ZERO;
+  uint64 val = uint64ZERO;
+  uint64 i   = uint64ZERO;
+
+  // Encode.  A margin of 1000 bits is kept free at the end of the
+  // space.  (The message used to name testGeneralizedUnary.)
+  //
+  for (i=0; i<numNums; i++) {
+    setEliasGammaEncodedNumber(ptr, pos, &siz, rnd[i]);
+    pos += siz;
+    if (pos + 1000 >= spa) {
+      fprintf(stderr, "ERROR: Ran out of space in testEliasGamma at number "uint64FMT" out of "uint64FMT"\n", i, numNums);
+      exit(1);
+    }
+  }
+
+  // Decode from the start and compare.
+  //
+  pos = uint64ZERO;
+
+  for (i=0; i<numNums; i++) {
+    val = getEliasGammaEncodedNumber(ptr, pos, &siz);
+    if (val != rnd[i]) {
+      fprintf(stderr, "Number "uint64FMT" at bitpos "uint64FMT" failed.  Desired "uint64FMT" got "uint64FMT"\n", i, pos, rnd[i], val);
+      exit(1);
+    }
+    pos += siz;
+  }
+
+  fprintf(stderr, "Elias gamma encoded numbers OK!\n");
+}
+
+
+
+// Round trip test of the Elias delta encoding: packs
+// rnd[0..numNums) into ptr back to back, then decodes them in order
+// and checks each value.  Exits with status 1 on running out of
+// space or on a mismatch.
+//
+void
+testEliasDelta(void) {
+  uint64 pos = uint64ZERO;
+  uint64 siz = uint64ZERO;
+  uint64 val = uint64ZERO;
+  uint64 i   = uint64ZERO;
+
+  // Encode.  A margin of 1000 bits is kept free at the end of the
+  // space.  (The message used to name testGeneralizedUnary.)
+  //
+  for (i=0; i<numNums; i++) {
+    setEliasDeltaEncodedNumber(ptr, pos, &siz, rnd[i]);
+    pos += siz;
+    if (pos + 1000 >= spa) {
+      fprintf(stderr, "ERROR: Ran out of space in testEliasDelta at number "uint64FMT" out of "uint64FMT"\n", i, numNums);
+      exit(1);
+    }
+  }
+
+  // Decode from the start and compare.
+  //
+  pos = uint64ZERO;
+
+  for (i=0; i<numNums; i++) {
+    val = getEliasDeltaEncodedNumber(ptr, pos, &siz);
+    if (val != rnd[i]) {
+      fprintf(stderr, "Number "uint64FMT" at bitpos "uint64FMT" failed.  Desired "uint64FMT" got "uint64FMT"\n", i, pos, rnd[i], val);
+      exit(1);
+    }
+    pos += siz;
+  }
+
+  fprintf(stderr, "Elias delta encoded numbers OK!\n");
+}
+
+
+
+
+
+// Driver for the encoding round trip tests.  With exactly two
+// arguments they are the number of loops and the number of values
+// per loop; with any other argument count the usage is printed and
+// the defaults are used.  Each loop fills rnd[] with fresh random
+// values and runs the four tests.  A failing test exits the process
+// itself; reaching the end exits with status 0.
+//
+int
+main(int argc, char **argv) {
+
+ if (argc != 3) {
+ fprintf(stderr, "usage: %s <num-loops> <num-nums-per-loop>\n", argv[0]);
+ fprintf(stderr, " -> DEFAULTS USED <-\n");
+ } else {
+ numLoops = strtouint32(argv[1], 0L);
+ numNums = strtouint32(argv[2], 0L);
+ }
+
+ // rnd holds the values to encode; ptr is the bit space, spa bits
+ // long, allocated as 64-bit words.
+ rnd = new uint64 [numNums];
+ ptr = new uint64 [spa >> 6];
+
+ mt_s *ctx = mtInit(time(NULL));
+
+ // Generate some random numbers to store
+ //
+ while (numLoops--) {
+
+ // Test out unary encodings on small numbers
+ //
+ // (values are limited to [0, numSize) here)
+ for (uint64 i=0; i<numNums; i++)
+ rnd[i] = mtRandom32(ctx) % numSize;
+ testUnary();
+
+ // Generalized unary encoding can handle larger numbers
+ //
+ for (uint64 i=0; i<numNums; i++)
+ rnd[i] = mtRandom32(ctx);
+ testGeneralizedUnary();
+
+ // Elias Gamma and Delta codes are probably pretty good
+ //
+ // (both are tested on the same set of 64-bit values)
+ for (uint64 i=0; i<numNums; i++)
+ rnd[i] = mtRandom64(ctx);
+ testEliasGamma();
+ testEliasDelta();
+ }
+
+ delete [] rnd;
+ delete [] ptr;
+
+ exit(0);
+}
+
+
diff --git a/libutil/util++.H b/libutil/util++.H
new file mode 100644
index 0000000..7929b8a
--- /dev/null
+++ b/libutil/util++.H
@@ -0,0 +1,46 @@
+#ifndef UTIL_PLUS_PLUS_H
+#define UTIL_PLUS_PLUS_H
+
+#include "util.h"
+
+// These are all inlined, and C doesn't want to listen to that, so
+// they're here.
+//
+#include "bitOperations.h"
+#include "bitPacking.h"
+#include "endianess.H"
+
+// Various methods for encoding numbers into a bitstream.
+//
+// Still missing:
+// minimal binary
+// golomb (actually rice, since power of two)
+// teuhola exponential golomb
+//
+// And a nice way of getting parameters to those (and generalizedUnary)
+//
+#include "unaryEncoding.h"
+#include "generalizedUnaryEncoding.h"
+#include "eliasGammaEncoding.h"
+#include "eliasDeltaEncoding.h"
+#include "fibonacciEncoding.h"
+
+// Lists?
+#include "uint32List.H"
+
+// Now the good stuff!
+//
+#include "speedCounter.H"
+//#include "bzipBuffer.H"
+#include "readBuffer.H"
+#include "splitToWords.H"
+#include "bitPackedArray.H"
+#include "bitPackedFile.H"
+#include "recordFile.H"
+#include "intervalList.H"
+#include "bigQueue.H"
+#include "sweatShop.H"
+#include "logMsg.H"
+
+#endif // UTIL_PLUS_PLUS_H
+
diff --git a/libutil/util.c b/libutil/util.c
new file mode 100644
index 0000000..e0fd42e
--- /dev/null
+++ b/libutil/util.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/utsname.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+#include "util.h"
+
+
+// Returns the time reported by gettimeofday() as a single number of
+// seconds, with the microseconds as the fractional part.
+//
+double
+getTime(void) {
+  struct timeval  now;
+  double          seconds;
+
+  gettimeofday(&now, NULL);
+
+  seconds  = now.tv_sec;
+  seconds += (double)now.tv_usec / 1000000.0;
+
+  return(seconds);
+}
+
+
+// Returns the ru_maxrss value reported by getrusage() for this
+// process, multiplied by 1024.  If getrusage() fails, a message is
+// printed to stderr and 0 is returned.
+//
+uint64
+getProcessSizeCurrent(void) {
+  struct rusage  usage;
+
+  errno = 0;
+  if (getrusage(RUSAGE_SELF, &usage) == -1) {
+    fprintf(stderr, "getProcessSizeCurrent()-- getrusage(RUSAGE_SELF, ...) failed: %s\n",
+            strerror(errno));
+    return(0);
+  }
+
+  return((uint64)usage.ru_maxrss * 1024);
+}
+
+
+// Returns the current (soft) RLIMIT_DATA limit of this process.  If
+// getrlimit() fails, a message is printed to stderr and the largest
+// uint64 value is returned.
+//
+uint64
+getProcessSizeLimit(void) {
+  struct rlimit  limit;
+
+  errno = 0;
+  if (getrlimit(RLIMIT_DATA, &limit) == -1) {
+    fprintf(stderr, "getProcessSizeLimit()-- getrlimit(RLIMIT_DATA, ...) failed: %s\n",
+            strerror(errno));
+    return(~uint64ZERO);
+  }
+
+  return(limit.rlim_cur);
+}
+
+
+
+
+// Returns a freshly malloc()ed copy of the 'size' bytes at 'orig';
+// the caller owns it and releases it with free().  Returns NULL,
+// without allocating, if orig is NULL or size is zero.  If the
+// allocation fails a message is printed to stderr and the process
+// exits with status 1.
+//
+void *
+memdup(const void *orig, size_t size) {
+  void *rslt = NULL;
+
+  if ((orig == NULL) || (size == 0))
+    return(NULL);
+
+  // malloc() signals failure through its return value; errno is only
+  // used to describe the reason.
+  //
+  errno = 0;
+  rslt  = malloc(size);
+  if (rslt == NULL) {
+    fprintf(stderr, "memdup()-- can't allocate %zu bytes.\n%s\n", size, strerror(errno));
+    exit(1);
+  }
+
+  memcpy(rslt, orig, size);
+
+  return(rslt);
+}
+
+
+// Returns 1 if stat() succeeds on 'path', 0 otherwise.
+//
+int
+fileExists(const char *path) {
+  struct stat  info;
+
+  if (stat(path, &info) != 0)
+    return(0);
+
+  return(1);
+}
+
+
+// Returns the size, in bytes, that stat() reports for 'path'.  If
+// stat() fails a message is printed to stderr and the process exits
+// with status 1.
+//
+off_t
+sizeOfFile(const char *path) {
+  struct stat  info;
+
+  errno = 0;
+  if (stat(path, &info) != 0) {
+    fprintf(stderr, "Couldn't stat() '%s'\n%s\n", path, strerror(errno));
+    exit(1);
+  }
+
+  return(info.st_size);
+}
+
+
+// Returns the modification time (st_mtime) that stat() reports for
+// 'path'.  If stat() fails a message is printed to stderr and the
+// process exits with status 1.
+//
+uint64
+timeOfFile(const char *path) {
+  struct stat  info;
+
+  errno = 0;
+  if (stat(path, &info) != 0) {
+    fprintf(stderr, "Couldn't stat() '%s'\n%s\n", path, strerror(errno));
+    exit(1);
+  }
+
+  return(info.st_mtime);
+}
diff --git a/libutil/util.h b/libutil/util.h
new file mode 100644
index 0000000..30ae462
--- /dev/null
+++ b/libutil/util.h
@@ -0,0 +1,356 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+// ISO C99 says that to get INT32_MAX et al, these must be defined. (7.18.2, 7.18.4, 7.8.1)
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS
+#endif
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+
+#include <inttypes.h>
+
+// Useful types.
+//
+// *MASK(x) is only defined for unsigned types, with x != 0 and less
+// than the datawidth.
+
+typedef uint64_t uint64;
+typedef uint32_t uint32;
+typedef uint16_t uint16;
+typedef uint8_t uint8;
+
+typedef int64_t int64;
+typedef int32_t int32;
+typedef int16_t int16;
+typedef int8_t int8;
+
+
+#if defined(__alpha) || defined(_AIX) || defined(__LP64__) || defined(_LP64)
+#define TRUE64BIT
+#define uint64NUMBER(X) X ## LU
+#define uint32NUMBER(X) X ## U
+#else
+#define uint64NUMBER(X) X ## LLU
+#define uint32NUMBER(X) X ## LU
+#endif
+
+
+#define sizetFMT "%zd"
+
+#define uint64ZERO uint64NUMBER(0x0000000000000000)
+#define uint64ONE uint64NUMBER(0x0000000000000001)
+#define uint64MAX uint64NUMBER(0xffffffffffffffff)
+#define uint64MASK(X) ((~uint64ZERO) >> (64 - (X)))
+#define uint64FMTW(X) "%" #X PRIu64
+#define uint64FMT "%"PRIu64
+#define uint64HEX "0x%016"PRIx64
+#define int64FMTW(X) "%" #X PRId64
+#define int64FMT "%"PRId64
+
+#define uint32ZERO uint32NUMBER(0x00000000)
+#define uint32ONE uint32NUMBER(0x00000001)
+#define uint32MAX uint32NUMBER(0xffffffff)
+#define uint32MASK(X) ((~uint32ZERO) >> (32 - (X)))
+#define uint32FMTW(X) "%" #X PRIu32
+#define uint32FMT "%"PRIu32
+#define uint32HEX "0x%08"PRIx32
+#define int32FMTW(X) "%" #X PRId32
+#define int32FMT "%"PRId32
+
+// The 16- and 8-bit constants are plain ints, so ~uint16ZERO and
+// ~uint8ZERO are negative ints with every bit set, not 0xffff and
+// 0xff.  The masks therefore shift the MAX constants instead.
+//
+#define uint16ZERO (0x0000)
+#define uint16ONE (0x0001)
+#define uint16MAX (0xffff)
+#define uint16MASK(X) (uint16MAX >> (16 - (X)))
+#define uint16FMTW(X) "%" #X PRIu16
+#define uint16FMT "%"PRIu16
+
+#define uint8ZERO (0x00)
+#define uint8ONE (0x01)
+#define uint8MAX (0xff)
+#define uint8MASK(X) (uint8MAX >> (8 - (X)))
+
+// Decimal string to integer.  The 64-bit version uses strtoull()
+// because unsigned long is only 32 bits wide on some platforms.
+//
+#define strtouint32(N,O) (uint32)strtoul(N, O, 10)
+#define strtouint64(N,O) (uint64)strtoull(N, O, 10)
+
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+
+////////////////////////////////////////
+//
+// time
+//
+double getTime(void);
+
+
+
+////////////////////////////////////////
+//
+// file
+//
+
+// Create the O_LARGEFILE type for open(), if it doesn't already
+// exist (FreeBSD, Tru64). We assume that by including the stuff
+// needed for open(2) we'll get any definition of O_LARGEFILE.
+//
+#ifndef O_LARGEFILE
+#define O_LARGEFILE 0
+#endif
+
+
+uint64 getProcessSizeCurrent(void);
+uint64 getProcessSizeLimit(void);
+
+
+// Useful routines for dealing with the existence of files
+
+int isHuman(FILE *F);
+
+// Handles mmap() of files. Write is not tested -- in particular,
+// the test main() in mmap.c fails.
+//
+void*
+mapFile(const char *filename,
+ uint64 *length,
+ char mode);
+
+void
+unmapFile(void *addr,
+ uint64 length);
+
+
+
+// Creates a hidden temporary file. If path is given, the temporary
+// file is created in that directory. The temporary file is unlinked
+// after it is created, so once you close the file, it's gone.
+//
+FILE *makeTempFile(char *path);
+
+
+// Copies all of srcFile to dstFile, returns the number of bytes written
+//
+off_t copyFile(char *srcName, FILE *dstFile);
+
+
+// Takes a path to a file (that possibly doesn't exist) and returns
+// the number of MB (1048576 bytes) free in the directory of that
+// file.
+//
+uint32 freeDiskSpace(char *path);
+
+// Safer read(2) and write(2).
+//
+void safeWrite(int filedes, const void *buffer, const char *desc, size_t nbytes);
+int safeRead(int filedes, const void *buffer, const char *desc, size_t nbytes);
+
+
+
+////////////////////////////////////////
+//
+int fileExists(const char *path);
+off_t sizeOfFile(const char *path);
+uint64 timeOfFile(const char *path);
+
+// Open a file, read/write, using compression based on the file name
+//
+FILE *openFile(const char *path, const char *mode);
+void closeFile(FILE *F, const char *path);
+
+////////////////////////////////////////
+//
+void *memdup(const void *orig, size_t size);
+
+
+////////////////////////////////////////
+//
+// Pac-Man's memory allocator.
+//
+// Grabs big chunks of memory, then gives out little pieces. You can
+// only free ALL memory, not single blocks.
+//
+// This is useful when one needs to malloc() tens of millions of
+// things, at which point the overhead of finding a free block is
+// large.
+//
+void *palloc(size_t size);
+void pfree(void);
+
+// A thread-safe(r) implementation just forces the user to use a
+// handle. This also lets us use palloc() for collections of things
+// -- e.g., twice in a program. If you don't give a handle, the
+// default one is used.
+//
+void *palloc2(size_t size, void *handle);
+void pfree2(void *handle);
+
+// Get a new handle, release a used one. The size is the same
+// as for psetblocksize().
+//
+void *pallochandle(size_t size);
+void pfreehandle(void *handle);
+
+// The block size can only be changed before the first call to
+// palloc(). Calling psetblocksize() after that has no effect.
+//
+void psetblocksize(size_t size);
+size_t pgetblocksize(void);
+
+// Not generally useful - just dumps the allocated blocks to stdout.
+// Uses internal structures, and used in the test routine.
+//
+// psetdebug() enables reporting of allocations.
+//
+void pdumppalloc(void *handle);
+void psetdebug(int on);
+
+
+////////////////////////////////////////
+//
+// md5
+//
+
+
+typedef struct {
+ uint64 a;
+ uint64 b;
+ uint32 i; // the iid, used in leaff
+ uint32 pad; // keep us size compatible between 32- and 64-bit machines.
+} md5_s;
+
+#define MD5_BUFFER_SIZE (32*1024)  // parenthesized so it is safe inside larger expressions
+
+typedef struct {
+ uint64 a;
+ uint64 b;
+ void *context;
+ int bufferPos;
+ unsigned char buffer[MD5_BUFFER_SIZE];
+} md5_increment_s;
+
+
+// Returns -1, 0, 1 depending on if a <, ==, > b. Suitable for
+// qsort().
+//
+int md5_compare(void const *a, void const *b);
+
+
+// Converts an md5_s into a character string. s must be at least
+// 33 bytes long.
+//
+char *md5_toascii(md5_s *m, char *s);
+
+
+// Computes the md5 checksum on the string s.
+//
+md5_s *md5_string(md5_s *m, char *s, uint32 l);
+
+
+// Computes an md5 checksum piece by piece.
+//
+// If m is NULL, a new md5_increment_s is allocated and returned.
+//
+md5_increment_s *md5_increment_char(md5_increment_s *m, char s);
+md5_increment_s *md5_increment_block(md5_increment_s *m, char *s, uint32 l);
+void md5_increment_finalize(md5_increment_s *m);
+void md5_increment_destroy(md5_increment_s *m);
+
+
+////////////////////////////////////////
+//
+// Matsumoto and Nishimura's Mersenne Twister pseudo random number
+// generator. The struct and functions are defined in external/mt19937ar.[ch]
+//
+typedef struct mtctx mt_s;
+
+mt_s *mtInit(uint32 s);
+mt_s *mtInitArray(uint32 *init_key, uint32 key_length);
+uint32 mtRandom32(mt_s *mt);
+
+// A uint64 random number
+//
+#define mtRandom64(MT) ( (((uint64)mtRandom32(MT)) << 32) | (uint64)mtRandom32(MT) )
+
+// Real valued randomness
+// mtRandomRealOpen() -- on [0,1) real interval
+// mtRandomRealClosed() -- on [0,1] real interval
+// mtRandomRealOpen53() -- on [0,1) real interval, using 53 bits
+//
+// "These real versions are due to Isaku Wada, 2002/01/09 added" and were taken from
+// the mt19937ar.c distribution (but they had actual functions, not macros)
+//
+// They also had
+// random number in (0,1) as (mtRandom32() + 0.5) * (1.0 / 4294967296.0)
+//
+#define mtRandomRealOpen(MT) ( (double)mtRandom32(MT) * (1.0 / 4294967296.0) )
+#define mtRandomRealClosed(MT) ( (double)mtRandom32(MT) * (1.0 / 4294967295.0) )
+#define mtRandomRealOpen53(MT) ( ((mtRandom32(MT) >> 5) * 67108864.0 + (mtRandom32(MT) >> 6)) * (1.0 / 9007199254740992.0) )
+
+// returns a random number with gaussian distribution, mean of zero and std.dev. of 1
+//
+double mtRandomGaussian(mt_s *mt);
+
+
+////////////////////////////////////////
+//
+// FreeBSD's multithreaded qsort.
+//
+void
+qsort_mt(void *a,
+ size_t n,
+ size_t es,
+ int (*cmp)(const void *, const void *),
+ int maxthreads,
+ int forkelem);
+
+//#define qsort(A, N, ES, CMP) qsort_mt((A), (N), (ES), (CMP), 4, 64 * 1024)
+
+
+
+////////////////////////////////////////
+//
+// perl's chomp is pretty nice
+//
+// chomp(S) removes trailing whitespace from the string S in place;
+// chompL(S,L) also decrements L once for every character removed.
+// Trimming stops at the start of the string, so empty and
+// all-whitespace strings are safe.
+//
+#ifndef chomp
+#define chomp(S) { char *chompBgn_=(S), *chompEnd_=chompBgn_; while (*chompEnd_) chompEnd_++; while ((chompEnd_ > chompBgn_) && isspace((unsigned char)chompEnd_[-1])) { *--chompEnd_=0; } }
+#define chompL(S,L) { char *chompBgn_=(S), *chompEnd_=chompBgn_; while (*chompEnd_) chompEnd_++; while ((chompEnd_ > chompBgn_) && isspace((unsigned char)chompEnd_[-1])) { *--chompEnd_=0; L--; } }
+#endif
+
+#ifndef munch
+#define munch(S) { while (*(S) && isspace(*(S))) (S)++; }
+#endif
+
+#ifndef crunch
+#define crunch(S) { while (*(S) && !isspace(*(S))) (S)++; }
+#endif
+
+
+#ifndef MIN
+#define MIN(x,y) (((x) > (y)) ? (y) : (x))
+#endif
+
+#ifndef MAX
+#define MAX(x,y) (((x) < (y)) ? (y) : (x))
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // UTIL_H
diff --git a/meryl/Make.include b/meryl/Make.include
new file mode 100644
index 0000000..494d4a2
--- /dev/null
+++ b/meryl/Make.include
@@ -0,0 +1,43 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+
+merylsrc := $/args.C \
+ $/binaryOp.C \
+ $/build.C \
+ $/build-threads.C \
+ $/dump.C \
+ $/estimate.C \
+ $/merge.C \
+ $/unaryOp.C
+
+# m.C was removed from SRCS, and m from EXES, because its purpose is unclear; the $/m link rule below is left in place.
+
+# meryl.H is exported only for celera-assembler.
+
+$/.CXX_SRCS := ${merylsrc} $/meryl.C $/mervin.C $/asmMerQC.C $/mapMers.C $/mapMers-depth.C $/maskMers.C $/compare-counts.C $/simple.C
+$/.CXX_INCS := $/meryl.H
+$/.CXX_LIBS := $/libmerylguts.a
+$/.CXX_EXES := $/meryl $/mervin $/simple $/asmMerQC $/mapMers $/mapMers-depth $/testPositionBias $/maskMers $/compare-counts
+$/.CLEAN := $/*.o
+
+$/libmerylguts.a : ${merylsrc:.C=.o}
+
+$/meryl : $/meryl.o $/libmerylguts.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/mervin : $/mervin.o ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/simple : $/simple.o ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/asmMerQC : $/asmMerQC.o ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/mapMers : $/mapMers.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/mapMers-depth : $/mapMers-depth.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/m : $/m.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/testPositionBias : $/testPositionBias.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/maskMers : $/maskMers.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/compare-counts : $/compare-counts.o ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBMERYL/} -I${LIBKMER/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+
+$/%.d: ${LIBBIO/}alphabet.h
diff --git a/meryl/args.C b/meryl/args.C
new file mode 100644
index 0000000..f1df368
--- /dev/null
+++ b/meryl/args.C
@@ -0,0 +1,589 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "meryl.H"
+
+
+// Some string handling utilities.
+//
+// Writes str to F as a uint32 byte count (which includes the terminating NUL)
+// followed by that many bytes.  A null str is written as a count of zero with no
+// payload; readString() returns a null pointer for such a record.
+//
+// Returns true on success.  On a short write a diagnostic goes to stderr and false
+// is returned.  Success is judged from the fwrite() item counts, because the C
+// standard does not require fwrite() to set errno.
+//
+bool
+writeString(const char *str, FILE *F) {
+  uint32  len = (str != 0L) ? (uint32)strlen(str) + 1 : 0;
+  bool    ok  = true;
+
+  errno = 0;
+
+  if (fwrite(&len, sizeof(uint32), 1, F) != 1)
+    ok = false;
+
+  if ((ok) && (len > 0) && (fwrite(str, sizeof(char), len, F) != len))
+    ok = false;
+
+  if (ok == false) {
+    fprintf(stderr, "writeString()-- Failed to write string of length "uint32FMT": %s\n", len, strerror(errno));
+    fprintf(stderr, "writeString()-- First 80 bytes of string is:\n");
+    // never hand a null pointer to %s
+    fprintf(stderr, "%80.80s\n", (str != 0L) ? str : "(null)");
+    return(false);
+  }
+
+  return(true);
+}
+
+// Reads one record written by writeString(): a uint32 byte count followed by that
+// many bytes.  Returns a new[]-allocated buffer, which the caller releases with
+// delete [], or a null pointer when the stored count is zero.
+//
+// A failed or short read prints a diagnostic and exits.  The fread() item counts
+// are checked because a premature end of file does not set errno.
+//
+char*
+readString(FILE *F) {
+  uint32  len = 0;
+
+  errno = 0;
+
+  if (fread(&len, sizeof(uint32), 1, F) != 1) {
+    fprintf(stderr, "readString()-- Failed to read string: %s\n", (errno) ? strerror(errno) : "unexpected end of file");
+    exit(1);
+  }
+
+  if (len == 0)
+    return(0L);
+
+  char *str = new char [len];
+
+  if (fread(str, sizeof(char), len, F) != len) {
+    fprintf(stderr, "readString()-- Failed to read string: %s\n", (errno) ? strerror(errno) : "unexpected end of file");
+    exit(1);
+  }
+
+  // writeString() counts the terminator, but do not trust the file to contain it.
+  str[len-1] = 0;
+
+  return(str);
+}
+
+// Returns a new[]-allocated copy of str, or a null pointer when str is null.
+// The caller releases the copy with delete [].
+//
+char*
+duplString(char *str) {
+  if (str == 0L)
+    return(0L);
+
+  uint32  nchars = (uint32)strlen(str);
+  char   *copy   = new char [nchars+1];
+
+  strcpy(copy, str);
+
+  return(copy);
+}
+
+
+
+// Prints the command line help to stderr, using execName as the program name.
+//
+// The batched-mode example shows -mergebatch without a value: the option parser in
+// this file consumes no argument after -mergebatch, so a trailing N would be
+// rejected as an unknown option.
+//
+// NOTE(review): the -B text says both strands are processed by default, while
+// clear() sets doForward=true, doReverse=false and doCanonical=false -- confirm
+// which is intended.
+//
+void
+merylArgs::usage(void) {
+  fprintf(stderr, "usage: %s [personality] [global options] [options]\n", execName);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "where personality is:\n");
+  fprintf(stderr, " -P -- compute parameters\n");
+  fprintf(stderr, " -B -- build table\n");
+  fprintf(stderr, " -S -- scan table\n");
+  fprintf(stderr, " -M -- \"math\" operations\n");
+  fprintf(stderr, " -D -- dump table\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "-P: Given a sequence file (-s) or an upper limit on the\n");
+  fprintf(stderr, " number of mers in the file (-n), compute the table size\n");
+  fprintf(stderr, " (-t in build) to minimize the memory usage.\n");
+  fprintf(stderr, " -m # (size of a mer; required)\n");
+  fprintf(stderr, " -c # (homopolymer compression; optional)\n");
+  fprintf(stderr, " -p (enable positions)\n");
+  fprintf(stderr, " -s seq.fasta (seq.fasta is scanned to determine the number of mers)\n");
+  fprintf(stderr, " -n # (compute params assuming file with this many mers in it)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Only one of -s, -n need to be specified. If both are given\n");
+  fprintf(stderr, " -s takes priority.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "-B: Given a sequence file (-s) and lots of parameters, compute\n");
+  fprintf(stderr, " the mer-count tables. By default, both strands are processed.\n");
+  fprintf(stderr, " -f (only build for the forward strand)\n");
+  fprintf(stderr, " -r (only build for the reverse strand)\n");
+  fprintf(stderr, " -C (use canonical mers, assumes both strands)\n");
+  fprintf(stderr, " -L # (DON'T save mers that occur less than # times)\n");
+  fprintf(stderr, " -U # (DON'T save mers that occur more than # times)\n");
+  fprintf(stderr, " -m # (size of a mer; required)\n");
+  fprintf(stderr, " -c # (homopolymer compression; optional)\n");
+  fprintf(stderr, " -p (enable positions)\n");
+  fprintf(stderr, " -s seq.fasta (sequence to build the table for)\n");
+  fprintf(stderr, " -o tblprefix (output table prefix)\n");
+  fprintf(stderr, " -v (entertain the user)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " By default, the computation is done as one large sequential process.\n");
+  fprintf(stderr, " Multi-threaded operation is possible, at additional memory expense, as\n");
+  fprintf(stderr, " is segmented operation, at additional I/O expense.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Threaded operation: Split the counting in to n almost-equally sized\n");
+  fprintf(stderr, " pieces. This uses an extra h MB (from -P) per thread.\n");
+  fprintf(stderr, " -threads n (use n threads to build)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Segmented, sequential operation: Split the counting into pieces that\n");
+  fprintf(stderr, " will fit into no more than m MB of memory, or into n equal sized pieces.\n");
+  fprintf(stderr, " Each piece is computed sequentially, and the results are merged at the end.\n");
+  fprintf(stderr, " Only one of -memory and -segments is needed.\n");
+  fprintf(stderr, " -memory mMB (use at most m MB of memory per segment)\n");
+  fprintf(stderr, " -segments n (use n segments)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Segmented, batched operation: Same as sequential, except this allows\n");
+  fprintf(stderr, " each segment to be manually executed in parallel.\n");
+  fprintf(stderr, " Only one of -memory and -segments is needed.\n");
+  fprintf(stderr, " -memory mMB (use at most m MB of memory per segment)\n");
+  fprintf(stderr, " -segments n (use n segments)\n");
+  fprintf(stderr, " -configbatch (create the batches)\n");
+  fprintf(stderr, " -countbatch n (run batch number n)\n");
+  fprintf(stderr, " -mergebatch (merge the batches)\n");
+  fprintf(stderr, " Initialize the compute with -configbatch, which needs all the build options.\n");
+  fprintf(stderr, " Execute all -countbatch jobs, then -mergebatch to complete.\n");
+  fprintf(stderr, " meryl -configbatch -B [options] -o file\n");
+  fprintf(stderr, " meryl -countbatch 0 -o file\n");
+  fprintf(stderr, " meryl -countbatch 1 -o file\n");
+  fprintf(stderr, " ...\n");
+  fprintf(stderr, " meryl -countbatch N -o file\n");
+  fprintf(stderr, " meryl -mergebatch -o file\n");
+  fprintf(stderr, " Batched mode can run on the grid.\n");
+  fprintf(stderr, " -sge jobname unique job name for this execution. Meryl will submit\n");
+  fprintf(stderr, " jobs with name mpjobname, ncjobname, nmjobname, for\n");
+  fprintf(stderr, " phases prepare, count and merge.\n");
+  fprintf(stderr, " -sgebuild \"options\" any additional options to sge, e.g.,\n");
+  fprintf(stderr, " -sgemerge \"options\" \"-p -153 -pe thread 2 -A merylaccount\"\n");
+  fprintf(stderr, " N.B. - -N will be ignored\n");
+  fprintf(stderr, " N.B. - be sure to quote the options\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "-M: Given a list of tables, perform a math, logical or threshold operation.\n");
+  fprintf(stderr, " Unless specified, all operations take any number of databases.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Math operations are:\n");
+  fprintf(stderr, " min count is the minimum count for all databases. If the mer\n");
+  fprintf(stderr, " does NOT exist in all databases, the mer has a zero count, and\n");
+  fprintf(stderr, " is NOT in the output.\n");
+  fprintf(stderr, " minexist count is the minimum count for all databases that contain the mer\n");
+  fprintf(stderr, " max count is the maximum count for all databases\n");
+  fprintf(stderr, " add count is sum of the counts for all databases\n");
+  fprintf(stderr, " sub count is the first minus the second (binary only)\n");
+  fprintf(stderr, " abs count is the absolute value of the first minus the second (binary only)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Logical operations are:\n");
+  fprintf(stderr, " and outputs mer iff it exists in all databases\n");
+  fprintf(stderr, " nand outputs mer iff it exists in at least one, but not all, databases\n");
+  fprintf(stderr, " or outputs mer iff it exists in at least one database\n");
+  fprintf(stderr, " xor outputs mer iff it exists in an odd number of databases\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " Threshold operations are:\n");
+  fprintf(stderr, " lessthan x outputs mer iff it has count < x\n");
+  fprintf(stderr, " lessthanorequal x outputs mer iff it has count <= x\n");
+  fprintf(stderr, " greaterthan x outputs mer iff it has count > x\n");
+  fprintf(stderr, " greaterthanorequal x outputs mer iff it has count >= x\n");
+  fprintf(stderr, " equal x outputs mer iff it has count == x\n");
+  fprintf(stderr, " Threshold operations work on exactly one database.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " -s tblprefix (use tblprefix as a database)\n");
+  fprintf(stderr, " -o tblprefix (create this output)\n");
+  fprintf(stderr, " -v (entertain the user)\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " NOTE: Multiple tables are specified with multiple -s switches; e.g.:\n");
+  fprintf(stderr, " %s -M add -s 1 -s 2 -s 3 -s 4 -o all\n", execName);
+  fprintf(stderr, " NOTE: It is NOT possible to specify more than one operation:\n");
+  fprintf(stderr, " %s -M add -s 1 -s 2 -sub -s 3\n", execName);
+  fprintf(stderr, " will NOT work.\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "-D: Dump the table (not all of these work).\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, " -Dd Dump a histogram of the distance between the same mers.\n");
+  fprintf(stderr, " -Dt Dump mers >= a threshold. Use -n to specify the threshold.\n");
+  fprintf(stderr, " -Dc Count the number of mers, distinct mers and unique mers.\n");
+  fprintf(stderr, " -Dh Dump (to stdout) a histogram of mer counts.\n");
+  fprintf(stderr, " -s Read the count table from here (leave off the .mcdat or .mcidx).\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "\n");
+}
+
+
+
+// Resets every member to its default.  Both constructors call this first.
+// Pointer members are set to null without being released, so this must not be
+// called on an object that already owns allocations.
+//
+void
+merylArgs::clear(void) {
+
+ execName = 0L;
+ options = 0L;
+
+ beVerbose = false;
+ // default strand selection: forward only
+ doForward = true;
+ doReverse = false;
+ doCanonical = false;
+
+ inputFile = 0L;
+ outputFile = 0L;
+ queryFile = 0L;
+
+ merSize = 20;
+ merComp = 0;
+ positionsEnabled = false;
+
+ numMersEstimated = 0;
+ numMersActual = 0;
+
+ numBasesActual = 0;
+
+ mersPerBatch = 0;
+ basesPerBatch = 0;
+
+ numBuckets = 0;
+ numBuckets_log2 = 0;
+ merDataWidth = 0;
+ merDataMask = uint64ZERO;
+ bucketPointerWidth = 0;
+
+ numThreads = 0;
+ memoryLimit = 0;
+ segmentLimit = 0;
+ configBatch = false;
+ countBatch = false;
+ mergeBatch = false;
+ batchNumber = 0;
+
+ sgeJobName = 0L;
+ sgeBuildOpt = 0L;
+ sgeMergeOpt = 0L;
+ isOnGrid = false;
+
+ lowCount = 0;
+ // all bits set; relies on lowCount having just been zeroed on the line above
+ highCount = ~lowCount;
+ desiredCount = 0;
+
+ outputCount = 0;
+ outputAll = 0;
+ outputPosition = 0;
+
+ mergeFilesMax = 0;
+ mergeFilesLen = 0;
+ mergeFiles = 0L;
+
+ personality = 0;
+}
+
+
+
+
+// Steps arg onto the value that must follow the option at argv[arg].  Exits with a
+// message when the command line ends early, instead of letting the parser use
+// argv[argc], which is a null pointer.
+//
+static
+void
+merylArgs_stepToValue(int argc, char **argv, int &arg) {
+  if (arg + 1 >= argc) {
+    fprintf(stderr, "ERROR: option '%s' needs a value, but the command line ends there.\n", argv[arg]);
+    exit(1);
+  }
+  arg++;
+}
+
+
+// Builds the configuration from the command line.
+//
+// With no arguments the usage text is printed and the process exits.  Otherwise:
+//  - every argument is copied, shell-escaped and space-separated, into 'options'
+//    for later resubmission to the grid;
+//  - each -s value is stored both in inputFile (last one wins) and in mergeFiles;
+//  - unknown options are all reported before exiting with status 1.
+//
+// sgeJobName, sgeBuildOpt and sgeMergeOpt point directly into argv.
+//
+// NOTE(review): "lessthan 0" computes 0 - 1 and "greaterthan" adds 1, so the extreme
+// values may wrap depending on the width of desiredCount -- confirm the intended
+// behaviour at those limits.
+//
+merylArgs::merylArgs(int argc, char **argv) {
+
+  clear();
+
+  execName = duplString(argv[0]);
+
+  if (argc == 1) {
+    usage();
+    exit(1);
+  }
+
+  // Count how many '-s' switches there are, then allocate space
+  // for them in mergeFiles.  We also sum the length of all options,
+  // so we can copy them into an 'options' string used when we
+  // resubmit to the grid.
+  //
+  uint32 optionsLen = 0;
+  for (int arg=1; arg < argc; arg++) {
+    optionsLen += strlen(argv[arg]) + 1;
+    if (strcmp(argv[arg], "-s") == 0)
+      mergeFilesMax++;
+  }
+
+  // Worst case every byte is escaped: two bytes per input byte, plus one
+  // separator per argument, plus the terminator.
+  //
+  mergeFiles = new char * [mergeFilesMax];
+  options    = new char [2 * optionsLen + 1];
+  options[0] = 0;
+
+  bool fail = false;
+
+  char *optptr = options;
+
+  for (int arg=1; arg < argc; arg++) {
+    if (arg > 1)
+      *optptr++ = ' ';
+
+    // Arg!  If the arg has spaces or other stuff that the shell
+    // needs escaped we need to escape them again.  So, we copy byte
+    // by byte and insert escapes at the right points.
+
+    for (char *op=argv[arg]; *op; op++) {
+      unsigned char c = (unsigned char)*op;  // ctype functions need unsigned char values
+
+      if (isspace(c) || !isalnum(c))
+        if ((*op != '-') && (*op != '_') && (*op != '.') && (*op != '/'))
+          *optptr++ = '\\';
+
+      *optptr++ = *op;
+    }
+  }
+
+  // The buffer came from new[] uninitialised; terminate the string we built.
+  *optptr = 0;
+
+
+  // Parse the options
+  //
+  for (int arg=1; arg < argc; arg++) {
+    if (strncmp(argv[arg], "-V", 2) == 0) {
+      fprintf(stdout, "meryl the Mighty Mer Counter version (no version)\n");
+      exit(0);
+    } else if (strcmp(argv[arg], "-m") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      merSize = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-c") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      merComp = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-p") == 0) {
+      positionsEnabled = true;
+    } else if (strcmp(argv[arg], "-s") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      delete [] inputFile;
+      inputFile                 = duplString(argv[arg]);
+      mergeFiles[mergeFilesLen++] = duplString(argv[arg]);
+    } else if (strcmp(argv[arg], "-n") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      numMersEstimated = strtouint64(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      doForward   = true;
+      doReverse   = false;
+      doCanonical = false;
+    } else if (strcmp(argv[arg], "-r") == 0) {
+      doForward   = false;
+      doReverse   = true;
+      doCanonical = false;
+    } else if (strcmp(argv[arg], "-C") == 0) {
+      doForward   = false;
+      doReverse   = false;
+      doCanonical = true;
+    } else if (strcmp(argv[arg], "-L") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      lowCount = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-U") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      highCount = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-o") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      delete [] outputFile;
+      outputFile = duplString(argv[arg]);
+    } else if (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+
+    } else if (strcmp(argv[arg], "-P") == 0) {
+      personality = 'P';
+    } else if (strcmp(argv[arg], "-B") == 0) {
+      personality = 'B';
+    } else if (strcmp(argv[arg], "-S") == 0) {
+      personality = 'S';
+    } else if (strcmp(argv[arg], "-M") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      if (strcmp(argv[arg], "merge") == 0) {
+        personality = PERSONALITY_MERGE;
+      } else if (strcmp(argv[arg], "min") == 0) {
+        personality = PERSONALITY_MIN;
+      } else if (strcmp(argv[arg], "minexist") == 0) {
+        personality = PERSONALITY_MINEXIST;
+      } else if (strcmp(argv[arg], "max") == 0) {
+        personality = PERSONALITY_MAX;
+      } else if (strcmp(argv[arg], "maxexist") == 0) {
+        personality = PERSONALITY_MAXEXIST;
+      } else if (strcmp(argv[arg], "add") == 0) {
+        personality = PERSONALITY_ADD;
+      } else if (strcmp(argv[arg], "sub") == 0) {
+        personality = PERSONALITY_SUB;
+      } else if (strcmp(argv[arg], "abs") == 0) {
+        personality = PERSONALITY_ABS;
+      } else if (strcmp(argv[arg], "divide") == 0) {
+        personality = PERSONALITY_DIVIDE;
+      } else if (strcmp(argv[arg], "and") == 0) {
+        personality = PERSONALITY_AND;
+      } else if (strcmp(argv[arg], "nand") == 0) {
+        personality = PERSONALITY_NAND;
+      } else if (strcmp(argv[arg], "or") == 0) {
+        personality = PERSONALITY_OR;
+      } else if (strcmp(argv[arg], "xor") == 0) {
+        personality = PERSONALITY_XOR;
+      } else if (strcmp(argv[arg], "lessthan") == 0) {
+        // "count < x" is stored as "count <= x-1"
+        personality = PERSONALITY_LEQ;
+        merylArgs_stepToValue(argc, argv, arg);
+        desiredCount = strtouint32(argv[arg], 0L) - 1;
+      } else if (strcmp(argv[arg], "lessthanorequal") == 0) {
+        personality = PERSONALITY_LEQ;
+        merylArgs_stepToValue(argc, argv, arg);
+        desiredCount = strtouint32(argv[arg], 0L);
+      } else if (strcmp(argv[arg], "greaterthan") == 0) {
+        // "count > x" is stored as "count >= x+1"
+        personality = PERSONALITY_GEQ;
+        merylArgs_stepToValue(argc, argv, arg);
+        desiredCount = strtouint32(argv[arg], 0L) + 1;
+      } else if (strcmp(argv[arg], "greaterthanorequal") == 0) {
+        personality = PERSONALITY_GEQ;
+        merylArgs_stepToValue(argc, argv, arg);
+        desiredCount = strtouint32(argv[arg], 0L);
+      } else if (strcmp(argv[arg], "equal") == 0) {
+        personality = PERSONALITY_EQ;
+        merylArgs_stepToValue(argc, argv, arg);
+        desiredCount = strtouint32(argv[arg], 0L);
+      } else {
+        fprintf(stderr, "ERROR: unknown math personality %s\n", argv[arg]);
+        exit(1);
+      }
+    } else if (strcmp(argv[arg], "-Dd") == 0) {
+      personality = 'd';
+    } else if (strcmp(argv[arg], "-Dt") == 0) {
+      personality = 't';
+    } else if (strcmp(argv[arg], "-Dp") == 0) {
+      personality = 'p';
+    } else if (strcmp(argv[arg], "-Dc") == 0) {
+      personality = 'c';
+    } else if (strcmp(argv[arg], "-Dh") == 0) {
+      personality = 'h';
+    } else if (strcmp(argv[arg], "-memory") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      memoryLimit = strtouint64(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-segments") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      segmentLimit = strtouint64(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-threads") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      numThreads = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-configbatch") == 0) {
+      personality = 'B';
+      configBatch = true;
+      countBatch  = false;
+      mergeBatch  = false;
+      batchNumber = uint32ZERO;
+    } else if (strcmp(argv[arg], "-countbatch") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      personality = 'B';
+      configBatch = false;
+      countBatch  = true;
+      mergeBatch  = false;
+      batchNumber = strtouint32(argv[arg], 0L);
+    } else if (strcmp(argv[arg], "-mergebatch") == 0) {
+      personality = 'B';
+      configBatch = false;
+      countBatch  = false;
+      mergeBatch  = true;
+      batchNumber = uint32ZERO;
+    } else if (strcmp(argv[arg], "-sge") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      sgeJobName = argv[arg];
+    } else if (strcmp(argv[arg], "-sgebuild") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      sgeBuildOpt = argv[arg];
+    } else if (strcmp(argv[arg], "-sgemerge") == 0) {
+      merylArgs_stepToValue(argc, argv, arg);
+      sgeMergeOpt = argv[arg];
+    } else if (strcmp(argv[arg], "-forcebuild") == 0) {
+      isOnGrid = true;
+    } else {
+      fprintf(stderr, "Unknown option '%s'.\n", argv[arg]);
+      fail = true;
+    }
+  }
+
+  // Using threads is only useful if we are not a batch.
+  //
+  if ((numThreads > 0) && (configBatch || countBatch || mergeBatch)) {
+    if (configBatch)
+      fprintf(stderr, "WARNING: -threads has no effect with -configbatch, disabled.\n");
+    if (countBatch)
+      fprintf(stderr, "WARNING: -threads has no effect with -countbatch, disabled.\n");
+    if (mergeBatch)
+      fprintf(stderr, "WARNING: -threads has no effect with -mergebatch, disabled.\n");
+    numThreads = 0;
+  }
+
+  // SGE is not useful unless we are in batch mode.
+  //
+  if (sgeJobName && !configBatch && !countBatch && !mergeBatch) {
+    fprintf(stderr, "ERROR: -sge not useful unless in batch mode (replace -B with -configbatch)\n");
+    exit(1);
+  }
+
+  if (fail)
+    exit(1);
+}
+
+
+
+// Restores a configuration saved by writeConfig() from the file "<prefix>.merylArgs".
+//
+// The file holds a 16-byte magic string, a raw image of the merylArgs object, and
+// then the strings in the order writeConfig() stores them.  Pointer members inside
+// the raw image are meaningless in this process, so every one of them is replaced
+// straight after the image is loaded.
+//
+// Any failure to open or read the file prints a diagnostic and exits.  The stdio
+// return values are tested rather than errno, which only has meaning after a
+// call has reported failure.
+//
+merylArgs::merylArgs(const char *prefix) {
+
+  clear();
+
+  char *filename = new char [strlen(prefix) + 17];
+  sprintf(filename, "%s.merylArgs", prefix);
+
+  errno = 0;
+  FILE *F = fopen(filename, "rb");
+  if (F == 0L) {
+    fprintf(stderr, "merylArgs::readConfig()-- Failed to open '%s': %s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  char magic[17] = {0};
+  if ((fread(magic, sizeof(char), 16, F) != 16) ||
+      (strncmp(magic, "merylBatcherv02", 16) != 0)) {
+    fprintf(stderr, "merylArgs::readConfig()-- '%s' doesn't appear to be a merylArgs file.\n", filename);
+    exit(1);
+  }
+
+  // Load the config, then reset the pointers.
+
+  if (fread(this, sizeof(merylArgs), 1, F) != 1) {
+    fprintf(stderr, "merylArgs::readConfig()-- Failed to read the configuration from '%s'.\n", filename);
+    exit(1);
+  }
+
+  execName    = readString(F);
+  options     = 0L;
+  inputFile   = readString(F);
+  outputFile  = readString(F);
+  queryFile   = 0L;
+  sgeJobName  = readString(F);
+  sgeBuildOpt = readString(F);
+  sgeMergeOpt = readString(F);
+
+  mergeFiles = new char* [mergeFilesLen];
+
+  for (uint32 i=0; i<mergeFilesLen; i++)
+    mergeFiles[i] = readString(F);
+
+  fclose(F);
+
+  delete [] filename;
+}
+
+
+
+// Releases the strings this object owns: execName, options, inputFile, outputFile
+// and every entry of mergeFiles.
+//
+// sgeJobName, sgeBuildOpt and sgeMergeOpt are not released here; the command line
+// constructor points them into argv.
+// NOTE(review): the config-file constructor fills those three from readString(),
+// so in that case they appear to be leaked -- confirm.
+//
+merylArgs::~merylArgs() {
+  uint32 remaining = mergeFilesLen;
+
+  while (remaining > 0) {
+    remaining--;
+    delete [] mergeFiles[remaining];
+  }
+  delete [] mergeFiles;
+
+  delete [] outputFile;
+  delete [] inputFile;
+  delete [] options;
+  delete [] execName;
+}
+
+
+
+// Saves this configuration to "<outputFile>.merylArgs" so that a later process can
+// restore it with the merylArgs(const char *prefix) constructor.
+//
+// Layout: the 16-byte magic "merylBatcherv02" (terminator included), a raw image
+// of this object, then execName, inputFile, outputFile, sgeJobName, sgeBuildOpt,
+// sgeMergeOpt and each mergeFiles entry, all via writeString().
+//
+// Returns true when everything was written.  Returns false, after a diagnostic,
+// when no output prefix is set or any write fails.  Failure to create the file
+// still exits, as before.
+//
+bool
+merylArgs::writeConfig(void) {
+
+  if (outputFile == 0L) {
+    fprintf(stderr, "merylArgs::writeConfig()-- No output prefix (-o) was supplied.\n");
+    return(false);
+  }
+
+  char *filename = new char [strlen(outputFile) + 17];
+  sprintf(filename, "%s.merylArgs", outputFile);
+
+  errno = 0;
+  FILE *F = fopen(filename, "wb");
+  if (F == 0L) {
+    fprintf(stderr, "merylArgs::writeConfig()-- Failed to open '%s': %s\n", filename, strerror(errno));
+    exit(1);
+  }
+
+  bool ok = true;
+
+  if (fwrite("merylBatcherv02", sizeof(char), 16, F) != 16)   ok = false;
+  if (fwrite(this, sizeof(merylArgs), 1, F) != 1)             ok = false;
+
+  if (writeString(execName,    F) == false)   ok = false;
+  if (writeString(inputFile,   F) == false)   ok = false;
+  if (writeString(outputFile,  F) == false)   ok = false;
+  if (writeString(sgeJobName,  F) == false)   ok = false;
+  if (writeString(sgeBuildOpt, F) == false)   ok = false;
+  if (writeString(sgeMergeOpt, F) == false)   ok = false;
+
+  for (uint32 i=0; i<mergeFilesLen; i++)
+    if (writeString(mergeFiles[i], F) == false)
+      ok = false;
+
+  // fclose() flushes buffered data, so a failure here is a write failure too.
+  if (fclose(F) != 0)
+    ok = false;
+
+  if (ok == false)
+    fprintf(stderr, "merylArgs::writeConfig()-- Failed to write '%s'.\n", filename);
+
+  delete [] filename;
+
+  return(ok);
+}
diff --git a/meryl/asmMerQC-regions.C b/meryl/asmMerQC-regions.C
new file mode 100644
index 0000000..067f820
--- /dev/null
+++ b/meryl/asmMerQC-regions.C
@@ -0,0 +1,1023 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+// This reads the assembly frgctg, varctg and merQC badmers, computes
+// the number and location of bad-mer, bad-var regions, and their
+// depth, in contig space.
+//
+// File paths are hardcoded.
+// This code ONLY works on 64-bit hardware, but it's easy to fix.
+
+using namespace std;
+#include <map>
+
+//
+// Begin code from Bri's intervalList.H, intervalList.C, splitToWords.H
+//
+// Integer aliases used throughout this file.  uint64 is plain 'unsigned long', so it
+// is only 64 bits wide where long is; the file header notes the 64-bit assumption.
+typedef unsigned long uint64;
+typedef unsigned int uint32;
+
+// printf conversions matching the aliases above; uint32FMTW(X) adds a field width of X.
+#define uint64FMT "%lu"
+#define uint32FMT "%u"
+#define uint32FMTW(X) "%" #X "u"
+
+// Base-10 string to integer.  N is the text and O is passed to strtoul() as its
+// end-pointer argument.
+#define strtouint32(N,O) (uint32)strtoul(N, O, 10)
+#define strtouint64(N,O) (uint64)strtoul(N, O, 10)
+
+// Splits a string into words separated by spaces or tabs, much like argv.
+//
+// split() copies the input into an internal buffer, replaces every space and tab
+// with a NUL, and records a pointer to the start of each word.  Other whitespace,
+// such as a newline, is not a separator and stays inside the word.  The buffers
+// are kept between calls and only reallocated when a longer input arrives.
+//
+// Pointers returned by getWord() and operator[] refer to the internal buffer; the
+// next split() overwrites or reallocates it.  Neither accessor checks the index.
+// The class owns raw pointers and declares no copy constructor or assignment
+// operator, so objects must not be copied.
+//
+class splitToWords {
+public:
+ splitToWords() {
+ _argWords = 0;
+ _maxWords = 0;
+ _arg = 0L;
+ _maxChars = 0;
+ _cmd = 0L;
+ };
+ splitToWords(char *cmd) {
+ _argWords = 0;
+ _maxWords = 0;
+ _arg = 0L;
+ _maxChars = 0;
+ _cmd = 0L;
+
+ split(cmd);
+ };
+ ~splitToWords() {
+ delete [] _cmd;
+ delete [] _arg;
+ };
+
+
+ // Splits cmd, replacing the result of any earlier split. cmd is not modified.
+ void split(char *cmd) {
+
+ // Step Zero:
+ //
+ // Count the length of the string, in words and in characters.
+ // For simplicity, we overcount words, by just counting white-space.
+ //
+ // Then, allocate space for a temporary copy of the string, and a
+ // set of pointers into the temporary copy (much like argv).
+ //
+ uint32 cmdChars = 1; // 1 == Space for terminating 0
+ uint32 cmdWords = 2; // 2 == Space for first word and terminating 0L
+
+ for (char *tmp=cmd; *tmp; tmp++) {
+ cmdWords += *tmp == ' ';
+ cmdWords += *tmp == '\t';
+ cmdChars++;
+ }
+
+ if (cmdChars > _maxChars) {
+ delete [] _cmd;
+ _cmd = new char [cmdChars];
+ _maxChars = cmdChars;
+ }
+ if (cmdWords > _maxWords) {
+ delete [] _arg;
+ _arg = new char * [cmdWords];
+ _maxWords = cmdWords;
+ }
+
+ _argWords = 0;
+
+ // Step One:
+ //
+ // Determine where the words are in the command string, copying the
+ // string to _cmd and storing words in _arg.
+ //
+ bool isFirst = true;
+ char *cmdI = cmd;
+ char *cmdO = _cmd;
+
+ while (*cmdI) {
+
+ // If we are at a non-space character, we are in a word. If
+ // this is the first character in the word, save the word in
+ // the args list.
+ //
+ // Otherwise we are at a space and thus not in a word. Make
+ // all spaces be string terminators, and declare that we are
+ // at the start of a word.
+ //
+ if ((*cmdI != ' ') && (*cmdI != '\t')) {
+ *cmdO = *cmdI;
+
+ if (isFirst) {
+ _arg[_argWords++] = cmdO;
+ isFirst = false;
+ }
+ } else {
+ *cmdO = 0;
+ isFirst = true;
+ }
+
+ cmdI++;
+ cmdO++;
+ }
+
+ // Finish off the list by terminating the last arg, and
+ // terminating the list of args.
+ //
+ *cmdO = 0;
+ _arg[_argWords] = 0L;
+ };
+
+
+ // Number of words found by the last split(); word i is valid for i < numWords().
+ uint32 numWords(void) { return(_argWords); };
+ char *getWord(uint32 i) { return(_arg[i]); };
+ char *operator[](uint32 i) { return(_arg[i]); };
+private:
+ uint32 _argWords; // words found by the last split()
+ uint32 _maxWords; // allocated entries in _arg
+ char **_arg; // word start pointers into _cmd, null terminated
+ uint32 _maxChars; // allocated bytes in _cmd
+ char *_cmd; // private copy of the input with separators turned into NULs
+};
+
+
+
+
+// Coordinate type for interval endpoints.
+typedef uint64 intervalNumber;
+
+// One interval. intervalList::add() stores lo = position and hi = position + length.
+struct _intervalPair {
+ intervalNumber lo;
+ intervalNumber hi;
+};
+
+// One interval plus a count, as held by intervalDepth.
+// NOTE(review): 'de' is presumably the depth of coverage over [lo,hi) -- confirm
+// against the intervalDepth constructor.
+struct _intervalDepth {
+ intervalNumber lo;
+ intervalNumber hi;
+ uint32 de;
+};
+
+
+// A growable array of [lo,hi) intervals with sort, merge, invert and set-like
+// operations.
+//
+// _isSorted and _isMerged record whether the array is currently ordered by lo and
+// free of overlapping or adjacent entries; invert() refuses to run unless both are
+// set.  The class owns a raw array and declares an assignment operator but no copy
+// constructor, so objects should not be copy-constructed.
+//
+class intervalList {
+public:
+ intervalList();
+ ~intervalList();
+
+ intervalList &operator=(intervalList &src);
+
+ // Clear a list
+ // (the allocation is kept; only the length and the two flags are reset)
+ void clear(void) {
+ _isSorted = true;
+ _isMerged = true;
+ _listLen = 0;
+ }
+
+ // Insert a new interval into the list
+ void add(intervalNumber position, intervalNumber length);
+
+ // Sort the set of intervals by the lo value
+ void sort(void);
+
+ // Merge overlapping or adjacent intervals together.
+ void merge(void);
+
+ // Replace the list with the gaps between its intervals, bounded by lo and hi.
+ void invert(intervalNumber lo, intervalNumber hi);
+
+ // Returns the number of intervals
+ uint32 numberOfIntervals(void) {
+ return(_listLen);
+ };
+
+ // Returns the sum of the length of all intervals
+ // (overlapping intervals are counted once each unless merge() was called first)
+ intervalNumber sumOfLengths(void) {
+ intervalNumber len = 0;
+ uint32 i = numberOfIntervals();
+
+ if (i > 0)
+ while (i--)
+ len += _list[i].hi - _list[i].lo;
+
+ return(len);
+ };
+
+ // Populates an array with the indices of the intervals that touch or overlap
+ // the supplied interval. The array is allocated on first use (when 'intervals'
+ // is null) and grown as needed. Returns the number of indices stored, which is
+ // also left in intervalsLen.
+ //
+ uint32 overlapping(intervalNumber lo,
+ intervalNumber hi,
+ uint32 *&intervals,
+ uint32 &intervalsLen,
+ uint32 &intervalsMax);
+
+ // Populates this intervalList with the intersection of A and B.
+ // This intervalList is not cleared prior to adding new intervals.
+ //
+ // Both A and B call merge().
+ //
+ void intersect(intervalList &A,
+ intervalList &B);
+
+ // Populates this intervalList with regions in A that are completely
+ // contained in a region in B.
+ //
+ // Both A and B call merge().
+ //
+ void contained(intervalList &A,
+ intervalList &B);
+
+
+ // Endpoints of interval i; the index is not range checked.
+ intervalNumber lo(uint32 i) { return(_list[i].lo); };
+ intervalNumber hi(uint32 i) { return(_list[i].hi); };
+
+private:
+ bool _isSorted;
+ bool _isMerged;
+ uint32 _listLen; // intervals in use
+ uint32 _listMax; // intervals allocated
+ _intervalPair *_list;
+};
+
+
+
+// Takes as input an intervalList, computes the number of intervals
+// covering every position in there, stores this as a new set of
+// intervals, annotated with the depth.
+//
+// This is a static object, initialized once by the intervalList.
+//
+class intervalDepth {
+public:
+ // Built once from IL; no member declared here modifies the result afterwards.
+ intervalDepth(intervalList &IL);
+ ~intervalDepth();
+
+ // Returns the number of intervals
+ uint32 numberOfIntervals(void) {
+ return(_listLen);
+ };
+
+ // Endpoints and stored 'de' value of interval i; the index is not range checked.
+ intervalNumber lo(uint32 i) { return(_list[i].lo); };
+ intervalNumber hi(uint32 i) { return(_list[i].hi); };
+ uint32 de(uint32 i) { return(_list[i].de); };
+
+private:
+ uint32 _listLen;
+ uint32 _listMax;
+ _intervalDepth *_list;
+};
+
+
+// Creates an empty list, flagged as sorted and merged, with room for 16 intervals.
+//
+intervalList::intervalList() :
+  _isSorted(true),
+  _isMerged(true),
+  _listLen(0),
+  _listMax(16),
+  _list(0L) {
+  _list = new _intervalPair [_listMax];
+}
+
+
+// Releases the interval array.
+intervalList::~intervalList() {
+ delete [] _list;
+}
+
+
+// Copies the intervals and the sorted/merged flags from src.
+//
+// The destination array is reallocated only when src has a larger capacity, so the
+// capacity of this list never shrinks.  Assigning a list to itself is a no-op;
+// without that check memcpy() would be given identical source and destination,
+// which it does not permit.
+//
+intervalList &
+intervalList::operator=(intervalList &src) {
+
+  if (this == &src)
+    return(*this);
+
+  _isSorted = src._isSorted;
+  _isMerged = src._isMerged;
+  _listLen  = src._listLen;
+
+  if (_listMax < src._listMax) {
+    delete [] _list;
+    _listMax = src._listMax;
+    _list    = new _intervalPair [_listMax];
+  }
+
+  memcpy(_list, src._list, _listLen * sizeof(_intervalPair));
+
+  return(*this);
+}
+
+
+// Appends the interval [position, position + length) to the list, doubling the
+// array when it is full.
+//
+// When the new interval starts before the previously added one, the list is
+// flagged as neither sorted nor merged.
+// NOTE(review): an interval that starts at or after the previous lo but still
+// overlaps it leaves _isMerged set, so invert() accepts such a list -- confirm
+// whether callers always merge() first.
+//
+void
+intervalList::add(intervalNumber position, intervalNumber length) {
+
+  // Grow by doubling once every slot is in use.
+  if (_listLen >= _listMax) {
+    _listMax *= 2;
+
+    _intervalPair *bigger = new _intervalPair [_listMax];
+    memcpy(bigger, _list, sizeof(_intervalPair) * _listLen);
+
+    delete [] _list;
+    _list = bigger;
+  }
+
+  _intervalPair &slot = _list[_listLen];
+
+  slot.lo = position;
+  slot.hi = position + length;
+
+  bool outOfOrder = (_listLen > 0) && (_list[_listLen-1].lo > slot.lo);
+
+  if (outOfOrder) {
+    _isSorted = false;
+    _isMerged = false;
+  }
+
+  _listLen++;
+}
+
+
+// qsort() comparator for _intervalPair: orders by lo, then by hi.
+//
+static
+int
+intervalList_sort_helper(const void *a, const void *b) {
+  const _intervalPair *left  = (const _intervalPair *)a;
+  const _intervalPair *right = (const _intervalPair *)b;
+
+  if (left->lo != right->lo)
+    return((left->lo < right->lo) ? -1 : 1);
+
+  if (left->hi != right->hi)
+    return((left->hi < right->hi) ? -1 : 1);
+
+  return(0);
+}
+
+
+// Orders the intervals by lo, then hi.  Does nothing when the list is already
+// flagged as sorted.
+//
+void
+intervalList::sort(void) {
+
+  if (_isSorted == false) {
+    // zero or one interval is trivially in order
+    if (_listLen > 1)
+      qsort(_list, _listLen, sizeof(_intervalPair), intervalList_sort_helper);
+
+    _isSorted = true;
+  }
+}
+
+
+// Sorts the list, then collapses overlapping or adjacent intervals in place.
+//
+// thisInterval indexes the interval being built and nextInterval the candidate
+// being examined. A candidate whose lo is at or below the current hi is absorbed,
+// otherwise it is moved down to become the next interval being built. The list is
+// finally truncated to the surviving prefix and flagged as merged.
+//
+// A list with fewer than two intervals is returned untouched: it is neither sorted
+// nor are its flags changed.
+//
+void
+intervalList::merge(void) {
+ uint32 thisInterval = 0;
+ uint32 nextInterval = 1;
+
+ if (_listLen < 2)
+ return;
+
+ sort();
+
+ while (nextInterval < _listLen) {
+
+ if ((_list[thisInterval].lo == 0) &&
+ (_list[thisInterval].hi == 0)) {
+
+ // Our interval is empty. Copy in the interval we are
+ // examining and move to the next.
+
+ // XXX This is probably useless, thisInterval should always be
+ // valid.
+
+ _list[thisInterval].lo = _list[nextInterval].lo;
+ _list[thisInterval].hi = _list[nextInterval].hi;
+
+ _list[nextInterval].lo = 0;
+ _list[nextInterval].hi = 0;
+ nextInterval++;
+ } else {
+
+ // This interval is valid. See if it overlaps with the next
+ // interval.
+
+ if (_list[thisInterval].hi >= _list[nextInterval].lo) {
+
+ // Got an intersection.
+
+ // Merge nextInterval into thisInterval -- the hi range
+ // is extended if the nextInterval range is larger.
+ //
+ if (_list[thisInterval].hi < _list[nextInterval].hi)
+ _list[thisInterval].hi = _list[nextInterval].hi;
+
+ // Clear the just merged nextInterval and move to the next one.
+ //
+ _list[nextInterval].lo = 0;
+ _list[nextInterval].hi = 0;
+ nextInterval++;
+ } else {
+
+ // No intersection. Move along. Nothing to see here.
+
+ // If there is a gap between the target and the examine (we
+ // must have merged sometime in the past), copy examine to
+ // the next target.
+
+ thisInterval++;
+
+ if (thisInterval != nextInterval) {
+ _list[thisInterval].lo = _list[nextInterval].lo;
+ _list[thisInterval].hi = _list[nextInterval].hi;
+ }
+
+ nextInterval++;
+ }
+ }
+ }
+
+ // Everything past thisInterval has been absorbed or moved down.
+ if (thisInterval+1 < _listLen)
+ _listLen = thisInterval + 1;
+
+ _isMerged = true;
+}
+
+
+// Replaces the list with its complement inside [lo, hi): the gap before the first
+// interval, the gaps between consecutive intervals, and the gap after the last.
+//
+// The list must be flagged sorted and merged; otherwise a message is printed and
+// the process exits.
+//
+// An empty list inverts to the single interval [lo, hi), or stays empty when lo is
+// not below hi.  Without that case the code would read _list[0] and
+// _list[_listLen-1] of an empty list, and the second index wraps around.
+//
+void
+intervalList::invert(intervalNumber lo, intervalNumber hi) {
+
+  if (!_isSorted || !_isMerged) {
+    fprintf(stderr, "intervalList::invert()-- ERROR! List is not sorted or not merged!\n");
+    exit(1);
+  }
+
+  // Create a new list to store the inversion.  At most one gap precedes each
+  // interval and one more follows the last.
+  //
+  uint32          invLen = 0;
+  uint32          invMax = _listLen + 2;
+  _intervalPair  *inv    = new _intervalPair [invMax];
+
+  if (_listLen == 0) {
+
+    // Nothing is covered, so everything in range is a gap.
+    //
+    if (lo < hi) {
+      inv[invLen].lo = lo;
+      inv[invLen].hi = hi;
+      invLen++;
+    }
+
+  } else {
+
+    // Add the first
+    //
+    if (lo < _list[0].lo) {
+      inv[invLen].lo = lo;
+      inv[invLen].hi = _list[0].lo;
+      invLen++;
+    }
+
+    // Add the pieces
+    //
+    for (uint32 i=1; i<_listLen; i++) {
+      if (_list[i-1].hi < _list[i].lo) {
+        inv[invLen].lo = _list[i-1].hi;
+        inv[invLen].hi = _list[i].lo;
+        invLen++;
+      }
+    }
+
+    // Add the last
+    //
+    if (_list[_listLen-1].hi < hi) {
+      inv[invLen].lo = _list[_listLen-1].hi;
+      inv[invLen].hi = hi;
+      invLen++;
+    }
+  }
+
+  // Nuke the old list, swap in the new one
+  //
+  delete [] _list;
+
+  _list    = inv;
+  _listLen = invLen;
+  _listMax = invMax;
+}
+
+
+
+// Collects the indices of every interval that touches or overlaps
+// [rangelo, rangehi]; both comparisons are inclusive, so an interval that merely
+// abuts the range is reported.
+//
+// 'intervals' is allocated with 256 entries when it is null, and doubled whenever
+// it fills up.  intervalsLen is reset to zero on entry and returned.  This is a
+// linear scan, so the list does not need to be sorted.
+//
+uint32
+intervalList::overlapping(intervalNumber rangelo,
+                          intervalNumber rangehi,
+                          uint32 *&intervals,
+                          uint32 &intervalsLen,
+                          uint32 &intervalsMax) {
+
+  if (intervals == 0L) {
+    intervalsMax = 256;
+    intervals    = new uint32 [intervalsMax];
+  }
+
+  intervalsLen = 0;
+
+  for (uint32 idx=0; idx<_listLen; idx++) {
+    const _intervalPair &candidate = _list[idx];
+
+    // Skip anything that ends before the range starts or starts after it ends.
+    if ((rangelo > candidate.hi) || (rangehi < candidate.lo))
+      continue;
+
+    if (intervalsLen >= intervalsMax) {
+      intervalsMax *= 2;
+
+      uint32 *bigger = new uint32 [intervalsMax];
+      memcpy(bigger, intervals, sizeof(uint32) * intervalsLen);
+
+      delete [] intervals;
+      intervals = bigger;
+    }
+
+    intervals[intervalsLen++] = idx;
+  }
+
+  return(intervalsLen);
+}
+
+
+
+void
+intervalList::intersect(intervalList &A,
+                        intervalList &B) {
+
+  // Add to this list every region shared by an interval of A and an
+  // interval of B. Both inputs are merged first and then walked in
+  // step, always advancing whichever current interval ends first.
+
+  A.merge();
+  B.merge();
+
+  uint32 apos = 0;
+  uint32 bpos = 0;
+
+  while ((apos < A.numberOfIntervals()) &&
+         (bpos < B.numberOfIntervals())) {
+    uint32 abgn = A.lo(apos);
+    uint32 aend = A.hi(apos);
+    uint32 bbgn = B.lo(bpos);
+    uint32 bend = B.hi(bpos);
+
+    // The shared region, if any, runs from the later of the two starts
+    // to the earlier of the two ends. When the intervals do not
+    // overlap the start is not below the end and nothing is added.
+    //
+    uint32 obgn = (abgn < bbgn) ? bbgn : abgn;
+    uint32 oend = (aend < bend) ? aend : bend;
+
+    if (obgn < oend)
+      add(obgn, oend - obgn);
+
+    // Step past whichever interval ends first; step past both when
+    // they end at exactly the same place.
+    //
+    if (aend <= bend)
+      apos++;
+    if (bend <= aend)
+      bpos++;
+  }
+}
+
+void
+intervalList::contained(intervalList &A,
+                        intervalList &B) {
+
+  // For every interval of A that lies completely inside an interval of
+  // B, add that enclosing B interval to this list. Both inputs are
+  // merged first and then walked in step. The mirrored case (a B
+  // interval inside an A interval) is not reported.
+
+  A.merge();
+  B.merge();
+
+  uint32 apos = 0;
+  uint32 bpos = 0;
+
+  while ((apos < A.numberOfIntervals()) &&
+         (bpos < B.numberOfIntervals())) {
+    uint32 abgn = A.lo(apos);
+    uint32 aend = A.hi(apos);
+    uint32 bbgn = B.lo(bpos);
+    uint32 bend = B.hi(bpos);
+
+    bool insideB = (bbgn <= abgn) && (aend <= bend);
+
+    if (insideB)
+      add(bbgn, bend - bbgn);
+
+    // Step past whichever interval ends first; step past both when
+    // they end at exactly the same place.
+    //
+    if (aend <= bend)
+      apos++;
+    if (bend <= aend)
+      bpos++;
+  }
+}
+
+
+
+
+
+
+static
+int
+intervalDepth_sort_helper(const void *a, const void *b) {
+
+  // qsort() comparator: orders _intervalDepth records by ascending lo.
+  // Records with equal lo compare equal, whatever their other fields.
+
+  const _intervalDepth *lhs = (const _intervalDepth *)a;
+  const _intervalDepth *rhs = (const _intervalDepth *)b;
+
+  // Yields -1, 0 or 1 without a chain of ifs.
+  return((lhs->lo > rhs->lo) - (lhs->lo < rhs->lo));
+}
+
+
+intervalDepth::intervalDepth(intervalList &IL) {
+
+  // Build depth-of-coverage intervals from IL. Every input interval
+  // contributes two events, one at its lo (depth goes up) and one at
+  // its hi (depth goes down); the events are sorted by position and
+  // swept, starting a new output interval whenever the position changes.
+
+  // With no input there are no events. Build an empty result instead
+  // of reading id[0] below, which would be out of bounds.
+  //
+  if (IL.numberOfIntervals() == 0) {
+    _listLen = 0;
+    _listMax = 1;
+    _list = new _intervalDepth [_listMax];
+
+    _list[0].lo = 0;
+    _list[0].hi = 0;
+    _list[0].de = 0;
+
+    return;
+  }
+
+  uint32 idlen = IL.numberOfIntervals() * 2;
+  _intervalDepth *id = new _intervalDepth [idlen];
+
+  // In this temporary array 'lo' is the position of the event and 'de'
+  // says whether it opens (1) or closes (0) an interval; 'hi' is unused.
+  //
+  for (uint32 i=0; i<IL.numberOfIntervals(); i++) {
+    id[2*i ].lo = IL.lo(i);
+    id[2*i ].hi = 0;
+    id[2*i ].de = 1;
+    id[2*i+1].lo = IL.hi(i);
+    id[2*i+1].hi = 0;
+    id[2*i+1].de = 0;
+  }
+
+  qsort(id, idlen, sizeof(_intervalDepth), intervalDepth_sort_helper);
+
+  // Scan the list, counting how many times we change depth.
+  //
+  _listMax = 1;
+  for (uint32 i=1; i<idlen; i++) {
+    if (id[i-1].lo != id[i].lo)
+      _listMax++;
+  }
+
+  // Allocate the real depth of coverage intervals
+  //
+  _listLen = 0;
+  _list = new _intervalDepth [_listMax];
+
+  // Build new intervals
+  //
+  // Initialize the first interval
+  //
+  _list[_listLen].lo = id[0].lo;
+  _list[_listLen].hi = id[0].lo;
+  _list[_listLen].de = 1;
+
+  for (uint32 i=1; i<idlen; i++) {
+
+    if (_list[_listLen].de == 0) {
+      // Update the start position if the current interval is at zero
+      // depth.
+      //
+      _list[_listLen].lo = id[i].lo;
+    } else {
+
+      // If we are at a position different from the start, we need to
+      // close out the current interval and make a new one.
+      //
+      if (id[i-1].lo != id[i].lo) {
+        _list[_listLen].hi = id[i].lo;
+
+        _listLen++;
+
+        _list[_listLen].lo = id[i].lo;
+        _list[_listLen].hi = id[i].lo;
+        _list[_listLen].de = _list[_listLen-1].de;
+      }
+    }
+
+    // Finally, update the depth of the current interval
+    //
+    if (id[i].de)
+      _list[_listLen].de++;
+    else
+      _list[_listLen].de--;
+  }
+
+  // Toss out the last one if it's zero length -- I think it's always
+  // zero length, just can't convince myself.
+  //
+  // NOTE(review): at this point _listLen is the index of the interval
+  // still under construction, so the finished intervals are
+  // 0.._listLen-1. If numberOfIntervals() simply returns _listLen, this
+  // decrement hides the last finished interval as well -- confirm
+  // against the class definition before relying on the count.
+  //
+  if (_list[_listLen].lo == _list[_listLen].hi)
+    _listLen--;
+
+  delete [] id;
+}
+
+intervalDepth::~intervalDepth() {
+  // Release the interval array allocated by the constructor.
+  delete [] _list;
+}
+
+//
+// End code from Bri's libutil/util++.H
+//
+
+
+
+void
+readDepth(char *depthname, map<uint64,intervalDepth*> &lowCoverage) {
+
+  // Read the file 'depthname' line by line. Each line is split with
+  // splitToWords and words 1, 2 and 3 are parsed as uid, begin and end.
+  // The intervals are gathered per uid, and once the whole file is read
+  // each uid gets a freshly allocated intervalDepth stored in
+  // lowCoverage (the caller owns those objects).
+  //
+  // Exits the program if the file cannot be opened.
+
+  char line[1024] = {0};
+  map<uint64,intervalList*> ILs;
+
+  fprintf(stderr, "Reading depth from '%s'\n", depthname);
+
+  // Test the returned stream, not errno: errno is only meaningful after
+  // a call has actually reported failure.
+  //
+  errno = 0;
+  FILE *F = fopen(depthname, "r");
+  if (F == 0L)
+    fprintf(stderr, "failed to open '%s': %s\n", depthname, strerror(errno)), exit(1);
+
+  uint32 i=0;
+
+  // Loop on the result of fgets() itself. Testing feof() after the read
+  // would skip a final line that has no trailing newline, and would
+  // never terminate on a read error.
+  //
+  while (fgets(line, sizeof(line), F) != 0L) {
+    splitToWords W(line);
+
+    uint64 uid = strtouint64(W[1], 0L);
+    uint32 beg = strtouint32(W[2], 0L);
+    uint32 end = strtouint32(W[3], 0L);
+
+    // Only reported; the record is still added, as before.
+    if (beg > end)
+      fprintf(stderr, "ERROR: l="uint32FMT" h="uint32FMT"\n", beg, end);
+
+    if (ILs[uid] == 0L)
+      ILs[uid] = new intervalList();
+    ILs[uid]->add(beg, end - beg);
+
+    i++;
+  }
+
+  fclose(F);
+  fprintf(stderr, " "uint32FMT" lines.\n", i);
+
+  // Convert every per-uid interval list to depth intervals; the
+  // temporary lists are no longer needed afterwards.
+  //
+  map<uint64,intervalList*>::iterator it = ILs.begin();
+  map<uint64,intervalList*>::iterator ed = ILs.end();
+
+  while (it != ed) {
+    lowCoverage[it->first] = new intervalDepth(*it->second);
+    delete it->second;
+    it->second = 0L;
+    it++;
+  }
+}
+
+
+void
+readVariation(char *depthname, map<uint64,intervalList*> &variation) {
+
+  // Read the file 'depthname' line by line. Each line is split with
+  // splitToWords and words 1, 2 and 3 are parsed as uid, begin and end;
+  // the interval is added to the list stored for that uid in
+  // 'variation', creating the list on first use (the caller owns the
+  // lists).
+  //
+  // Exits the program if the file cannot be opened.
+
+  char line[1024 * 1024] = {0};
+
+  fprintf(stderr, "Reading variation from '%s'\n", depthname);
+
+  // Test the returned stream, not errno: errno is only meaningful after
+  // a call has actually reported failure.
+  //
+  errno = 0;
+  FILE *F = fopen(depthname, "r");
+  if (F == 0L)
+    fprintf(stderr, "failed to open '%s': %s\n", depthname, strerror(errno)), exit(1);
+
+  uint32 i=0;
+
+  // Loop on the result of fgets() itself. Testing feof() after the read
+  // would skip a final line that has no trailing newline, and would
+  // never terminate on a read error.
+  //
+  while (fgets(line, sizeof(line), F) != 0L) {
+    splitToWords W(line);
+
+    uint64 uid = strtouint64(W[1], 0L);
+    uint32 beg = strtouint32(W[2], 0L);
+    uint32 end = strtouint32(W[3], 0L);
+
+    if (variation[uid] == 0L)
+      variation[uid] = new intervalList();
+    variation[uid]->add(beg, end - beg);
+
+    i++;
+  }
+
+  fclose(F);
+  fprintf(stderr, " "uint32FMT" lines.\n", i);
+}
+
+
+void
+readBadMers(char *depthname, map<uint64,intervalList*> &badMers) {
+
+  // Read the file 'depthname' line by line. Each line is split with
+  // splitToWords; the uid is taken from the digits of word 0, and words
+  // 3 and 4 are parsed as begin and end. The interval is added to the
+  // list stored for that uid in 'badMers', creating the list on first
+  // use (the caller owns the lists).
+  //
+  // Exits the program if the file cannot be opened.
+
+  char line[1024] = {0};
+
+  fprintf(stderr, "Reading badMers from '%s'\n", depthname);
+
+  // Test the returned stream, not errno: errno is only meaningful after
+  // a call has actually reported failure.
+  //
+  errno = 0;
+  FILE *F = fopen(depthname, "r");
+  if (F == 0L)
+    fprintf(stderr, "failed to open '%s': %s\n", depthname, strerror(errno)), exit(1);
+
+  uint32 i=0;
+
+  // Loop on the result of fgets() itself. Testing feof() after the read
+  // would skip a final line that has no trailing newline, and would
+  // never terminate on a read error.
+  //
+  while (fgets(line, sizeof(line), F) != 0L) {
+    splitToWords W(line);
+
+    // Change every non-digit to a space in the first word. Walking to
+    // the terminating NUL covers the whole word and is safe for an
+    // empty word, where an index starting at strlen()-1 would wrap
+    // around. The cast keeps isdigit() defined for characters with the
+    // high bit set.
+    //
+    for (char *c = W[0]; *c; c++)
+      if (!isdigit((unsigned char)*c))
+        *c = ' ';
+
+    uint64 uid = strtouint64(W[0], 0L);
+    uint32 beg = strtouint32(W[3], 0L);
+    uint32 end = strtouint32(W[4], 0L);
+
+    if (badMers[uid] == 0L)
+      badMers[uid] = new intervalList();
+    badMers[uid]->add(beg, end - beg);
+
+    i++;
+  }
+
+  fclose(F);
+  fprintf(stderr, " "uint32FMT" lines.\n", i);
+}
+
+
+
+int
+main(int argc, char **argv) {
+
+  // Load depth, variation and bad-mer regions, intersect the variation
+  // and bad-mer regions of every uid, and for each intersected region
+  // report the depth found at its begin and at its end. Histograms of
+  // those depths are printed at the end.
+
+  map<uint64,intervalList*> badMers;
+  map<uint64,intervalList*> variation;
+  map<uint64,intervalDepth*> lowCoverage;
+
+  bool showDepthIntersect = false;
+  bool showVariantIntersect = false;
+  bool showVarDepthIntersect = false;
+
+  int arg=1;
+  while (arg < argc) {
+    if (strcmp(argv[arg], "-D") == 0) {
+      // The usage text documents "-D debugfile". The file name is not
+      // used, but it has to be consumed here, otherwise it is rejected
+      // as an unknown option on the next pass.
+      if (arg + 1 < argc)
+        arg++;
+    } else if (strcmp(argv[arg], "-pd") == 0) {
+      showDepthIntersect = true;
+    } else if (strcmp(argv[arg], "-pv") == 0) {
+      showVariantIntersect = true;
+    } else if (strcmp(argv[arg], "-pvd") == 0) {
+      showVarDepthIntersect = true;
+    } else {
+      fprintf(stderr, "usage: %s [-D debugfile] [-pd] [-pv] [-pvd]\n", argv[0]);
+      fprintf(stderr, " -pd print bad mers regions isect depth\n");
+      fprintf(stderr, " -pv print bad mers regions isect variants\n");
+      fprintf(stderr, " -pvd print bad mers regions isect both variants and depth\n");
+      exit(1);
+    }
+    arg++;
+  }
+
+#if 1
+  // HuRef6, in the assembly directory.
+  //
+  readDepth ("/project/huref6/assembly/h6/9-terminator/h6.posmap.frgctg", lowCoverage);
+  readVariation("/project/huref6/assembly/h6/9-terminator/h6.posmap.varctg", variation);
+  readBadMers ("/project/huref6/assembly/h6-mer-validation/h6-ms22-allfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers", badMers);
+#endif
+
+#if 0
+  // HuRef6, ws=25, in the assembly directory.
+  //
+  readDepth ("/project/huref6/assembly/h6/9-terminator-ws25/h6.posmap.frgctg", lowCoverage);
+  readVariation("/project/huref6/assembly/h6/9-terminator-ws25/h6.posmap.varctg", variation);
+  readBadMers ("/project/huref6/assembly/h6-mer-validation/h6-version4-ws25/h6-ms22-allfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers", badMers);
+#endif
+
+#if 0
+  // Our scratch huref
+  //
+  readDepth ("/project/huref6/redo_consensus-gennady/mer-validation/h6tmp.posmap.frgctg", lowCoverage);
+  readVariation("/project/huref6/redo_consensus-gennady/mer-validation/h6tmp.posmap.varctg", variation);
+  readBadMers ("/project/huref6/redo_consensus-gennady/mer-validation/h6tmp-ms22-allfrags-allcontigs.badmers.0.singlecontig.zerofrag.badmers", badMers);
+#endif
+
+  // Histograms of the depth seen at the begin / end of each region.
+  // Depths of maxDepth or more do not fit and are left out of them.
+  //
+  const uint32 maxDepth = 1024;
+
+  uint32 badBegDepth[maxDepth] = {0};
+  uint32 badEndDepth[maxDepth] = {0};
+
+  uint32 badDepth[32][32];
+  for (uint32 i=0; i<32; i++)
+    for (uint32 j=0; j<32; j++)
+      badDepth[i][j] = 0;
+
+  map<uint64,intervalList*>::iterator it = badMers.begin();
+  map<uint64,intervalList*>::iterator ed = badMers.end();
+  while (it != ed) {
+    uint64 uid = it->first;
+
+    // operator[] yields a null pointer for a uid the other inputs never
+    // mentioned, so every one of these has to be checked before use.
+    //
+    intervalList *Iv = variation[uid];
+    intervalList *Ib = badMers[uid];
+    intervalList *Ii = 0L;
+    intervalDepth *Id = lowCoverage[uid];
+
+    if (Iv)
+      Iv->merge();
+    if (Ib)
+      Ib->merge();
+
+    if (Iv && Ib) {
+      Ii = new intervalList();
+      Ii->intersect(*Iv, *Ib);
+    }
+
+    // Without depth information for this uid there is nothing to look
+    // up, so the uid is skipped rather than dereferencing a null Id.
+    //
+    if (Ii && Id) {
+      uint32 ii = 0;
+      uint32 id = 0;
+
+      while ((ii < Ii->numberOfIntervals()) &&
+             (id < Id->numberOfIntervals())) {
+
+        // We want to count the number of times a badmer region
+        // begins/ends in some depth.
+
+        uint32 beg = 0;
+        uint32 end = 0;
+
+        // Low points are not allowed to be equal to high points, skip to the next
+        while ((id < Id->numberOfIntervals()) &&
+               (Id->hi(id) <= Ii->lo(ii))) {
+          id++;
+        }
+        if (id < Id->numberOfIntervals()) {
+          uint32 lo = Id->lo(id);
+          uint32 hi = Id->hi(id);
+
+          // Low points are not allowed to be equal to high points.
+          if ((lo <= Ii->lo(ii)) && (Ii->lo(ii) < hi)) {
+            beg = Id->de(id);
+          } else {
+            fprintf(stderr, "failed to find begin "uint32FMT" "uint32FMT" -- "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                    Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id), Id->de(id));
+            if (id > 0)
+              fprintf(stderr, " "uint32FMT" "uint32FMT" -- "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                      Ii->lo(ii), Ii->hi(ii), Id->lo(id-1), Id->hi(id-1), Id->de(id-1));
+          }
+        }
+
+        // High points can be equal.
+        while ((id < Id->numberOfIntervals()) &&
+               (Id->hi(id) < Ii->hi(ii))) {
+          id++;
+        }
+        if (id < Id->numberOfIntervals()) {
+          uint32 lo = Id->lo(id);
+          uint32 hi = Id->hi(id);
+
+          // High points aren't allowed to be equal to lo, but can be equal to hi.
+          if ((lo < Ii->hi(ii)) && (Ii->hi(ii) <= hi)) {
+            end = Id->de(id);
+          } else {
+            fprintf(stderr, "failed to find end "uint32FMT" "uint32FMT" -- "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                    Ii->lo(ii), Ii->hi(ii), Id->lo(id), Id->hi(id), Id->de(id));
+            if (id > 0)
+              fprintf(stderr, " "uint32FMT" "uint32FMT" -- "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                      Ii->lo(ii), Ii->hi(ii), Id->lo(id-1), Id->hi(id-1), Id->de(id-1));
+          }
+        }
+
+        // The depth comes straight from the data, so keep it from
+        // indexing past the end of the histograms.
+        if (beg < maxDepth)
+          badBegDepth[beg]++;
+        if (end < maxDepth)
+          badEndDepth[end]++;
+
+        fprintf(stdout, uint64FMT"\t"uint32FMT"\t"uint32FMT"\tdepth="uint32FMT","uint32FMT"\n",
+                uid, Ii->lo(ii), Ii->hi(ii), beg, end);
+
+        if ((beg < 32) && (end < 32))
+          badDepth[beg][end]++;
+
+        ii++;
+      }
+    }
+
+    // The intersection is only needed for this uid.
+    delete Ii;
+
+    it++;
+  }
+
+  // Only the first 32 depths are printed and summed.
+  //
+  uint32 bb = 0;
+  uint32 be = 0;
+  for (uint32 x=0; x<32; x++) {
+    fprintf(stdout, uint32FMT"\t"uint32FMT"\t"uint32FMT"\n", x, badBegDepth[x], badEndDepth[x]);
+    bb += badBegDepth[x];
+    be += badEndDepth[x];
+  }
+  fprintf(stdout, "total\t"uint32FMT"\t"uint32FMT"\n", bb, be);
+
+  for (uint32 i=0; i<30; i++) {
+    for (uint32 j=0; j<30; j++)
+      fprintf(stdout, uint32FMTW(5), badDepth[i][j]);
+    fprintf(stdout, "\n");
+  }
+
+  return(0);
+}
diff --git a/meryl/asmMerQC.C b/meryl/asmMerQC.C
new file mode 100644
index 0000000..35bdfbb
--- /dev/null
+++ b/meryl/asmMerQC.C
@@ -0,0 +1,396 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libmeryl.H"
+
+// The categories depend on the type of input (fragments or contigs):
+//
+// 0 -- no count, mer not present
+// 1 -- single copy
+// 2 -- 2 -> 10 copies (contigs) -- 2 -> 2mode copies (frags)
+// 3 -- 11 -> 100 copies (contigs) -- -> 10mode copies (frags)
+// 4 -- 101+ copies (contigs) -- -> 100mode copies (frags)
+// 5 -- -> infinity copies (frags)
+
+// You'll also need to modify compare() and output() if you change this.
+#define NUMCATEGORIES 6
+
+// The output files are global for convenience. Otherwise, we'd be passing
+// them to compare() for every single mer. They are opened in main() when
+// -dump is given, and compare() only writes to them when dumpFlag is set.
+bool dumpFlag = false;  // set by -dump in main()
+FILE *dumpSCZF = 0L;  // mers with frag category 0 and contig category 1
+FILE *dumpMCZF = 0L;  // mers with frag category 0 and any other contig category
+FILE *dumpMCSF = 0L;  // mers in both, frag category below contig category, contig category 2
+FILE *dumpMCMF = 0L;  // mers in both, frag category below contig category, contig category not 2
+char merstring[1024];  // scratch buffer handed to kMer::merToString() when dumping
+
+uint32
+findMode(char *name) {
+
+  // Stream through the meryl database 'name', histogram the mer counts
+  // below 16384, and return the count of at least 2 that occurs most
+  // often (2 itself when the histogram has no larger peak). Counts 0
+  // and 1 are never considered.
+
+  merylStreamReader *M = new merylStreamReader(name);
+  uint32 *H = new uint32 [16384];
+
+  fprintf(stderr, "Finding mode of '%s'\n", name);
+
+  for (uint32 i=0; i<16384; i++)
+    H[i] = 0;
+
+  while (M->validMer()) {
+    if (M->theCount() < 16384)
+      H[M->theCount()]++;
+    M->nextMer();
+  }
+
+  uint32 mi = 2;
+  for (uint32 i=2; i<16384; i++)
+    if (H[i] > H[mi])
+      mi = i;
+
+  fprintf(stderr, "Mode of '%s' is "uint32FMT"\n", name, mi);
+
+  // Both the reader and the histogram are only needed inside this
+  // function; release them instead of leaking them on every call.
+  //
+  delete [] H;
+  delete M;
+
+  return(mi);
+}
+
+
+void
+compare(merylStreamReader *F,
+        merylStreamReader *C,
+        kMer &minmer,
+        uint32 mode,
+        uint32 R[NUMCATEGORIES][NUMCATEGORIES]) {
+
+  // Classify the current mer of the fragment reader F and of the contig
+  // reader C by copy count, and, when the mer under consideration
+  // (minmer) is the current mer of one or both readers, count it in
+  // R[frag category][contig category]. When dumping is enabled, mers
+  // that are better represented in contigs than in fragments are also
+  // written to the global dump files.
+
+  kMer Fmer = F->theFMer();
+  kMer Cmer = C->theFMer();
+  uint32 Fcnt = F->theCount();
+  uint32 Ccnt = C->theCount();
+
+  // Fragment categories scale with the mode of the fragment counts.
+  uint32 Ftype = (Fcnt == 0) ? 0 :
+                 (Fcnt == 1) ? 1 :
+                 (Fcnt <= 2*mode) ? 2 :
+                 (Fcnt <= 10*mode) ? 3 :
+                 (Fcnt <= 100*mode) ? 4 : 5;
+
+  // Contig categories use fixed thresholds.
+  uint32 Ctype = (Ccnt == 0) ? 0 :
+                 (Ccnt == 1) ? 1 :
+                 (Ccnt <= 10) ? 2 :
+                 (Ccnt <= 100) ? 3 : 4;
+
+  // A reader that is no longer valid has run off the end of its file,
+  // so the mer is (obviously) not in that file.
+  //
+  if (F->validMer() == false)
+    Ftype = 0;
+  if (C->validMer() == false)
+    Ctype = 0;
+
+  bool Fabsent = (Ftype == 0);
+  bool Cabsent = (Ctype == 0);
+
+  // One side has nothing: count only if the other side holds minmer.
+  //
+  if (Fabsent || Cabsent) {
+    if ((Fabsent && (Cmer == minmer)) ||
+        (Cabsent && (Fmer == minmer))) {
+      R[Ftype][Ctype]++;
+
+      // In contigs but not in fragments: save it.
+      if (dumpFlag && Fabsent) {
+        if (Ctype == 1) {
+          fprintf(dumpSCZF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+        } else {
+          fprintf(dumpMCZF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+        }
+      }
+    }
+    return;
+  }
+
+  // The two readers sit on different mers: whichever one is minmer is
+  // counted as missing from the other input.
+  //
+  if (Fmer != Cmer) {
+    if (Fmer == minmer)
+      R[Ftype][0]++;
+
+    if (Cmer == minmer) {
+      R[0][Ctype]++;
+
+      // Again in contigs but not in fragments: save it.
+      if (dumpFlag) {
+        if (Ctype == 1) {
+          fprintf(dumpSCZF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+        } else {
+          fprintf(dumpMCZF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+        }
+      }
+    }
+
+    return;
+  }
+
+  // Both readers hold the same mer; nothing to do unless it is minmer.
+  if (Fmer != minmer)
+    return;
+
+  // The mer is in both inputs.
+  R[Ftype][Ctype]++;
+
+  if (dumpFlag == false)
+    return;
+
+  // Save the mer if it's in contigs "more" than in fragments.
+  if (Ftype < Ctype) {
+    if (Ctype == 2) {
+      fprintf(dumpMCSF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+    } else {
+      fprintf(dumpMCMF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+    }
+  }
+
+  // Ftype is never 0 here (that case returned above), so this branch
+  // does not fire; it is kept as it was.
+  if ((Ftype == 0) && (Ctype == 1)) {
+    fprintf(dumpSCZF, ">"uint32FMT"\n%s\n", Ccnt, Cmer.merToString(merstring));
+  }
+}
+
+
+void
+output(char *title,
+       uint32 mode,
+       uint32 R[NUMCATEGORIES][NUMCATEGORIES]) {
+
+  // Print one comparison table to stdout: the title, a header naming
+  // the five contig categories, then one labelled row for each of the
+  // six fragment categories. 'mode' is accepted but not used.
+
+  static const char *rowLabel[6] = {
+    "zero ",
+    "one ",
+    "<= 2mode ",
+    "<= 10mode ",
+    "<= 100mode ",
+    "<= inf "
+  };
+
+  fprintf(stdout, "\n\n%s\n", title);
+  fprintf(stdout, "(frags) | zero | one | <= 10 | <= 100 | <= inf | (contigs)\n");
+
+  for (uint32 row=0; row<6; row++) {
+    fprintf(stdout, "%s", rowLabel[row]);
+
+    for (uint32 col=0; col<5; col++)
+      fprintf(stdout, uint32FMTW(12), R[row][col]);
+
+    fprintf(stdout, "\n");
+  }
+}
+
+
+
+
+int
+main(int argc, char **argv) {
+
+  // Compare mer counts of up to two fragment databases (-af, -tf)
+  // against up to three contig databases (-ac, -dc, -co). All readers
+  // are walked in step, always handling the smallest current mer, and
+  // one category table per frag/contig pair is printed at the end.
+  // With -dump, selected mers are also written to four fasta files.
+
+  merylStreamReader *AF = 0L;
+  merylStreamReader *TF = 0L;
+  merylStreamReader *AC = 0L;
+  merylStreamReader *DC = 0L;
+  merylStreamReader *CO = 0L;
+
+  uint32 AFmode = 0;
+  uint32 TFmode = 0;
+
+  char dumpSCZFname[1024] = {0}; // single contig, zero frags
+  char dumpMCZFname[1024] = {0}; // low contig, zero frags
+  char dumpMCSFname[1024] = {0}; // medium contig, low frags
+  char dumpMCMFname[1024] = {0}; // everything else, contig > frags
+
+  bool beVerbose = false;
+
+  int arg=1;
+  while (arg < argc) {
+
+    // These options read the following argument; make sure there is
+    // one instead of running off the end of argv.
+    //
+    bool takesValue = ((strcmp(argv[arg], "-af") == 0) ||
+                       (strcmp(argv[arg], "-tf") == 0) ||
+                       (strcmp(argv[arg], "-ac") == 0) ||
+                       (strcmp(argv[arg], "-dc") == 0) ||
+                       (strcmp(argv[arg], "-co") == 0) ||
+                       (strcmp(argv[arg], "-dump") == 0));
+
+    if (takesValue && (arg + 1 >= argc)) {
+      fprintf(stderr, "ERROR - option '%s' needs an argument\n", argv[arg]);
+      exit(1);
+    }
+
+    if (strcmp(argv[arg], "-af") == 0) { // All frags
+      ++arg;
+      AFmode = findMode(argv[arg]);
+      AF = new merylStreamReader(argv[arg]);
+      AF->nextMer();
+    } else if (strcmp(argv[arg], "-tf") == 0) { // Trimmed frags
+      ++arg;
+      TFmode = findMode(argv[arg]);
+      TF = new merylStreamReader(argv[arg]);
+      TF->nextMer();
+    } else if (strcmp(argv[arg], "-ac") == 0) { // All contigs
+      AC = new merylStreamReader(argv[++arg]);
+      AC->nextMer();
+    } else if (strcmp(argv[arg], "-dc") == 0) { // Degenerate contigs
+      DC = new merylStreamReader(argv[++arg]);
+      DC->nextMer();
+    } else if (strcmp(argv[arg], "-co") == 0) { // Contigs
+      CO = new merylStreamReader(argv[++arg]);
+      CO->nextMer();
+    } else if (strcmp(argv[arg], "-dump") == 0) {
+      arg++;
+      dumpFlag = true;
+
+      // snprintf() never writes past the buffer and returns the length
+      // the full name would have needed, so a too-long prefix is
+      // detected instead of overflowing the name buffers.
+      //
+      if ((snprintf(dumpSCZFname, sizeof(dumpSCZFname), "%s.0.singlecontig.zerofrag.fasta", argv[arg]) >= (int)sizeof(dumpSCZFname)) ||
+          (snprintf(dumpMCZFname, sizeof(dumpMCZFname), "%s.1.multiplecontig.zerofrag.fasta", argv[arg]) >= (int)sizeof(dumpMCZFname)) ||
+          (snprintf(dumpMCSFname, sizeof(dumpMCSFname), "%s.2.multiplecontig.lowfrag.fasta", argv[arg]) >= (int)sizeof(dumpMCSFname)) ||
+          (snprintf(dumpMCMFname, sizeof(dumpMCMFname), "%s.3.multiplecontig.multiplefrag.fasta", argv[arg]) >= (int)sizeof(dumpMCMFname))) {
+        fprintf(stderr, "ERROR - dump prefix '%s' is too long\n", argv[arg]);
+        exit(1);
+      }
+    } else if (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if ((AF == 0L) && (TF == 0L) && (AC == 0L) && (DC == 0L) && (CO == 0L)) {
+    fprintf(stderr, "usage: %s [opts] [-v] [-dump prefix]\n", argv[0]);
+    fprintf(stderr, "At least one fragcounts and one contigcounts are needed.\n");
+    fprintf(stderr, " -af | -tf fragcounts\n");
+    fprintf(stderr, " -ac | -dc | -co contigcounts \n");
+    fprintf(stderr, "Dumping is probably only useful with exactly one frag and\n");
+    fprintf(stderr, "one contig, but I'll let you do it with any number.\n");
+    exit(1);
+  }
+  if ((AF == 0L) && (TF == 0L)) {
+    fprintf(stderr, "ERROR - need at least one of -af, -tf\n");
+    exit(1);
+  }
+  if ((AC == 0L) && (DC == 0L) && (CO == 0L)) {
+    fprintf(stderr, "ERROR - need at least one of -ac, -dc, -co\n");
+    exit(1);
+  }
+
+  // Check mersizes. merSize ends up as the size of the last reader that
+  // was supplied; every other supplied reader must agree with it.
+  //
+  uint32 merSize = 0;
+  uint32 ms[5] = { 0 };
+
+  if (AF) merSize = ms[0] = AF->merSize();
+  if (TF) merSize = ms[1] = TF->merSize();
+  if (AC) merSize = ms[2] = AC->merSize();
+  if (DC) merSize = ms[3] = DC->merSize();
+  if (CO) merSize = ms[4] = CO->merSize();
+
+  bool differ = false;
+
+  for (uint32 i=0; i<5; i++)
+    if ((ms[i] > 0) && (ms[i] != merSize))
+      differ = true;
+
+  if (differ) {
+    fprintf(stderr, "error: mer size differ.\n");
+    fprintf(stderr, " AF - "uint32FMT"\n", ms[0]);
+    fprintf(stderr, " TF - "uint32FMT"\n", ms[1]);
+    fprintf(stderr, " AC - "uint32FMT"\n", ms[2]);
+    fprintf(stderr, " DC - "uint32FMT"\n", ms[3]);
+    fprintf(stderr, " CO - "uint32FMT"\n", ms[4]);
+    exit(1);
+  }
+
+  // Open the dump files, checking each stream as it is opened; errno is
+  // only meaningful right after the call that failed.
+  //
+  if (dumpFlag) {
+    errno = 0;
+    dumpSCZF = fopen(dumpSCZFname, "w");
+    if (dumpSCZF == 0L)
+      fprintf(stderr, "Failed to open dump file '%s': %s\n", dumpSCZFname, strerror(errno)), exit(1);
+
+    errno = 0;
+    dumpMCZF = fopen(dumpMCZFname, "w");
+    if (dumpMCZF == 0L)
+      fprintf(stderr, "Failed to open dump file '%s': %s\n", dumpMCZFname, strerror(errno)), exit(1);
+
+    errno = 0;
+    dumpMCSF = fopen(dumpMCSFname, "w");
+    if (dumpMCSF == 0L)
+      fprintf(stderr, "Failed to open dump file '%s': %s\n", dumpMCSFname, strerror(errno)), exit(1);
+
+    errno = 0;
+    dumpMCMF = fopen(dumpMCMFname, "w");
+    if (dumpMCMF == 0L)
+      fprintf(stderr, "Failed to open dump file '%s': %s\n", dumpMCMFname, strerror(errno)), exit(1);
+  }
+
+  uint32 AFvsAC[NUMCATEGORIES][NUMCATEGORIES];
+  uint32 AFvsDC[NUMCATEGORIES][NUMCATEGORIES];
+  uint32 AFvsCO[NUMCATEGORIES][NUMCATEGORIES];
+  uint32 TFvsAC[NUMCATEGORIES][NUMCATEGORIES];
+  uint32 TFvsDC[NUMCATEGORIES][NUMCATEGORIES];
+  uint32 TFvsCO[NUMCATEGORIES][NUMCATEGORIES];
+  for (uint32 i=0; i<NUMCATEGORIES; i++)
+    for (uint32 j=0; j<NUMCATEGORIES; j++) {
+      AFvsAC[i][j] = 0;
+      AFvsDC[i][j] = 0;
+      AFvsCO[i][j] = 0;
+      TFvsAC[i][j] = 0;
+      TFvsDC[i][j] = 0;
+      TFvsCO[i][j] = 0;
+    }
+
+  // The default constructor for kMer sets the mer to size 0, all A.
+  // We need it to be the proper size.
+  kMer minmer(merSize);
+
+  speedCounter *C = new speedCounter(" Examining: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, beVerbose);
+
+  bool morestuff = true;
+  while (morestuff) {
+
+    // Find any mer in our set
+    if (AF && AF->validMer()) minmer = AF->theFMer();
+    if (TF && TF->validMer()) minmer = TF->theFMer();
+    if (AC && AC->validMer()) minmer = AC->theFMer();
+    if (DC && DC->validMer()) minmer = DC->theFMer();
+    if (CO && CO->validMer()) minmer = CO->theFMer();
+
+    // Find the smallest mer in our set
+    if (AF && AF->validMer() && (AF->theFMer() < minmer)) minmer = AF->theFMer();
+    if (TF && TF->validMer() && (TF->theFMer() < minmer)) minmer = TF->theFMer();
+    if (AC && AC->validMer() && (AC->theFMer() < minmer)) minmer = AC->theFMer();
+    if (DC && DC->validMer() && (DC->theFMer() < minmer)) minmer = DC->theFMer();
+    if (CO && CO->validMer() && (CO->theFMer() < minmer)) minmer = CO->theFMer();
+
+    // We need to do up to six comparisons here.
+    if (AF && AC) compare(AF, AC, minmer, AFmode, AFvsAC);
+    if (AF && DC) compare(AF, DC, minmer, AFmode, AFvsDC);
+    if (AF && CO) compare(AF, CO, minmer, AFmode, AFvsCO);
+    if (TF && AC) compare(TF, AC, minmer, TFmode, TFvsAC);
+    if (TF && DC) compare(TF, DC, minmer, TFmode, TFvsDC);
+    if (TF && CO) compare(TF, CO, minmer, TFmode, TFvsCO);
+
+    C->tick();
+
+    // Advance to the next mer, if we were just used
+    if (AF && AF->validMer() && (AF->theFMer() == minmer)) AF->nextMer();
+    if (TF && TF->validMer() && (TF->theFMer() == minmer)) TF->nextMer();
+    if (AC && AC->validMer() && (AC->theFMer() == minmer)) AC->nextMer();
+    if (DC && DC->validMer() && (DC->theFMer() == minmer)) DC->nextMer();
+    if (CO && CO->validMer() && (CO->theFMer() == minmer)) CO->nextMer();
+
+    // Keep going while ANY reader still has a mer. Looking only at the
+    // readers that were just advanced would stop the loop as soon as
+    // those hit the end of their files, even though another reader may
+    // still be sitting on larger mers that were never processed.
+    //
+    morestuff = ((AF && AF->validMer()) ||
+                 (TF && TF->validMer()) ||
+                 (AC && AC->validMer()) ||
+                 (DC && DC->validMer()) ||
+                 (CO && CO->validMer()));
+  }
+
+  delete C;
+
+  // output
+
+  if ((AF) && (AC)) output("all frags vs all contigs", AFmode, AFvsAC);
+  if ((AF) && (DC)) output("all frags vs deg. contigs", AFmode, AFvsDC);
+  if ((AF) && (CO)) output("all frags vs non-deg. contigs", AFmode, AFvsCO);
+  if ((TF) && (AC)) output("trimmed frags vs all contigs", TFmode, TFvsAC);
+  if ((TF) && (DC)) output("trimmed frags vs deg. contigs", TFmode, TFvsDC);
+  if ((TF) && (CO)) output("trimmed frags vs non-deg. contigs", TFmode, TFvsCO);
+
+  // Close the dump files explicitly so that a failed final write is not
+  // lost silently.
+  //
+  if (dumpFlag) {
+    bool closeFailed = false;
+
+    if (fclose(dumpSCZF) != 0) closeFailed = true;
+    if (fclose(dumpMCZF) != 0) closeFailed = true;
+    if (fclose(dumpMCSF) != 0) closeFailed = true;
+    if (fclose(dumpMCMF) != 0) closeFailed = true;
+
+    if (closeFailed)
+      fprintf(stderr, "WARNING - failed to close a dump file, output may be incomplete\n");
+  }
+
+  delete AF;
+  delete TF;
+  delete AC;
+  delete DC;
+  delete CO;
+
+  return(0);
+}
diff --git a/meryl/asmMerQC.sh b/meryl/asmMerQC.sh
new file mode 100644
index 0000000..584fb39
--- /dev/null
+++ b/meryl/asmMerQC.sh
@@ -0,0 +1,166 @@
+#!/bin/sh
+
+# Test if the mers in the consensus sequence are supported by mers in
+# the fragments.
+#
+# The script runs in stages: count mers in the reads, extract the
+# contigs, count mers in the contigs, compare the counts (asmMerQC),
+# and map the unsupported mers back onto the contigs (mapMers). Jobs
+# inside a stage run in the background; each stage reads the files the
+# previous one writes, so there is a 'wait' between stages.
+
+# If we count just the clear, we get a clearer (ha, ha) picture of
+# the assembly quality, while if we count all reads we get a picture
+# of trimming.
+#
+onlyClear=1
+onlyReal=1
+
+# Only the last assignment of each setting below takes effect; the
+# earlier ones are kept as alternatives to switch back to.
+#
+mem=8192
+mem=16384
+mem=24576
+
+ms=22
+
+binroot=/bioinfo/assembly/walenz/src/genomics
+asmMerQC=$binroot/meryl/asmMerQC
+mapMers=$binroot/meryl/mapMers
+
+dir=/scratch/drosnightly
+asm=willi
+
+dir=/project/huref6/redo-consensus_gennady
+asm=h6tmp
+
+dir=/project/huref6/assembly/h6
+asm=h6
+
+
+# Count mers in reads
+#
+if [ ! -e $asm-ms$ms-clr-frags.mcidx ] ; then
+  bin/dumpFragStoreAsFasta -frg $dir/$asm.frgStore | \
+  meryl -B -C -m $ms -s - -o $asm-ms$ms-clr-frags -threads 4 -memory $mem -v
+fi
+
+if [ ! -e $asm-ms$ms-all-frags.mcidx ] ; then
+  bin/dumpFragStoreAsFasta -allbases -allfrags -frg $dir/$asm.frgStore | \
+  meryl -B -C -m $ms -s - -o $asm-ms$ms-all-frags -threads 4 -memory $mem -v
+fi
+
+echo Finding contigs.
+
+if [ ! -e $asm.normalcontigs.fasta ] ; then
+  bin/asmOutputContigsFasta < $dir/9-terminator/$asm.asm > $asm.normalcontigs.fasta &
+fi
+if [ ! -e $asm.degeneratecontigs.fasta ] ; then
+  bin/asmOutputContigsFasta -D < $dir/9-terminator/$asm.asm > $asm.degeneratecontigs.fasta &
+fi
+if [ ! -e $asm.allcontigs.fasta ] ; then
+  bin/asmOutputContigsFasta -d < $dir/9-terminator/$asm.asm > $asm.allcontigs.fasta &
+fi
+
+# The contig fasta files must be complete before they are counted.
+wait
+
+# Count mers in contigs
+#
+if [ ! -e $asm-ms$ms-normal-contigs.mcidx ] ; then
+  meryl -B -C -m $ms -s $asm.normalcontigs.fasta -o $asm-ms$ms-normal-contigs -threads 4 -segments 4 -v &
+fi
+if [ ! -e $asm-ms$ms-degenerate-contigs.mcidx ] ; then
+  meryl -B -C -m $ms -s $asm.degeneratecontigs.fasta -o $asm-ms$ms-degenerate-contigs -threads 4 -segments 4 -v &
+fi
+if [ ! -e $asm-ms$ms-all-contigs.mcidx ] ; then
+  meryl -B -C -m $ms -s $asm.allcontigs.fasta -o $asm-ms$ms-all-contigs -threads 4 -segments 4 -v &
+fi
+
+# The contig mer counts must be complete before they are compared.
+wait
+
+if [ ! -e $asm-ms$ms.asmMerQC ] ; then
+  $asmMerQC -af $asm-ms$ms-all-frags \
+            -tf $asm-ms$ms-clr-frags \
+            -co $asm-ms$ms-normal-contigs \
+            -ac $asm-ms$ms-all-contigs \
+            -dc $asm-ms$ms-degenerate-contigs \
+            > $asm-ms$ms.asmMerQC &
+fi
+
+echo Finding badmers.
+
+if [ ! -e $asm-ms$ms-allfrags-normalcontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -af $asm-ms$ms-all-frags \
+            -co $asm-ms$ms-normal-contigs \
+            -dump $asm-ms$ms-allfrags-normalcontigs.badmers \
+            > $asm-ms$ms-allfrags-normalcontigs.badmers.asmMerQC &
+fi
+if [ ! -e $asm-ms$ms-allfrags-allcontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -af $asm-ms$ms-all-frags \
+            -ac $asm-ms$ms-all-contigs \
+            -dump $asm-ms$ms-allfrags-allcontigs.badmers \
+            > $asm-ms$ms-allfrags-allcontigs.badmers.asmMerQC &
+fi
+if [ ! -e $asm-ms$ms-allfrags-degeneratecontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -af $asm-ms$ms-all-frags \
+            -dc $asm-ms$ms-degenerate-contigs \
+            -dump $asm-ms$ms-allfrags-degeneratecontigs.badmers \
+            > $asm-ms$ms-allfrags-degeneratecontigs.badmers.asmMerQC &
+fi
+
+if [ ! -e $asm-ms$ms-clrfrags-normalcontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -tf $asm-ms$ms-clr-frags \
+            -co $asm-ms$ms-normal-contigs \
+            -dump $asm-ms$ms-clrfrags-normalcontigs.badmers \
+            > $asm-ms$ms-clrfrags-normalcontigs.badmers.asmMerQC &
+fi
+if [ ! -e $asm-ms$ms-clrfrags-allcontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -tf $asm-ms$ms-clr-frags \
+            -ac $asm-ms$ms-all-contigs \
+            -dump $asm-ms$ms-clrfrags-allcontigs.badmers \
+            > $asm-ms$ms-clrfrags-allcontigs.badmers.asmMerQC &
+fi
+if [ ! -e $asm-ms$ms-clrfrags-degeneratecontigs.badmers.asmMerQC ] ; then
+  $asmMerQC -tf $asm-ms$ms-clr-frags \
+            -dc $asm-ms$ms-degenerate-contigs \
+            -dump $asm-ms$ms-clrfrags-degeneratecontigs.badmers \
+            > $asm-ms$ms-clrfrags-degeneratecontigs.badmers.asmMerQC &
+fi
+
+# The dumped fasta files must be complete before they are mapped.
+wait
+
+echo Mapping.
+
+if [ ! -e $asm-ms$ms-allfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-allfrags-normalcontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.normalcontigs.fasta \
+           > $asm-ms$ms-allfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+if [ ! -e $asm-ms$ms-allfrags-allcontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-allfrags-allcontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.allcontigs.fasta \
+           > $asm-ms$ms-allfrags-allcontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+if [ ! -e $asm-ms$ms-allfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-allfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.degeneratecontigs.fasta \
+           > $asm-ms$ms-allfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+
+if [ ! -e $asm-ms$ms-clrfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-clrfrags-normalcontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.normalcontigs.fasta \
+           > $asm-ms$ms-clrfrags-normalcontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+if [ ! -e $asm-ms$ms-clrfrags-allcontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-clrfrags-allcontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.allcontigs.fasta \
+           > $asm-ms$ms-clrfrags-allcontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+if [ ! -e $asm-ms$ms-clrfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.badmers ] ; then
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-clrfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.fasta \
+           -seq $asm.degeneratecontigs.fasta \
+           > $asm-ms$ms-clrfrags-degeneratecontigs.badmers.0.singlecontig.zerofrag.badmers &
+fi
+
+# The guard tests the file this step actually writes, so a finished
+# run is not repeated.
+#
+if [ ! -e $asm-ms$ms-allfrags-normalcontigs.badmers.5.allzero.badmers ] ; then
+  cat $asm-ms$ms-allfrags-normalcontigs.badmers.[01].*.fasta > $asm-ms$ms-allfrags-normalcontigs.badmers.5.allzero.fasta
+  $mapMers -m $ms \
+           -mers $asm-ms$ms-allfrags-normalcontigs.badmers.5.allzero.fasta \
+           -seq $asm.normalcontigs.fasta \
+           > $asm-ms$ms-allfrags-normalcontigs.badmers.5.allzero.badmers &
+fi
+
+# Let every background job finish before reporting the end time.
+wait
+
+date
diff --git a/meryl/binaryOp.C b/meryl/binaryOp.C
new file mode 100644
index 0000000..0cb8fc7
--- /dev/null
+++ b/meryl/binaryOp.C
@@ -0,0 +1,176 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+
+void
+binaryOperations(merylArgs *args) {
+  // Combine the two meryl files named in args->mergeFiles into args->outputFile (SUB, ABS or DIVIDE).
+  if (args->mergeFilesLen != 2) {
+    fprintf(stderr, "ERROR - must have exactly two files!\n");
+    exit(1);
+  }
+  if (args->outputFile == 0L) {
+    fprintf(stderr, "ERROR - no output file specified.\n");
+    exit(1);
+  }
+  if ((args->personality != PERSONALITY_SUB) &&
+      (args->personality != PERSONALITY_ABS) &&
+      (args->personality != PERSONALITY_DIVIDE)) {
+    fprintf(stderr, "ERROR - only personalities sub, abs and divide\n");
+    fprintf(stderr, "ERROR - are supported in binaryOperations().\n");
+    fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+    exit(1);
+  }
+
+  // Open the input files, read in the first mer
+  //
+  merylStreamReader *A = new merylStreamReader(args->mergeFiles[0]);
+  merylStreamReader *B = new merylStreamReader(args->mergeFiles[1]);
+
+  A->nextMer();
+  B->nextMer();
+
+  // Make sure that the mersizes agree, and pick a prefix size for
+  // the output
+  //
+  if (A->merSize() != B->merSize()) {
+    fprintf(stderr, "ERROR - mersizes are different!\n");
+    fprintf(stderr, "ERROR - mersize of '%s' is "uint32FMT"\n", args->mergeFiles[0], A->merSize());
+    fprintf(stderr, "ERROR - mersize of '%s' is "uint32FMT"\n", args->mergeFiles[1], B->merSize());
+    exit(1);
+  }
+
+  // Open the output file, using the larger of the two prefix sizes
+  //
+  merylStreamWriter *W = new merylStreamWriter(args->outputFile,
+                                               A->merSize(),
+                                               A->merCompression(),
+                                               (A->prefixSize() > B->prefixSize()) ? A->prefixSize() : B->prefixSize(),
+                                               A->hasPositions());
+
+
+  // SUB    - report A - B, never going below zero
+  // ABS    - report the absolute difference between the two files
+  // DIVIDE - for mers with a nonzero count in both files, report
+  //          floor(1000 * A / B), saturated at the largest uint32
+  //
+  // SUB and ABS are very similar (SUB was derived from ABS), so
+  // any bug found in one is probably in the other.
+  //
+  kMer    Amer;
+  uint32  Acnt = uint32ZERO;
+  kMer    Bmer;
+  uint32  Bcnt = uint32ZERO;
+
+  switch (args->personality) {
+    case PERSONALITY_SUB:
+      while (A->validMer() || B->validMer()) {
+        Amer = A->theFMer();
+        Acnt = A->theCount();
+        Bmer = B->theFMer();
+        Bcnt = B->theCount();
+
+        // If the A stream is all out of mers, set Amer to be the
+        // same as Bmer, and set Acnt to zero.  Similar for B.
+        //
+        if (!A->validMer()) {
+          Amer = Bmer;
+          Acnt = uint32ZERO;
+        }
+        if (!B->validMer()) {
+          Bmer = Amer;
+          Bcnt = uint32ZERO;
+        }
+
+        //fprintf(stderr, "sub A="uint64HEX" B="uint64HEX"\n", Amer, Bmer);
+
+        if (Amer == Bmer) {
+          W->addMer(Amer, (Acnt > Bcnt) ? Acnt - Bcnt : 0);
+          A->nextMer();
+          B->nextMer();
+        } else if (Amer < Bmer) {
+          W->addMer(Amer, Acnt);
+          A->nextMer();
+        } else {
+          B->nextMer();
+        }
+      }
+      break;
+    case PERSONALITY_ABS:
+      while (A->validMer() || B->validMer()) {
+        Amer = A->theFMer();
+        Acnt = A->theCount();
+        Bmer = B->theFMer();
+        Bcnt = B->theCount();
+
+        // If the A stream is all out of mers, set Amer to be the
+        // same as Bmer, and set Acnt to zero.  Similar for B.
+        //
+        if (!A->validMer()) {
+          Amer = Bmer;
+          Acnt = uint32ZERO;
+        }
+        if (!B->validMer()) {
+          Bmer = Amer;
+          Bcnt = uint32ZERO;
+        }
+
+        if (Amer == Bmer) {
+          W->addMer(Amer, (Acnt > Bcnt) ? Acnt - Bcnt : Bcnt - Acnt);
+          A->nextMer();
+          B->nextMer();
+        } else if (Amer < Bmer) {
+          W->addMer(Amer, Acnt);
+          A->nextMer();
+        } else {
+          W->addMer(Bmer, Bcnt);
+          B->nextMer();
+        }
+      }
+      break;
+    case PERSONALITY_DIVIDE:
+      while (A->validMer() || B->validMer()) {
+        Amer = A->theFMer();
+        Acnt = A->theCount();
+        Bmer = B->theFMer();
+        Bcnt = B->theCount();
+
+        // If the A stream is all out of mers, set Amer to be the
+        // same as Bmer, and set Acnt to zero.  Similar for B.
+        //
+        if (!A->validMer()) {
+          Amer = Bmer;
+          Acnt = uint32ZERO;
+        }
+        if (!B->validMer()) {
+          Bmer = Amer;
+          Bcnt = uint32ZERO;
+        }
+
+        if (Amer == Bmer) {
+          if ((Acnt > 0) && (Bcnt > 0)) {
+            double d = 1000.0 * (double)Acnt / (double)Bcnt;
+            if (d > 4294967295.0)     // saturate at the largest uint32; 2^32 itself would not fit in the cast below
+              d = 4294967295.0;
+            W->addMer(Amer, (uint32)floor(d));
+          }
+          A->nextMer();
+          B->nextMer();
+        } else if (Amer < Bmer) {
+          A->nextMer();
+        } else {
+          B->nextMer();
+        }
+      }
+      break;
+  }
+
+  delete A;
+  delete B;
+  delete W;
+}
diff --git a/meryl/build-threads.C b/meryl/build-threads.C
new file mode 100644
index 0000000..bc78b41
--- /dev/null
+++ b/meryl/build-threads.C
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+
+#include "bio++.H"
+#include "meryl.H"
+#include "libmeryl.H"
+
+void
+runSegment(merylArgs *args, uint64 segment);
+
+pthread_mutex_t segmentMutex;
+uint64 segmentNext;
+uint64 segmentMax;
+uint32 *segmentDone;
+
+
+void*
+buildThread(void *U) {
+  merylArgs *workerArgs = (merylArgs *)U;
+  uint64     claimed    = uint32ZERO;
+
+  // Worker body: keep claiming segment numbers from the shared counter until
+  // it passes segmentMax.  Only the counter itself is guarded by segmentMutex.
+  while (claimed < segmentMax) {
+    pthread_mutex_lock(&segmentMutex);
+    claimed = segmentNext++;
+    pthread_mutex_unlock(&segmentMutex);
+    if (claimed >= segmentMax)
+      break;
+    runSegment(workerArgs, claimed);
+    segmentDone[claimed] += 1;
+  }
+  if (workerArgs->beVerbose)
+    fprintf(stderr, "Thread exits.\n");
+
+  return(0L);
+}
+
+
+void
+runThreaded(merylArgs *args) {
+  // Run every segment on args->numThreads worker threads; return once all workers have exited.
+  //
+  // First reset the shared work-queue state used by buildThread().
+  segmentNext = uint64ZERO;
+  segmentMax  = args->segmentLimit;
+  segmentDone = new uint32 [segmentMax];
+  for (uint64 s=0; s<segmentMax; s++)
+    segmentDone[s] = uint32ZERO;
+
+  // Initialize threads.  They are joinable (not detached) so that we can wait
+  // for each one to really exit before the mutex and segmentDone are destroyed.
+  pthread_attr_t   threadAttr;
+  pthread_t       *threadID = new pthread_t [args->numThreads];
+  uint64           started  = 0;
+
+  pthread_mutex_init(&segmentMutex, NULL);
+
+  pthread_attr_init(&threadAttr);
+  pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
+  pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_JOINABLE);
+  pthread_attr_setschedpolicy(&threadAttr, SCHED_OTHER);
+
+  // Start the threads, keeping only the IDs of those that really started.
+  // Any thread that does start pulls work until no segments remain.
+  for (uint64 i=0; i<args->numThreads; i++) {
+    int err = pthread_create(threadID + started, &threadAttr, buildThread, (void *)args);
+    if (err == 0)
+      started++;
+    else
+      fprintf(stderr, "Failed to create thread: %s\n", strerror(err));
+  }
+  if (started == 0)
+    fprintf(stderr, "ERROR - could not start any threads.\n"), exit(1);
+
+  // Wait for the threads to complete.  Joining (instead of polling
+  // segmentDone) guarantees no worker still touches the mutex afterwards.
+  for (uint64 i=0; i<started; i++)
+    pthread_join(threadID[i], NULL);
+
+  if (args->beVerbose)
+    fprintf(stderr, "Threads all done, cleaning up.\n");
+
+  // Cleanup
+  //
+  pthread_attr_destroy(&threadAttr);
+  pthread_mutex_destroy(&segmentMutex);
+
+  delete [] threadID;
+  delete [] segmentDone;
+}
diff --git a/meryl/build.C b/meryl/build.C
new file mode 100644
index 0000000..70655b6
--- /dev/null
+++ b/meryl/build.C
@@ -0,0 +1,842 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <math.h>
+
+#include "bio++.H"
+#include "meryl.H"
+#include "libmeryl.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+void runThreaded(merylArgs *args);
+
+// You probably want this to be the same as KMER_WORDS, but in rare
+// cases, it can be less.
+//
+#define SORTED_LIST_WIDTH KMER_WORDS
+
+// to make the sorted list be wider, we also need to store wide
+// things in the bitpackedarray buckets. probably easy (do multiple
+// adds of data, each at most 64 bits) but not braindead.
+
+#if SORTED_LIST_WIDTH == 1
+// One entry of a bucket being sorted: packed mer bits plus a position.  Ordering looks at _w only.
+class sortedList_t {
+public:
+  uint64    _w;
+  uint32    _p;
+
+  bool operator<(sortedList_t const &that) const {
+    return(_w < that._w);
+  };
+
+  bool operator>=(sortedList_t const &that) const {
+    return(_w >= that._w);
+  };
+
+  sortedList_t &operator=(sortedList_t const &that) {
+    _w = that._w;
+    _p = that._p;
+    return(*this);
+  };
+};
+
+#else
+// Multi-word variant: words are compared from the highest index down, so _w[SORTED_LIST_WIDTH-1] is most significant.
+class sortedList_t {
+public:
+  uint64    _w[SORTED_LIST_WIDTH];
+  uint32    _p;
+
+  bool operator<(sortedList_t const &that) const {
+    for (uint32 i=SORTED_LIST_WIDTH; i--; ) {
+      if (_w[i] < that._w[i])  return(true);
+      if (_w[i] > that._w[i])  return(false);
+    }
+    return(false);
+  };
+
+  bool operator>=(sortedList_t const &that) const {
+    for (uint32 i=SORTED_LIST_WIDTH; i--; ) {
+      if (_w[i] > that._w[i])  return(true);
+      if (_w[i] < that._w[i])  return(false);
+    }
+    return(true);
+  };
+
+  sortedList_t &operator=(sortedList_t const &that) {
+    for (uint32 i=SORTED_LIST_WIDTH; i--; )
+      _w[i] = that._w[i];
+    _p = that._p;
+    return(*this);
+  };
+};
+
+#endif
+
+
+
+
+void
+adjustHeap(sortedList_t *M, int64 i, int64 n) {
+  // Sift M[i] down the max-heap M[0..n-1]: the larger child is pulled up into
+  // the vacated slot until the saved element is >= the larger remaining child.
+  sortedList_t  saved = M[i];
+  int64         hole  = i;
+
+  for (int64 kid = 2 * hole + 1; kid < n; kid = 2 * hole + 1) {
+    if ((kid + 1 < n) && (M[kid] < M[kid+1]))
+      kid++;                       // the right child is the larger one
+    if (saved >= M[kid])           // the hole is where 'saved' belongs
+      break;
+
+    M[hole] = M[kid];
+    hole    = kid;
+  }
+
+  M[hole] = saved;
+}
+
+
+void
+submitPrepareBatch(merylArgs *args) {
+  char  nam[1024];
+  char  cmd[1024];
+  int   len;
+  // Write '<outputFile>-prepare.sh', which reruns execName with -forcebuild, and submit it with qsub.
+  // Names and commands are length-checked: truncating them would run the wrong thing.
+  if ((size_t)snprintf(nam, sizeof(nam), "%s-prepare.sh", args->outputFile) >= sizeof(nam))
+    fprintf(stderr, "Script name for '%s' is too long.\n", args->outputFile), exit(1);
+  FILE *F = fopen(nam, "w");
+  if (F == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", nam, strerror(errno)), exit(1);
+  fprintf(F, "#!/bin/sh\n\n");
+  fprintf(F, ". $SGE_ROOT/$SGE_CELL/common/settings.sh\n");
+  fprintf(F, "%s -forcebuild %s\n", args->execName, args->options);
+  fclose(F);
+
+  if (args->sgeMergeOpt)
+    len = snprintf(cmd, sizeof(cmd), "qsub -cwd -b n -j y -o %s-prepare.err %s -N mp%s %s-prepare.sh",
+                   args->outputFile, args->sgeMergeOpt, args->sgeJobName, args->outputFile);
+  else
+    len = snprintf(cmd, sizeof(cmd), "qsub -cwd -b n -j y -o %s-prepare.err -N mp%s %s-prepare.sh",
+                   args->outputFile, args->sgeJobName, args->outputFile);
+  if ((size_t)len >= sizeof(cmd))     // also catches a negative (error) return
+    fprintf(stderr, "qsub command for '%s' is too long.\n", args->outputFile), exit(1);
+  fprintf(stderr, "%s\n", cmd);
+  if (system(cmd))
+    fprintf(stderr, "%s\nFailed to execute qsub command: %s\n", cmd, strerror(errno)), exit(1);
+}
+
+
+void
+submitCountBatches(merylArgs *args) {
+  char  nam[1024];
+  char  cmd[1024];
+  int   len;
+  // Write the count script and the merge script for this batch and submit both with qsub.
+  // The merge job is submitted with -hold_jid on the count job's name.
+  if ((size_t)snprintf(nam, sizeof(nam), "%s-count.sh", args->outputFile) >= sizeof(nam))
+    fprintf(stderr, "Script name for '%s' is too long.\n", args->outputFile), exit(1);
+  FILE *F = fopen(nam, "w");
+  if (F == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", nam, strerror(errno)), exit(1);
+  fprintf(F, "#!/bin/sh\n\n");
+  fprintf(F, ". $SGE_ROOT/$SGE_CELL/common/settings.sh\n");
+  fprintf(F, "batchnum=`expr $SGE_TASK_ID - 1`\n");
+  fprintf(F, "%s -v -countbatch $batchnum -o %s\n", args->execName, args->outputFile);
+  fclose(F);
+
+  if (args->sgeBuildOpt)
+    len = snprintf(cmd, sizeof(cmd), "qsub -t 1-"uint64FMT" -cwd -b n -j y -o %s-count-\\$TASK_ID.err %s -N mc%s %s-count.sh",
+                   args->segmentLimit, args->outputFile, args->sgeBuildOpt, args->sgeJobName, args->outputFile);
+  else
+    len = snprintf(cmd, sizeof(cmd), "qsub -t 1-"uint64FMT" -cwd -b n -j y -o %s-count-\\$TASK_ID.err -N mc%s %s-count.sh",
+                   args->segmentLimit, args->outputFile, args->sgeJobName, args->outputFile);
+  if ((size_t)len >= sizeof(cmd))     // also catches a negative (error) return
+    fprintf(stderr, "qsub command for '%s' is too long.\n", args->outputFile), exit(1);
+  fprintf(stderr, "%s\n", cmd);
+  if (system(cmd))
+    fprintf(stderr, "%s\nFailed to execute qsub command: %s\n", cmd, strerror(errno)), exit(1);
+
+  // submit the merge
+  if ((size_t)snprintf(nam, sizeof(nam), "%s-merge.sh", args->outputFile) >= sizeof(nam))
+    fprintf(stderr, "Script name for '%s' is too long.\n", args->outputFile), exit(1);
+  F = fopen(nam, "w");
+  if (F == NULL)
+    fprintf(stderr, "Failed to open '%s': %s\n", nam, strerror(errno)), exit(1);
+  fprintf(F, "#!/bin/sh\n\n");
+  fprintf(F, ". $SGE_ROOT/$SGE_CELL/common/settings.sh\n");
+  fprintf(F, "%s -mergebatch -o %s\n", args->execName, args->outputFile);
+  fclose(F);
+
+  if (args->sgeMergeOpt)
+    len = snprintf(cmd, sizeof(cmd), "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err %s -N mm%s %s-merge.sh",
+                   args->sgeJobName, args->outputFile, args->sgeMergeOpt, args->sgeJobName, args->outputFile);
+  else
+    len = snprintf(cmd, sizeof(cmd), "qsub -hold_jid mc%s -cwd -b n -j y -o %s-merge.err -N mm%s %s-merge.sh",
+                   args->sgeJobName, args->outputFile, args->sgeJobName, args->outputFile);
+  if ((size_t)len >= sizeof(cmd))
+    fprintf(stderr, "qsub command for '%s' is too long.\n", args->outputFile), exit(1);
+  fprintf(stderr, "%s\n", cmd);
+  if (system(cmd))
+    fprintf(stderr, "%s\nFailed to execute qsub command: %s\n", cmd, strerror(errno)), exit(1);
+}
+
+
+void
+prepareBatch(merylArgs *args) {  // Validate the build options, then fill in the batch and bucket sizing fields of args.
+ bool fatalError = false;  // set by each failed check so all problems are reported before exiting
+
+ if (args->inputFile == 0L)
+ fprintf(stderr, "ERROR - no input file specified.\n"), fatalError = true;
+
+ if (args->outputFile == 0L)
+ fprintf(stderr, "ERROR - no output file specified.\n"), fatalError = true;
+
+ if ((args->doForward == false) &&
+ (args->doReverse == false) &&
+ (args->doCanonical == false))
+ fprintf(stderr, "ERROR - need to specify at least one of -f, -r, -C\n"), fatalError = true;
+
+ if ((args->doForward && args->doReverse) ||
+ (args->doForward && args->doCanonical) ||
+ (args->doReverse && args->doCanonical))
+ fprintf(stderr, "ERROR - only one of -f, -r and -C may be specified!\n"), fatalError = true;
+
+ if (args->lowCount > args->highCount)
+ fprintf(stderr, "ERROR - lowCount > highCount??\n"), fatalError = true;
+
+ if (args->segmentLimit && args->memoryLimit)
+ fprintf(stderr, "ERROR: Only one of -memory and -segments can be specified.\n"), fatalError=true;
+
+ if (fatalError)
+ exit(1);
+
+ if (args->numThreads > 0) {
+ // If we were given no segment or memory limit, but threads, we
+ // really want to create n segments.
+ //
+ if ((args->segmentLimit == 0) && (args->memoryLimit == 0)) {
+ args->segmentLimit = args->numThreads;
+ }
+
+ // If we are given a memory limit and threads, we want to use that much memory
+ // total, not per thread.
+ //
+ if ((args->memoryLimit > 0) && (args->numThreads > 0)) {
+ args->segmentLimit = 0;
+ args->memoryLimit /= args->numThreads;
+ }
+ }
+
+ {  // Scan the input once to get the total base count and an approximate mer count.
+ seqStream *seqstr = new seqStream(args->inputFile);
+
+ args->numBasesActual = 0;
+ for (uint32 i=0; i<seqstr->numberOfSequences(); i++)
+ args->numBasesActual += seqstr->lengthOf(i);
+
+ merStream *merstr = new merStream(new kMerBuilder(args->merSize), seqstr, true, true);  // NOTE(review): the two 'true' flags presumably give merstr ownership of the builder and seqstr (only merstr is deleted) -- confirm
+
+ args->numMersActual = merstr->approximateNumberOfMers() + 1;
+
+ delete merstr;
+ }
+
+#warning not submitting prepareBatch to grid
+#if 0
+ if ((args->isOnGrid) || (args->sgeJobName == 0L)) {
+ } else {
+
+ // Shucks, we need to build the merstream file. Lets do it
+ // on the grid!
+ //
+ submitPrepareBatch(args);
+ exit(0);
+ }
+#endif
+
+
+ // If there is a memory limit, ignore the total number of mers and
+ // pick a value that fits in memory.
+ //
+ // Otherwise, if there is a segment limit, split the total number
+ // of mers into n pieces. Remember, there cannot be both a
+ // memoryLimit and a segmentLimit.
+ //
+ // Otherwise, we must be doing it all in one fell swoop.
+ //
+ if (args->memoryLimit) {
+ args->mersPerBatch = estimateNumMersInMemorySize(args->merSize, args->memoryLimit, args->positionsEnabled, args->beVerbose);
+ if (args->mersPerBatch > args->numMersActual)
+ args->mersPerBatch = args->numMersActual;
+ args->segmentLimit = (uint64)ceil((double)args->numMersActual / (double)args->mersPerBatch);
+ if (args->beVerbose)
+ fprintf(stderr, "Have a memory limit: mersPerBatch="uint64FMT" segmentLimit="uint64FMT"\n", args->mersPerBatch, args->segmentLimit);
+ } else if (args->segmentLimit) {
+ args->mersPerBatch = (uint64)ceil((double)args->numMersActual / (double)args->segmentLimit);
+ if (args->beVerbose)
+ fprintf(stderr, "Have a segment limit: mersPerBatch="uint64FMT" segmentLimit="uint64FMT"\n", args->mersPerBatch, args->segmentLimit);
+ } else {
+ args->mersPerBatch = args->numMersActual;
+ args->segmentLimit = 1;
+ if (args->beVerbose)
+ fprintf(stderr, "Have NO LIMITS!: mersPerBatch="uint64FMT" segmentLimit="uint64FMT"\n", args->mersPerBatch, args->segmentLimit);
+ }
+
+ args->basesPerBatch = (uint64)ceil((double)args->numBasesActual / (double)args->segmentLimit);  // segments are later cut by base range, not by mer count
+ if (args->beVerbose)
+ fprintf(stderr, "basesPerBatch = "uint64FMT"\n", args->basesPerBatch);
+
+ // Choose the optimal number of buckets to reduce memory usage.
+ // Yes, this is already done in estimateNumMersInMemorySize() (but
+ // not saved) and we need to do it for the other cases anyway.
+ //
+ // We use the number of bases per batch + 1 because we need to store
+ // the first position after the last mer. That is, if there are
+ // two mers, we will store that the first mer is at position 0, the
+ // second mer is at position 1, and the end of the second mer is at
+ // position 2.
+ //
+ args->bucketPointerWidth = logBaseTwo64(args->basesPerBatch + 1);
+ args->numBuckets_log2 = optimalNumberOfBuckets(args->merSize, args->basesPerBatch, args->positionsEnabled);
+ args->numBuckets = (uint64ONE << args->numBuckets_log2);
+ args->merDataWidth = args->merSize * 2 - args->numBuckets_log2;  // bits stored per mer once numBuckets_log2 bits are carried by the bucket number
+ //args->bucketPointerMask = uint64MASK(args->numBuckets_log2);
+
+
+ if (args->merDataWidth > SORTED_LIST_WIDTH * 64) {  // the stored part of a mer must fit in the sort record
+ fprintf(stderr, " numMersActual = "uint64FMT"\n", args->numMersActual);
+ fprintf(stderr, " mersPerBatch = "uint64FMT"\n", args->mersPerBatch);
+ fprintf(stderr, " basesPerBatch = "uint64FMT"\n", args->basesPerBatch);
+ fprintf(stderr, " numBuckets = "uint64FMT" ("uint32FMT" bits)\n", args->numBuckets, args->numBuckets_log2);
+ fprintf(stderr, " bucketPointerWidth = "uint32FMT"\n", args->bucketPointerWidth);
+ fprintf(stderr, " merDataWidth = "uint32FMT"\n", args->merDataWidth);
+ fprintf(stderr, "Sorry! merSize too big! Increase KMER_WORDS in libbio.kmer.H\n");
+ exit(1);
+ }
+
+ if (args->beVerbose) {
+ if (args->memoryLimit)
+ fprintf(stderr, "Computing "uint64FMT" segments using "uint64FMT"MB memory each.\n",
+ args->segmentLimit, args->memoryLimit);
+ else
+ fprintf(stderr, "Computing "uint64FMT" segments using AS MUCH MEMORY AS NEEDED.\n",
+ args->segmentLimit);
+ fprintf(stderr, " numMersActual = "uint64FMT"\n", args->numMersActual);
+ fprintf(stderr, " mersPerBatch = "uint64FMT"\n", args->mersPerBatch);
+ fprintf(stderr, " basesPerBatch = "uint64FMT"\n", args->basesPerBatch);
+ fprintf(stderr, " numBuckets = "uint64FMT" ("uint32FMT" bits)\n", args->numBuckets, args->numBuckets_log2);
+ fprintf(stderr, " bucketPointerWidth = "uint32FMT"\n", args->bucketPointerWidth);
+ fprintf(stderr, " merDataWidth = "uint32FMT"\n", args->merDataWidth);
+ }
+}
+
+
+
+
+void
+runSegment(merylArgs *args, uint64 segment) {  // Count the mers of one segment of the input and write them, sorted, to a meryl file.
+  merStream           *M  = 0L;
+  merylStreamWriter   *W  = 0L;
+  speedCounter        *C  = 0L;
+  uint32              *bucketSizes    = 0L;
+  uint64              *bucketPointers = 0L;
+  uint64              *merDataArray[SORTED_LIST_WIDTH] = { 0L };
+  uint32              *merPosnArray   = 0L;
+
+  // If this segment exists already, skip it.
+  //
+  // XXX:  This should be a command line option.
+  // XXX:  This should check that the files are complete meryl files.
+  //
+  char *filename = new char [strlen(args->outputFile) + 33];  // ".batch" + up to 20 digits + ".mcdat" + NUL
+  sprintf(filename, "%s.batch"uint64FMT".mcdat", args->outputFile, segment);
+
+  if (fileExists(filename)) {
+    if (args->beVerbose)
+      fprintf(stderr, "Found result for batch "uint64FMT" in %s.\n", segment, filename);
+    delete [] filename;
+    return;
+  }
+
+  if ((args->beVerbose) && (args->segmentLimit > 1))
+    fprintf(stderr, "Computing segment "uint64FMT" of "uint64FMT".\n", segment+1, args->segmentLimit);
+
+  delete [] filename;
+
+
+
+  //
+  //  We can do all allocations up front:
+  //    mer data storage (the buckets themselves, plus 64 for slop)
+  //    bucket pointers (plus an extra bucket at the end and a little for slop)
+  //    bucket size counting space, last because we toss it out quickly
+  //
+  if (args->beVerbose)
+    fprintf(stderr, " Allocating "uint64FMT"MB for mer storage ("uint32FMT" bits wide).\n",
+            (args->basesPerBatch * args->merDataWidth + 64) >> 23, args->merDataWidth);
+
+  // Mer storage - if mers are bigger than 32, we allocate full
+  // words.  The last allocation is always a bitPacked array.
+
+  for (uint64 mword=0, width=args->merDataWidth; width > 0; ) {
+    if (width >= 64) {
+      merDataArray[mword] = new uint64 [ args->basesPerBatch + 1 ];
+      width -= 64;
+      mword++;
+    } else {
+      merDataArray[mword] = new uint64 [ (args->basesPerBatch * width + 64) >> 6 ];
+      width  = 0;
+    }
+  }
+
+  if (args->positionsEnabled) {
+    if (args->beVerbose)
+      fprintf(stderr, " Allocating "uint64FMT"MB for mer position storage.\n",
+              (args->basesPerBatch * 32 + 32) >> 23);
+    merPosnArray = new uint32 [ args->basesPerBatch + 1 ];
+  }
+
+  if (args->beVerbose)
+    fprintf(stderr, " Allocating "uint64FMT"MB for bucket pointer table ("uint32FMT" bits wide).\n",
+            (args->numBuckets * args->bucketPointerWidth + 128) >> 23, args->bucketPointerWidth);
+  bucketPointers = new uint64 [(args->numBuckets * args->bucketPointerWidth + 128) >> 6];
+
+
+  if (args->beVerbose)
+    fprintf(stderr, " Allocating "uint64FMT"MB for counting the size of each bucket.\n", args->numBuckets >> 18);
+  bucketSizes = new uint32 [ args->numBuckets ];
+  for (uint64 i=args->numBuckets; i--; )
+    bucketSizes[i] = uint32ZERO;
+
+
+  // Position the mer stream at the start of this segments' mers.
+  // The last segment goes until the stream runs out of mers,
+  // everybody else does args->basesPerBatch mers.
+
+  C = new speedCounter(" Counting mers in buckets: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+  M = new merStream(new kMerBuilder(args->merSize, args->merComp),
+                    new seqStream(args->inputFile),
+                    true, true);
+  M->setBaseRange(args->basesPerBatch * segment, args->basesPerBatch * segment + args->basesPerBatch);
+
+  char mstring[256];
+
+  if (args->doForward) {
+    while (M->nextMer()) {
+      //fprintf(stderr, "FMER %s\n", M->theFMer().merToString(mstring));
+      bucketSizes[ args->hash(M->theFMer()) ]++;
+      C->tick();
+    }
+  }
+
+  if (args->doReverse) {
+    while (M->nextMer()) {
+      //fprintf(stderr, "RMER %s\n", M->theRMer().merToString(mstring));
+      bucketSizes[ args->hash(M->theRMer()) ]++;
+      C->tick();
+    }
+  }
+
+  if (args->doCanonical) {
+    while (M->nextMer()) {
+      if (M->theFMer() <= M->theRMer()) {
+        //fprintf(stderr, "FMER %s\n", M->theFMer().merToString(mstring));
+        bucketSizes[ args->hash(M->theFMer()) ]++;
+      } else {
+        //fprintf(stderr, "RMER %s\n", M->theRMer().merToString(mstring));
+        bucketSizes[ args->hash(M->theRMer()) ]++;
+      }
+      C->tick();
+    }
+  }
+
+  delete C;
+  delete M;
+
+  // Create the hash index using the counts.  The hash points
+  // to the end of the bucket; when we add a word, we move the
+  // hash bucket pointer down one.
+  //
+  // When done, we can deallocate the counting table.
+  //
+  if (args->beVerbose)
+    fprintf(stderr, " Creating bucket pointers.\n");
+
+  {
+    uint64 mi=0;
+    uint64 mj=0;
+    uint64 mc=0;
+
+    while (mi < args->numBuckets) {
+      mc += bucketSizes[mi++];
+      setDecodedValue(bucketPointers, mj, args->bucketPointerWidth, mc);
+      mj += args->bucketPointerWidth;
+    }
+
+    // Add the location of the end of the table.  This is not
+    // modified when adding words, but is used to determine
+    // the size of the last bucket.
+    //
+    setDecodedValue(bucketPointers, mj, args->bucketPointerWidth, mc);
+  }
+
+
+  // All done with the counting table, get rid of it.
+  //
+  if (args->beVerbose)
+    fprintf(stderr, " Releasing "uint64FMT"MB from counting the size of each bucket.\n", args->numBuckets >> 18);
+  delete [] bucketSizes;
+
+
+  C = new speedCounter(" Filling mers into list: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+  M = new merStream(new kMerBuilder(args->merSize, args->merComp),
+                    new seqStream(args->inputFile),
+                    true, true);
+  M->setBaseRange(args->basesPerBatch * segment, args->basesPerBatch * segment + args->basesPerBatch);
+
+  while (M->nextMer()) {
+
+    kMer const &m = ((args->doReverse) || (args->doCanonical && (M->theFMer() > M->theRMer()))) ?
+      M->theRMer()
+      :
+      M->theFMer();
+
+    uint64 element = preDecrementDecodedValue(bucketPointers,
+                                              args->hash(m) * args->bucketPointerWidth,
+                                              args->bucketPointerWidth);
+
+#if SORTED_LIST_WIDTH == 1
+    // Even though this would work in the general loop below, we
+    // special case one word mers to avoid the loop overhead.
+    //
+    setDecodedValue(merDataArray[0],
+                    element * args->merDataWidth,
+                    args->merDataWidth,
+                    m.endOfMer(args->merDataWidth));
+#else
+    for (uint64 mword=0, width=args->merDataWidth; width>0; ) {
+      if (width >= 64) {
+        merDataArray[mword][element] = m.getWord(mword);
+        width -= 64;
+        mword++;
+      } else {
+        setDecodedValue(merDataArray[mword],
+                        element * width,
+                        width,
+                        m.getWord(mword) & uint64MASK(width));
+        width = 0;
+      }
+    }
+#endif
+
+    if (args->positionsEnabled)
+      merPosnArray[element] = M->thePositionInStream();
+
+    C->tick();
+  }
+
+  delete C;
+  delete M;
+
+  char *batchOutputFile = new char [strlen(args->outputFile) + 33];
+  sprintf(batchOutputFile, "%s.batch"uint64FMT, args->outputFile, segment);
+
+  C = new speedCounter(" Writing output: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+  W = new merylStreamWriter((args->segmentLimit == 1) ? args->outputFile : batchOutputFile,
+                            args->merSize, args->merComp,
+                            args->numBuckets_log2,
+                            args->positionsEnabled);
+
+  // Sort each bucket into sortedList, then output the mers
+  //
+  sortedList_t *sortedList    = 0L;
+  uint32        sortedListMax = 0;
+  uint32        sortedListLen = 0;
+
+  for (uint64 bucket=0, bucketPos=0; bucket < args->numBuckets; bucket++) {
+    uint64 st = getDecodedValue(bucketPointers, bucketPos, args->bucketPointerWidth);
+    bucketPos += args->bucketPointerWidth;
+    uint64 ed = getDecodedValue(bucketPointers, bucketPos, args->bucketPointerWidth);
+
+    if (ed < st) {
+      fprintf(stderr, "ERROR: In segment "uint64FMT"\n", segment);
+      fprintf(stderr, "ERROR: Bucket "uint64FMT" (out of "uint64FMT") ends before it starts!\n",
+              bucket, args->numBuckets);
+      fprintf(stderr, "ERROR: start="uint64FMT"\n", st);
+      fprintf(stderr, "ERROR: end ="uint64FMT"\n", ed);
+    }
+    assert(ed >= st);
+
+    if ((ed - st) > (uint64ONE << 30)) {
+      fprintf(stderr, "ERROR: In segment "uint64FMT"\n", segment);
+      fprintf(stderr, "ERROR: Bucket "uint64FMT" (out of "uint64FMT") is HUGE!\n",
+              bucket, args->numBuckets);
+      fprintf(stderr, "ERROR: start="uint64FMT"\n", st);
+      fprintf(stderr, "ERROR: end ="uint64FMT"\n", ed);
+    }
+
+    // Nothing here?  Keep going.
+    if (ed == st)
+      continue;
+
+    sortedListLen = (uint32)(ed - st);
+
+    // Allocate more space, if we need to.
+    //
+    if (sortedListLen > sortedListMax) {
+      delete [] sortedList;
+      sortedList    = new sortedList_t [2 * sortedListLen + 1];
+      sortedListMax = 2 * sortedListLen;
+    }
+
+    // Clear out the sortedList -- if we don't, we leave the high
+    // bits unset which will probably make the sort random.
+    //
+    bzero(sortedList, sizeof(sortedList_t) * sortedListLen);
+
+    // Unpack the mers into the sorting array
+    //
+    if (args->positionsEnabled)
+      for (uint64 i=st; i<ed; i++)
+        sortedList[i-st]._p = merPosnArray[i];
+
+#if SORTED_LIST_WIDTH == 1
+    for (uint64 i=st, J=st*args->merDataWidth; i<ed; i++, J += args->merDataWidth)
+      sortedList[i-st]._w = getDecodedValue(merDataArray[0], J, args->merDataWidth);
+#else
+    for (uint64 i=st; i<ed; i++) {
+      for (uint64 mword=0, width=args->merDataWidth; width>0; ) {
+        if (width >= 64) {
+          sortedList[i-st]._w[mword] = merDataArray[mword][i];
+          width -= 64;
+          mword++;
+        } else {
+          sortedList[i-st]._w[mword] = getDecodedValue(merDataArray[mword], i * width, width);
+          width = 0;
+        }
+      }
+    }
+#endif
+
+    // Sort if there is more than one item
+    //
+    if (sortedListLen > 1) {
+      for (int64 t=(sortedListLen-2)/2; t>=0; t--)
+        adjustHeap(sortedList, t, sortedListLen);
+
+      for (int64 t=sortedListLen-1; t>0; t--) {
+        sortedList_t tv = sortedList[t];
+        sortedList[t] = sortedList[0];
+        sortedList[0] = tv;
+
+        adjustHeap(sortedList, 0, t);
+      }
+    }
+
+    // Dump the list of mers to the file.
+    //
+    kMer mer(args->merSize);
+
+    for (uint32 t=0; t<sortedListLen; t++) {
+      C->tick();
+
+      // Build the complete mer
+      //
+#if SORTED_LIST_WIDTH == 1
+      mer.setWord(0, sortedList[t]._w);
+#else
+      for (uint64 mword=0; mword < SORTED_LIST_WIDTH; mword++)
+        mer.setWord(mword, sortedList[t]._w[mword]);
+#endif
+      mer.setBits(args->merDataWidth, args->numBuckets_log2, bucket);
+
+      // Add it
+      if (args->positionsEnabled)
+        W->addMer(mer, 1, &sortedList[t]._p);
+      else
+        W->addMer(mer, 1, 0L);
+
+    }
+  }
+
+  delete [] sortedList;
+
+  delete C;
+  delete W;
+
+  delete [] batchOutputFile;
+
+  for (uint32 x=0; x<SORTED_LIST_WIDTH; x++)
+    delete [] merDataArray[x];
+
+  delete [] merPosnArray;
+
+  delete [] bucketPointers;
+
+  if (args->beVerbose)
+    fprintf(stderr, "Segment "uint64FMT" finished.\n", segment);
+}
+
+
+
+void
+build(merylArgs *args) {
+  // Top-level driver for a build: size the batches, run or schedule the segments, then merge their outputs.
+  if (!args->countBatch && !args->mergeBatch)
+    prepareBatch(args);
+
+  // Three choices:
+  //
+  // threaded -- start threads, launch pieces in each thread.  This
+  // thread waits for completion and then merges the results.
+  //
+  // batched -- write info file and exit.  Compute and merge is done
+  // on separate invocations.
+  //
+  // segmented -- write info file, then do each piece sequentially.
+  // After all pieces finished, do a merge.
+  //
+  //
+
+  bool  doMerge = false;
+
+  if (args->configBatch) {
+
+    // Write out our configuration and exit if we are -configbatch
+    //
+    args->writeConfig();
+
+    if (args->sgeJobName) {
+      fprintf(stdout, "Batch prepared.  Submitting to the grid.\n");
+      submitCountBatches(args);
+    } else {
+      fprintf(stdout, "Batch prepared.  Please run:\n");
+      for (uint64 s=0; s<args->segmentLimit; s++)
+        fprintf(stdout, "%s -countbatch "uint64FMT" -o %s\n", args->execName, s, args->outputFile);
+      fprintf(stdout, "%s -mergebatch -o %s\n", args->execName, args->outputFile);
+    }
+  } else if (args->countBatch) {
+
+    // Read back the configuration, run the segment and exit if we
+    // are -countbatch
+    //
+    merylArgs *savedArgs = new merylArgs(args->outputFile);
+    savedArgs->beVerbose = args->beVerbose;
+    runSegment(savedArgs, args->batchNumber);
+    delete savedArgs;
+  } else if (args->mergeBatch) {
+
+    // Check that all the files exist if we are -mergebatch and
+    // continue with execution
+    //
+    // MEMORY LEAK!  We should delete this at the end of the
+    // function, but it's a pain, and who cares?
+    //
+    merylArgs *savedArgs = new merylArgs(args->outputFile);
+    savedArgs->beVerbose = args->beVerbose;
+
+    args = savedArgs;
+
+    doMerge = true;
+  } else {
+
+    if (args->numThreads > 1)
+
+      // Run, using threads.  There is a lot of baloney needed, so it's
+      // all in a separate function.
+      //
+      runThreaded(args);
+    else
+      // No special options given, do all the work here and now
+      //
+      for (uint64 s=0; s<args->segmentLimit; s++)
+        runSegment(args, s);
+
+    // Either case, we want to merge now.
+    //
+    doMerge = true;
+  }
+
+
+  // If there is more than one segment, merge them to get the output.
+  //
+  // We do this by contructing a meryl command line and recursively
+  // (effectively) calling meryl.
+  //
+  // The command line is
+  //
+  // ./meryl -M merge [-v] -s batch1 -s batch2 ... -s batchN -o outputFile
+  //
+  if ((doMerge) && (args->segmentLimit > 1)) {
+
+    if (args->beVerbose)
+      fprintf(stderr, "Merge results.\n");
+
+    int     argc = 0;
+    char  **argv = new char* [7 + 2 * args->segmentLimit];
+    bool   *arga = new bool  [7 + 2 * args->segmentLimit];   // arga[i] is true when argv[i] was allocated here
+
+    arga[argc] = false;  argv[argc++] = "meryl-build-merge";
+    arga[argc] = false;  argv[argc++] = "-M";
+    arga[argc] = false;  argv[argc++] = "merge";
+
+    if (args->beVerbose) {
+      arga[argc] = false;
+      argv[argc++] = "-v";
+    }
+
+    for (uint32 i=0; i<args->segmentLimit; i++) {
+      arga[argc] = false;
+      argv[argc++] = "-s";
+      arga[argc] = true;
+      argv[argc] = new char [strlen(args->outputFile) + 33];
+      sprintf(argv[argc], "%s.batch"uint32FMT, args->outputFile, i);
+      argc++;
+    }
+
+    arga[argc] = false;  argv[argc++] = "-o";
+    arga[argc] = false;  argv[argc++] = args->outputFile;
+
+    merylArgs *addArgs = new merylArgs(argc, argv);
+    multipleOperations(addArgs);
+
+    // Cleanup the memory leak.
+    //
+    delete addArgs;
+    for (int i=0; i<argc; i++)
+      if (arga[i])
+        delete [] argv[i];
+    delete [] argv;
+    delete [] arga;
+
+    // Remove temporary files.  The buffer must hold ".batch", the
+    // segment number, a six character extension and the NUL.
+    char *filename = new char [strlen(args->outputFile) + 33];
+
+    for (uint32 i=0; i<args->segmentLimit; i++) {
+      sprintf(filename, "%s.batch"uint32FMT".mcidx", args->outputFile, i);
+      unlink(filename);
+      sprintf(filename, "%s.batch"uint32FMT".mcdat", args->outputFile, i);
+      unlink(filename);
+      sprintf(filename, "%s.batch"uint32FMT".mcpos", args->outputFile, i);
+      unlink(filename);
+    }
+
+    delete [] filename;
+  }
+
+  // If we just merged, delete the merstream file
+  //
+  if (doMerge) {
+    char *filename = new char [strlen(args->outputFile) + 17];
+
+    sprintf(filename, "%s.merStream", args->outputFile);
+    unlink(filename);
+
+    delete [] filename;
+  }
+}
diff --git a/meryl/compare-counts.C b/meryl/compare-counts.C
new file mode 100644
index 0000000..c38d1f1
--- /dev/null
+++ b/meryl/compare-counts.C
@@ -0,0 +1,233 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "libmeryl.H"
+
+
+
+
+#if 0
+// Disabled (this whole function sits inside '#if 0').
+//
+// Walks two mer streams, A and B, in step and fills heatraw[ac][bc] with the
+// number of mers seen with count 'ac' in A and count 'bc' in B (counts are
+// clamped to MAXA-1 / MAXB-1; a mer present in only one stream is tallied
+// with a count of zero for the other).  The log of every cell is then
+// printed to stdout as "i <tab> j <tab> value".
+//
+// NOTE(review): A and B are neither parameters nor locals of this function,
+// so it cannot compile as written if the '#if 0' is removed.
+//
+void
+heatMap() {
+ speedCounter *C = new speedCounter(" Examining: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1ffffff, false);
+
+#define MAXA 150
+#define MAXB 150
+
+ double heatraw[MAXA][MAXB];
+ double heatsca[MAXA][MAXB];
+
+ for (uint32 i=0; i<MAXA; i++)
+ for (uint32 j=0; j<MAXB; j++)
+ heatraw[i][j] = heatsca[i][j] = 0;
+
+ A->nextMer();
+ B->nextMer();
+
+ // Merge the two streams; the comparisons below only make sense if both
+ // streams return mers in increasing order.
+ while ((A->validMer()) ||
+ (B->validMer())) {
+ kMer &a = A->theFMer();
+ kMer &b = B->theFMer();
+
+ uint32 ac = A->theCount();
+ uint32 bc = B->theCount();
+
+ // Clamp both counts to the last row/column of the table.
+ if (ac >= MAXA)
+ ac = MAXA-1;
+
+ if (bc >= MAXB)
+ bc = MAXB-1;
+
+ // A is exhausted: everything left in B is absent from A.
+ if (A->validMer() == false) {
+ ac = 0;
+ heatraw[ac][bc]++;
+ B->nextMer();
+ continue;
+ }
+
+ // B is exhausted: everything left in A is absent from B.
+ if (B->validMer() == false) {
+ bc = 0;
+ heatraw[ac][bc]++;
+ A->nextMer();
+ continue;
+ }
+
+ // Same mer in both streams; otherwise advance whichever stream is behind.
+ if (a == b) {
+ heatraw[ac][bc]++;
+ A->nextMer();
+ B->nextMer();
+
+ } else if (a < b) {
+ heatraw[ac][0]++;
+ A->nextMer();
+
+ } else {
+ heatraw[0][bc]++;
+ B->nextMer();
+ }
+
+ C->tick();
+ }
+
+ delete C;
+ delete A;
+ delete B;
+
+ // Scale each row to be between 0 and 1
+ // (disabled; heatsca is therefore only ever zeroed, never reported)
+
+#if 0
+ for (uint32 j=0; j<MAXB; j++) {
+ double mina = heatraw[0][j];
+ double maxa = heatraw[0][j];
+
+ for (uint32 ii=0; ii<MAXA; ii++) {
+ if (maxa < heatraw[ii][j])
+ maxa = heatraw[ii][j];
+ if (heatraw[ii][j] < mina)
+ mina = heatraw[ii][j];
+ }
+
+ for (uint32 i=0; i<MAXA; i++)
+ heatsca[i][j] = (heatraw[i][j] - mina) / (maxa - mina);
+ }
+#endif
+
+
+ // Cells that were never incremented hold 0, so log() is taken of zero here.
+ for (uint32 i=0; i<MAXA; i++)
+ for (uint32 j=0; j<MAXB; j++)
+ fprintf(stdout, uint32FMT"\t"uint32FMT"\t%f\n", i, j, log(heatraw[i][j]));
+}
+#endif
+
+
+
+
+
+// Compare the k-mers of a sample against a reference ("truth") set.
+//
+// Both inputs are read with merylStreamReader and walked in step.  A sample
+// k-mer that also occurs in the reference is tallied in Htrue, a sample
+// k-mer that does not is tallied in Hnoise; both histograms are indexed by
+// the k-mer's count in the sample (clamped to HMAX-1).  The walk is a sorted
+// merge, so it relies on both readers returning k-mers in increasing order.
+//
+// Output: PREFIX.dat (the two histograms), PREFIX.gp (a gnuplot script) and,
+// by running gnuplot on that script, PREFIX.png.
+//
+// Exits with status 1 on a usage error or when an output file cannot be
+// created, and with status 0 otherwise.
+//
+int
+main(int argc, char **argv) {
+  merylStreamReader  *T            = 0L;
+  merylStreamReader  *S            = 0L;
+  char               *outputPrefix = NULL;
+  char               *plotTitle    = NULL;
+  int                 arg          = 1;
+  int                 err          = 0;
+
+  while (arg < argc) {
+    // Every option takes a value.  Without this check a trailing option
+    // would hand argv[argc] (a null pointer) to the reader constructor.
+    bool  hasValue = (arg + 1 < argc);
+
+    if        (hasValue && (strcmp(argv[arg], "-truth") == 0)) {
+      T = new merylStreamReader(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-sample") == 0)) {
+      S = new merylStreamReader(argv[++arg]);
+
+    } else if (hasValue && (strcmp(argv[arg], "-output") == 0)) {
+      outputPrefix = argv[++arg];
+
+    } else if (hasValue && (strcmp(argv[arg], "-title") == 0)) {
+      plotTitle = argv[++arg];
+
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+      err++;
+    }
+
+    arg++;
+  }
+
+  if ((T == 0L) || (S == 0L) || (outputPrefix == 0L) || (plotTitle == 0L) || (err)) {
+    fprintf(stderr, "usage: %s [opts]\n", argv[0]);
+    fprintf(stderr, " -truth k-mers from reference\n");
+    fprintf(stderr, " -sample k-mers from sample\n");
+    fprintf(stderr, " -output output prefix\n");
+    fprintf(stderr, " -title plot label\n");
+    exit(1);
+  }
+
+  // outputName must hold the longest string built below,
+  // "gnuplot < PREFIX.gp", including its terminating NUL.
+  if (strlen(outputPrefix) + 16 > FILENAME_MAX) {
+    fprintf(stderr, "output prefix '%s' is too long\n", outputPrefix);
+    exit(1);
+  }
+
+  uint32  kmerSize = T->merSize();
+
+  // Parenthesized so the macro stays a single value in any expression.
+#define HMAX (64 * 1024)
+
+  uint32 *Htrue  = new uint32 [HMAX];
+  uint32 *Hnoise = new uint32 [HMAX];
+
+  for (uint32 i=0; i<HMAX; i++)
+    Htrue[i] = Hnoise[i] = 0;
+
+  T->nextMer();
+  S->nextMer();
+
+  while ((T->validMer()) ||
+         (S->validMer())) {
+    kMer &t = T->theFMer();
+    kMer &s = S->theFMer();
+
+    uint32 tc = T->theCount();
+    uint32 sc = S->theCount();
+
+    if (tc >= HMAX)  tc = HMAX-1;
+    if (sc >= HMAX)  sc = HMAX-1;
+
+    // If we're out of truth kmers, the sample is noise.
+    if (T->validMer() == false) {
+      Hnoise[sc]++;
+      S->nextMer();
+      continue;
+    }
+
+    // If we're out of sample kmers, do nothing but go to the next truth kmer.
+    if (S->validMer() == false) {
+      T->nextMer();
+      continue;
+    }
+
+    // If the kmers are equal, this is a true kmer
+    if (t == s) {
+      Htrue[sc]++;
+      T->nextMer();
+      S->nextMer();
+    }
+
+    // If the truth kmer is the lesser, get the next truth.
+    else if (t < s) {
+      T->nextMer();
+    }
+
+    // Else the sample kmer is smaller, add it to the noise pile, and get the next.
+    else {
+      Hnoise[sc]++;
+      S->nextMer();
+    }
+  }
+
+  delete T;
+  delete S;
+
+  char  outputName[FILENAME_MAX];
+
+  // Open both outputs before writing anything, and refuse to continue if
+  // either cannot be created; fprintf() on a null FILE would crash.
+  sprintf(outputName, "%s.gp", outputPrefix);
+  FILE *outputGP = fopen(outputName, "w");
+  if (outputGP == NULL) {
+    fprintf(stderr, "failed to open '%s' for writing\n", outputName);
+    exit(1);
+  }
+
+  sprintf(outputName, "%s.dat", outputPrefix);
+  FILE *outputDAT = fopen(outputName, "w");
+  if (outputDAT == NULL) {
+    fprintf(stderr, "failed to open '%s' for writing\n", outputName);
+    exit(1);
+  }
+
+  fprintf(outputGP, "set terminal png\n");
+  fprintf(outputGP, "set output \"%s.png\"\n", outputPrefix);
+  // kmerSize is a uint32, so it is printed with the matching format macro.
+  fprintf(outputGP, "set title \"%s true/false " uint32FMT "-mers\"\n", plotTitle, kmerSize);
+  fprintf(outputGP, "set xlabel \"k-mer count\"\n");
+  fprintf(outputGP, "set ylabel \"number of kmers\"\n");
+  fprintf(outputGP, "plot [0:100] [0:1000000] \"%s.dat\" using 1:2 with lines title \"true\", \"%s.dat\" using 1:3 with lines title \"false\"\n",
+          outputPrefix, outputPrefix);
+
+  fclose(outputGP);
+
+  for (uint32 i=0; i<HMAX; i++)
+    fprintf(outputDAT, uint32FMT "\t" uint32FMT "\t" uint32FMT "\n", i, Htrue[i], Hnoise[i]);
+
+  fclose(outputDAT);
+
+  delete [] Htrue;
+  delete [] Hnoise;
+
+  // NOTE(review): the prefix is passed to the shell unquoted.
+  sprintf(outputName, "gnuplot < %s.gp", outputPrefix);
+  system(outputName);
+
+  exit(0);
+}
diff --git a/meryl/dump.C b/meryl/dump.C
new file mode 100644
index 0000000..ff991bc
--- /dev/null
+++ b/meryl/dump.C
@@ -0,0 +1,156 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+#include <algorithm>
+
+// Write to stdout every mer in args->inputFile whose count is at least
+// args->numMersEstimated (used here as the threshold).  Each mer is printed
+// as a ">count" line followed by the mer itself.
+void
+dumpThreshold(merylArgs *args) {
+  char                merString[1025];
+  merylStreamReader  *reader = new merylStreamReader(args->inputFile);
+
+  while (reader->nextMer()) {
+    // Below the threshold: nothing to report for this mer.
+    if (reader->theCount() < args->numMersEstimated)
+      continue;
+
+    fprintf(stdout, ">"uint64FMT"\n%s\n",
+            reader->theCount(),
+            reader->theFMer().merToString(merString));
+  }
+
+  delete reader;
+}
+
+
+// Write to stdout every mer in args->inputFile together with its positions:
+// a ">count pos pos ..." line followed by the mer itself.  If the file holds
+// no position data, a message is written to stderr and nothing is dumped.
+void
+dumpPositions(merylArgs *args) {
+  char                merString[1025];
+  merylStreamReader  *reader = new merylStreamReader(args->inputFile);
+
+  if (reader->hasPositions() == false) {
+    fprintf(stderr, "File '%s' contains no position information.\n", args->inputFile);
+    delete reader;
+    return;
+  }
+
+  while (reader->nextMer()) {
+    fprintf(stdout, ">"uint64FMT, reader->theCount());
+
+    uint32 which = 0;
+    while (which < reader->theCount()) {
+      fprintf(stdout, " "uint32FMT, reader->getPosition(which));
+      which++;
+    }
+
+    fprintf(stdout, "\n%s\n", reader->theFMer().merToString(merString));
+  }
+
+  delete reader;
+}
+
+
+// Report, on stdout, the total, distinct and unique mer counts recorded in
+// args->inputFile.  The numbers come straight from the reader; the disabled
+// section below would recount them from the mers as a consistency check.
+void
+countUnique(merylArgs *args) {
+  merylStreamReader  *reader = new merylStreamReader(args->inputFile);
+
+#warning make this a test
+#if 0
+  uint64  seenDistinct = 0;
+  uint64  seenUnique   = 0;
+  uint64  seenMers     = 0;
+
+  while (reader->nextMer()) {
+    uint64  count = reader->theCount();
+
+    seenDistinct += 1;
+    seenUnique   += (count == 1) ? 1 : 0;
+    seenMers     += count;
+  }
+
+  assert(seenMers     == reader->numberOfTotalMers());
+  assert(seenDistinct == reader->numberOfDistinctMers());
+  assert(seenUnique   == reader->numberOfUniqueMers());
+  fprintf(stderr, "OK\n");
+#endif
+
+  fprintf(stdout, "Found "uint64FMT" mers.\n",          reader->numberOfTotalMers());
+  fprintf(stdout, "Found "uint64FMT" distinct mers.\n", reader->numberOfDistinctMers());
+  fprintf(stdout, "Found "uint64FMT" unique mers.\n",   reader->numberOfUniqueMers());
+
+  delete reader;
+}
+
+
+// Print the count histogram stored in args->inputFile.
+//
+// Summary numbers go to stderr.  For every count with a non-empty histogram
+// entry, one line goes to stdout: the count, the number of distinct mers
+// with that count, and the running fractions of distinct and of total mers
+// covered so far.
+void
+plotHistogram(merylArgs *args) {
+  merylStreamReader  *reader = new merylStreamReader(args->inputFile);
+
+  uint64  sumDistinct = 0;
+  uint64  sumTotal    = 0;
+
+  fprintf(stderr, "Found "uint64FMT" mers.\n",          reader->numberOfTotalMers());
+  fprintf(stderr, "Found "uint64FMT" distinct mers.\n", reader->numberOfDistinctMers());
+  fprintf(stderr, "Found "uint64FMT" unique mers.\n",   reader->numberOfUniqueMers());
+
+  fprintf(stderr, "Largest mercount is "uint64FMT"; "uint64FMT" mers are too big for histogram.\n",
+          reader->histogramMaximumCount(), reader->histogramHuge());
+
+  // Entry zero is not reported.
+  for (uint32 count=1; count<reader->histogramLength(); count++) {
+    uint64  entry = reader->histogram(count);
+
+    if (entry == 0)
+      continue;
+
+    sumDistinct += entry;
+    sumTotal    += entry * count;
+
+    fprintf(stdout, uint32FMT"\t"uint64FMT"\t%.4f\t%.4f\n",
+            count,
+            entry,
+            sumDistinct / (double)reader->numberOfDistinctMers(),
+            sumTotal    / (double)reader->numberOfTotalMers());
+  }
+
+  delete reader;
+}
+
+
+
+// Build a histogram of the distance between successive occurrences of each
+// mer in args->inputFile and write it to stderr, one "distance <tab> number"
+// line per non-empty entry.  Distances of histMax or more are pooled and
+// reported on a final "huge" line.  If the file holds no position data, a
+// message is written to stderr instead.
+//
+// Note that the positions of each mer are sorted in place before the
+// distances are taken.
+void
+dumpDistanceBetweenMers(merylArgs *args) {
+  merylStreamReader  *M = new merylStreamReader(args->inputFile);
+
+  // This is now tough because we don't know where the sequences end,
+  // and our positions encode position in the chain.
+
+  uint32   histMax  = 64 * 1024 * 1024;
+  uint64  *hist     = new uint64 [histMax];
+  uint64   histHuge = 0;
+
+  // new[] leaves the entries uninitialized; they must start at zero because
+  // they are only ever incremented and then tested for being non-zero.
+  memset(hist, 0, sizeof(uint64) * histMax);
+
+  if (M->hasPositions() == false) {
+    fprintf(stderr, "File '%s' contains no position information.\n", args->inputFile);
+  } else {
+    while (M->nextMer()) {
+      std::sort(M->thePositions(), M->thePositions() + M->theCount());
+
+      for (uint32 i=1; i<M->theCount(); i++) {
+        uint32 d = M->getPosition(i) - M->getPosition(i-1);
+        if (d < histMax)
+          hist[d]++;
+        else
+          histHuge++;
+      }
+    }
+
+    // One past the largest distance that was seen, so the report below
+    // does not have to scan the whole table again.
+    uint32 maxd = 0;
+
+    for (uint32 d=0; d<histMax; d++)
+      if (hist[d])
+        maxd = d+1;
+
+    for (uint32 d=0; d<maxd; d++)
+      if (hist[d])
+        fprintf(stderr, uint32FMT "\t" uint64FMT "\n", d, hist[d]);
+
+    if (histHuge)
+      fprintf(stderr, "huge\t" uint64FMT "\n", histHuge);
+  }
+
+  delete [] hist;
+  delete    M;
+}
diff --git a/meryl/estimate.C b/meryl/estimate.C
new file mode 100644
index 0000000..951d69c
--- /dev/null
+++ b/meryl/estimate.C
@@ -0,0 +1,182 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "bio++.H"
+#include "seqStream.H"
+#include "merStream.H"
+#include "libmeryl.H"
+#include "meryl.H"
+
+// Takes a memory limit in MB, returns the number of mers that we can
+// fit in that memory size, assuming optimalNumberOfBuckets() below
+// uses the same algorithm.
+//
+// merSize           -- mer size in bases; each mer is costed at 2*merSize bits
+// mem               -- memory limit, in MB
+// positionsEnabled  -- if true, each mer is charged 32 more bits
+// beVerbose         -- if true, the chosen configuration is printed to stdout
+//
+// Returns zero if no combination of prefix width and pointer width fits.
+//
+uint64
+estimateNumMersInMemorySize(uint32 merSize,
+ uint32 mem,
+ bool positionsEnabled,
+ bool beVerbose) {
+ uint64 maxN = 0;
+ uint64 bestT = 0;
+
+
+ // For each possible number of buckets, try all possible pointer
+ // widths. First we compute the number of mers that fit in a
+ // bucket pointer table of size 2^t storing N bits in the mer data
+ // table, then we check that the number of mers in the mer data
+ // table agrees with the width of the pointer table.
+
+
+ // This is the memory size we are trying to fill, in bits
+ // (MB << 20 is bytes, << 3 more is bits).
+ //
+ uint64 memLimt = ((uint64)mem) << 23;
+
+ // Positions consume space too, but only if enabled.
+ //
+ uint64 posPerMer = 0;
+ if (positionsEnabled)
+ posPerMer = 32;
+
+ // The prefix width t is kept below both 2*merSize - 2 and 50 bits,
+ // which bounds the bucket pointer table at fewer than 2^50 entries.
+ //
+ uint32 tMax = 2*merSize - 2;
+ if (tMax > 50)
+ tMax = 50;
+
+ for (uint64 t=2; t < tMax; t++) {
+
+ // We need to try all possibilities of N, the width of the
+ // bucket pointer table === log2(numMers).
+ //
+ // N runs from 1 to 39, so up to 2^39 mers are considered.
+ //
+ for (uint64 N=1; N<40; N++) {
+ uint64 Nmin = uint64ONE << (N - 1);
+ uint64 Nmax = uint64ONE << (N);
+
+ // The size in bits of the bucket pointer table.
+ //
+ uint64 bucketsize = (uint64ONE << t) * N;
+
+ // If our bucket pointer table size hasn't already blown our
+ // memory limit, compute the number of mers that we can stuff
+ // into the list.
+ //
+ if (memLimt > bucketsize) {
+
+ // The number of mers we can then fit into the mer data table
+ // is easy to compute.
+ //
+ // Even though we allocate merDataArray, bucketPointers,
+ // bucketSizes, we don't use merDataArray until after we
+ // release bucketSizes, and so we only estimate the maximum
+ // in core (not allocated) size.
+ //
+ uint64 n = (memLimt - bucketsize) / (2*merSize - t + posPerMer);
+
+ // Only accept n if it lies in the range that an N-bit bucket
+ // pointer can address, i.e. 2^(N-1) <= n <= 2^N.
+ //
+ if ((Nmin <= n) && (n <= Nmax)) {
+
+ //fprintf(stderr, "prefixSize="uint64FMTW(2)" numMers="uint64FMTW(9)" memory=%.3fMB\n",
+ // t, n,
+ // (((uint64ONE << t) * logBaseTwo64(n) + n * (2*merSize - t + posPerMer)) >> 3) / 1048576.0);
+
+ // Remember the settings with the highest number of mers, if
+ // more than zero mers.
+ //
+ if ((n > 0) &&
+ (maxN < n)) {
+ maxN = n;
+ bestT = t;
+ }
+
+ }
+ }
+ }
+ }
+
+ // Sizes are computed in bits; >> 3 converts to bytes before scaling to MB.
+ if (beVerbose)
+ fprintf(stdout, "Can fit "uint64FMT" mers into table with prefix of "uint64FMT" bits, using %8.3fMB (%8.3fMB for positions)\n",
+ maxN,
+ bestT,
+ (((uint64ONE << bestT) * logBaseTwo64(maxN) + maxN * (2*merSize - bestT + posPerMer)) >> 3) / 1048576.0,
+ ((maxN * posPerMer) >> 3) / 1048576.0);
+
+ return(maxN);
+}
+
+
+
+// Returns the prefix width h, in bits, that minimizes the estimated table
+// size for numMers mers of size merSize.
+//
+// NOTE(review): if the search loop never runs (for example merSize <= 1, or
+// hmax < 2), opth keeps its all-ones initial value and the function returns
+// that value truncated to uint32 -- callers presumably never hit this; confirm.
+//
+uint32
+optimalNumberOfBuckets(uint32 merSize,
+ uint64 numMers,
+ bool positionsEnabled) {
+ uint64 opth = ~uint64ZERO;
+ uint64 opts = ~uint64ZERO;
+ uint64 h = 0;
+ uint64 s = 0;
+ uint64 hwidth = logBaseTwo64(numMers);
+
+ // Positions consume space too, but only if enabled. Probably
+ // doesn't matter here.
+ //
+ uint64 posPerMer = 0;
+ if (positionsEnabled)
+ posPerMer = 32;
+
+ // Find the table size (in bits, h) that minimizes memory usage
+ // for the given merSize and numMers
+ //
+ // We have two tables:
+ // the bucket pointers: num buckets * pointer width == (1 << h) * hwidth
+ // the mer data: num mers * (2 * merSize - h + posPerMer)
+ //
+ // hmax is an upper bound on h.  It is evaluated once, while h is still
+ // zero, and does not include posPerMer.
+ // NOTE(review): presumably meant to keep (1 << h) * hwidth within 64 bits -- confirm.
+ //
+ uint64 hmax = 64 - logBaseTwo64(hwidth + numMers * (2 * merSize - h));
+ for (h=2; h<=hmax && h<2*merSize; h++) {
+ s = (uint64ONE << h) * hwidth + numMers * (2 * merSize - h + posPerMer);
+
+ //fprintf(stderr, "optimalNumberOfBuckets()-- h="uint64FMT" s="uint64FMT"\n", h, s);
+
+ // Strict '<' keeps the smallest h among equally sized choices.
+ if (s < opts) {
+ opth = h;
+ opts = s;
+ }
+ }
+
+ return((uint32)opth);
+}
+
+
+
+// Report, on stderr, how much memory is needed to count the mers.
+//
+// If args->inputFile is set, the mers in it are counted first and the result
+// is stored in args->numMersEstimated; otherwise whatever value is already
+// there is used.  The memory figure is the size of the bucket pointer table
+// plus the mer data table for the prefix width chosen by
+// optimalNumberOfBuckets().
+void
+estimate(merylArgs *args) {
+
+  if (args->inputFile) {
+    merStream     mers(new kMerBuilder(args->merSize, args->merComp),
+                       new seqStream(args->inputFile),
+                       true, true);
+    speedCounter  progress(" %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+
+    if (args->beVerbose)
+      fprintf(stderr, "Counting mers in '%s'\n", args->inputFile);
+
+    for (args->numMersEstimated = 0; mers.nextMer(); args->numMersEstimated++)
+      progress.tick();
+
+    progress.finish();
+  }
+
+  uint32  prefixBits  = optimalNumberOfBuckets(args->merSize, args->numMersEstimated, args->positionsEnabled);
+
+  // Both table sizes are in bits; >> 23 below turns the sum into MB.
+  uint64  pointerBits = (uint64ONE << prefixBits) * logBaseTwo64(args->numMersEstimated+1);
+  uint64  dataBits    = args->numMersEstimated * (2 * args->merSize - prefixBits);
+  uint64  totalBits   = pointerBits + dataBits;
+
+  fprintf(stderr, uint64FMT" "uint32FMT"-mers can be computed using "uint64FMT"MB memory.\n",
+          args->numMersEstimated, args->merSize, totalBits >> 23);
+}
diff --git a/meryl/m-heap.H b/meryl/m-heap.H
new file mode 100644
index 0000000..3202450
--- /dev/null
+++ b/meryl/m-heap.H
@@ -0,0 +1,152 @@
+#ifndef M_HEAP_H
+#define M_HEAP_H
+
+#include "util++.H"
+#include "bio++.H"
+
+//
+// This is a bit packed heap, derived from bitPackedHeap. It is
+// customized to decode a kmer from a merStream, given the location
+// of the kmer in the stream. This kmer is used for the value of the
+// data item in the heap, instead of the value stored in the heap.
+//
+
+// A binary min-heap of stream positions stored in a bitPackedArray.  The
+// heap is ordered by the mer found at each position (see getMer()), not by
+// the position values themselves; get() removes and returns the position
+// whose mer is smallest.
+class bitPackedMerHeap {
+public:
+ // SS is the source of mers (not owned; it is never deleted here).
+ // width and size are passed straight to the bitPackedArray constructor.
+ bitPackedMerHeap(seqStore *SS, uint32 width, uint64 size=16) {
+ _array = new bitPackedArray(width, size);
+ _array->set(0, 0);
+ _lastVal = 0;
+ _mers = SS;
+ };
+
+ ~bitPackedMerHeap() {
+ delete _array;
+ };
+
+ // Get the mer with index idx in the merStream
+ //
+ // Repositions _mers at idx, reads one mer, and returns whichever of
+ // theRMer() / theFMer() compares smaller (presumably the reverse and
+ // forward forms of the mer -- confirm).  The result is a reference into
+ // _mers, so it is only good until the next call.
+ //
+ kMer const &getMer(uint64 idx) {
+ _mers->setIterationStart(idx);
+ _mers->nextMer();
+ if (_mers->theRMer() < _mers->theFMer())
+ return(_mers->theRMer());
+ return(_mers->theFMer());
+ }
+
+ // Remove the smallest element.  Returns its stored position and copies
+ // its mer into 'mer'.  If the heap is empty, returns all ones and leaves
+ // 'mer' untouched.
+ uint64 get(kMer &mer) {
+ uint64 pos = ~uint64ZERO;
+
+ if (_lastVal == 0)
+ return(pos);
+
+ pos = _array->get(0);
+ mer = getMer(pos);
+
+ // That was the only element; nothing is left to reorder.
+ if (--_lastVal == 0)
+ return(pos);
+
+ // Rebalance the heap
+ //
+ // The last element is moved to the root and then sifted down.
+
+ uint64 tval = _array->get(_lastVal);
+ kMer tmer;
+
+ _array->set(0, tval);
+
+ uint64 pidx = 0;
+ uint64 pval = tval;
+ kMer pmer = getMer(pval);
+ uint64 cidx = 1;
+ uint64 cval = 0; // set below
+ kMer cmer;
+
+ while (cidx < _lastVal) {
+ // Set cval here, so we can first test if cidx is in range.
+ cval = _array->get(cidx);
+ cmer = getMer(cval);
+
+ // Pick the smallest of the two kids
+ if (cidx+1 < _lastVal) {
+ tval = _array->get(cidx+1);
+ tmer = getMer(tval);
+
+ if (cmer > tmer) {
+ cidx++;
+ cval = tval;
+ cmer = tmer;
+ }
+ }
+
+ if (cmer < pmer) {
+
+ // Swap p and c
+ _array->set(pidx, cval);
+ _array->set(cidx, pval);
+
+ // Move down the tree -- pval doesn't change, we moved it into cidx!
+ pidx = cidx;
+
+ cidx = cidx * 2 + 1;
+ } else {
+ // Heap order holds again; force the loop to end.
+ cidx = _lastVal;
+ }
+ }
+
+ return(pos);
+ };
+
+ // Insert a stream position: it is appended at the end of the array and
+ // sifted up while its mer is smaller than its parent's.
+ void add(uint64 value) {
+ uint64 cidx = _lastVal++;
+ uint64 cval = value;
+ kMer cmer;
+ uint64 pidx = 0;
+ uint64 pval = 0;
+ kMer pmer;
+ bool more = true;
+
+ _array->set(cidx, cval);
+
+ // First element: it is the root, there is no parent to compare with.
+ if (cidx == 0)
+ return;
+
+ cmer = getMer(cval);
+
+ while (more) {
+ pidx = (cidx-1) / 2;
+ pval = _array->get(pidx);
+ pmer = getMer(pval);
+
+ if (pmer > cmer) {
+
+ // Swap p and c
+ _array->set(cidx, pval);
+ _array->set(pidx, cval);
+
+ // Move up the tree -- cval doesn't change, we moved it into pidx!
+ cidx = pidx;
+ } else {
+ more = false;
+ }
+ if (cidx == 0)
+ more = false;
+ }
+ };
+
+ // Print the stored positions, in array order, to stderr.
+ void dump(void) {
+ for (uint32 i=0; i<_lastVal; i++)
+ fprintf(stderr, "HEAP["uint32FMT"]="uint64FMT"\n", i, _array->get(i));
+ }
+
+ // Empty the heap.
+ void clear(void) {
+ _array->clear();
+ _lastVal = 0;
+ };
+
+private:
+ bitPackedArray *_array; // stored positions, in heap order
+ uint64 _lastVal; // number of elements currently in the heap
+ seqStore *_mers; // where getMer() reads mers from
+};
+
+
+#endif // M_HEAP_H
diff --git a/meryl/m.C b/meryl/m.C
new file mode 100644
index 0000000..1167436
--- /dev/null
+++ b/meryl/m.C
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util++.H"
+#include "bio++.H"
+#include "meryl.H"
+
+#include "m-heap.H"
+
+// Experimental driver for bitPackedMerHeap.
+//
+// Loads the sequence in core, fills a heap of mer positions until the
+// memory limit is reached (or every mer is in), then cycles the remaining
+// positions through the heap and finally drains it.  Nothing is written
+// except progress and summary lines on stderr.
+//
+// Options are matched on a prefix: "-v", "-mer N", "-mem MB", "-s file".
+// Exits with status 1 if no sequence file is given, if the loaded sequence
+// exceeds the memory limit, or if there are too few mers to build a heap.
+//
+int
+main(int argc, char **argv) {
+  bool    beVerbose   = false;
+  uint64  merSize     = 20;
+
+  // In bytes.  The value given to -memory is in MB and is scaled to bytes
+  // below, so the 768MB default has to be scaled the same way; an unscaled
+  // 768 would make any real input fail the memUsed check.
+  uint64  memLimit    = (uint64)768 * 1024 * 1024;
+
+  char   *inName      = 0L;
+  char   *outName     = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-verbose", 2) == 0) {
+      beVerbose = true;
+    } else if (strncmp(argv[arg], "-mersize", 4) == 0) {
+      merSize = strtouint64(argv[++arg], 0L);
+    } else if (strncmp(argv[arg], "-memory", 4) == 0) {
+      memLimit = strtouint64(argv[++arg], 0L) * 1024 * 1024;
+    } else if (strncmp(argv[arg], "-sequence", 2) == 0) {
+      inName = argv[++arg];
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if (inName == 0L) {
+    fprintf(stderr, "usage: %s [-v] [-m mersize] [-memory Nmb] [-s seq.fasta]\n", argv[0]);
+    exit(1);
+  }
+
+  outName = new char [strlen(inName) + 1];
+  strcpy(outName, inName);
+
+  seqStream  *seqstr = new seqStream(inName);
+  seqStore   *seqsto = new seqStore(outName, seqstr);
+
+  uint64  memUsed = seqsto->loadStoreInCore();
+  uint64  numMers = seqsto->numberOfACGT();
+
+#warning needed exact number of mers here
+
+  fprintf(stderr, "Found " uint64FMT " mers in file of size " uint64FMT "\n", numMers, memUsed);
+
+  if (memUsed > memLimit) {
+    fprintf(stderr, "ERROR: two-bit encoded sequence file is bigger than allowed memory usage.\n");
+    exit(1);
+  }
+
+  // Allocate a heap to fill up the rest of space
+
+  // Allocate a bitPackedHeap to store N merSize*2 integeers.
+  //   N = (memLimit - memUsed) * 8 / (merSize * 2)
+  //
+  // The bitPackedHeap doesn't care about the maximum size, only
+  // about the block size.
+  //
+  uint64  pointerWidth = logBaseTwo64(numMers);
+
+  // pointerWidth is the divisor used to size the heap below.
+  if (pointerWidth == 0) {
+    fprintf(stderr, "ERROR: too few mers in '%s' to build a heap.\n", inName);
+    exit(1);
+  }
+
+  bitPackedMerHeap *heap = new bitPackedMerHeap(seqsto, pointerWidth, 8 * 1024);
+
+  speedCounter *S;
+
+  uint64  N = (memLimit - memUsed) * 8 / pointerWidth;
+  uint64  M = 0;
+
+  fprintf(stderr, "Can store " uint64FMT " mer pointers of size " uint64FMT " in the heap.\n", N, pointerWidth);
+
+  kMer  mer;
+
+  if (N > numMers)
+    N = numMers;
+
+  // Initialize the heap with some numbers
+  //
+  S = new speedCounter(" Loading heap: %7.2f Mmers -- %8.1f mers/second\r", 1.0, 0x1ffff, beVerbose);
+  while (M < N) {
+
+#if 0
+    heap->add(M);
+    heap->get(mer);
+    fprintf(stdout, "ADD "uint64FMT" -- %s\n", M, mer.merToString(str));
+#endif
+
+    heap->add(M++);
+    S->tick();
+  }
+  delete S;
+
+  // Until we run out of mers, write things out of the heap.
+  //
+  S = new speedCounter(" Cycling heap: %7.2f Mmers -- %8.1f Mmers/second\r", 1.0, 0x1fff, beVerbose);
+  while (M < numMers) {
+    heap->add(M++);
+    heap->get(mer);
+    //fprintf(stdout, "GOT "uint64FMT" -- %s\n", M, mer.merToString(str));
+    S->tick();
+  }
+  delete S;
+
+  // And finally, flush the heap.
+  //
+  S = new speedCounter(" Dumping heap: %7.2f Mmers -- %8.1f Mmers/second\r", 1.0, 0x1fff, beVerbose);
+  uint64 idx = heap->get(mer);
+  while (idx != ~uint64ZERO) {
+    //fprintf(stdout, "OUT "uint64FMT" -- %s\n", idx, mer.merToString(str));
+    idx = heap->get(mer);
+    S->tick();
+  }
+  delete S;
+
+  delete heap;
+
+  return(0);
+}
diff --git a/meryl/mapMers-depth.C b/meryl/mapMers-depth.C
new file mode 100644
index 0000000..4a0b8f5
--- /dev/null
+++ b/meryl/mapMers-depth.C
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "seqCache.H"
+#include "merStream.H"
+#include "libmeryl.H"
+#include "existDB.H"
+
+#warning this code might not work due to intervalList changes
+
+// Report, for every sequence in a fasta file, the depth of mer coverage
+// according to the counts in a meryl database.
+//
+// With neither -w nor -s, one "position <tab> depth" line is written to
+// stdout for every position.  Otherwise the depth is averaged over a
+// window of -w positions and reported at positions that are a multiple
+// of -s; an option left at zero defaults to 1.
+//
+// Exits with status 1 if -mers or -seq is missing.
+//
+int
+main(int argc, char **argv) {
+  uint32    merSize    = 16;
+  char     *merylFile  = 0L;
+  char     *fastaFile  = 0L;
+  bool      beVerbose  = false;
+  uint32    loCount    = 0;
+  uint32    hiCount    = ~uint32ZERO;
+  uint32    windowsize = 0;
+  uint32    skipsize   = 0;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-m") == 0) {
+      merSize = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-mers") == 0) {
+      merylFile = argv[++arg];
+    } else if (strcmp(argv[arg], "-seq") == 0) {
+      fastaFile = argv[++arg];
+    } else if (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+    } else if (strcmp(argv[arg], "-lo") == 0) {
+      loCount = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-hi") == 0) {
+      hiCount = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-w") == 0) {
+      windowsize = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-s") == 0) {
+      skipsize = strtouint32(argv[++arg], 0L);
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if ((merylFile == 0L) || (fastaFile == 0L)) {
+    fprintf(stderr, "usage: %s -m mersize -mers mers -seq fasta > output\n", argv[0]);
+    exit(1);
+  }
+
+  // The windowed report divides by windowsize and takes a remainder by
+  // skipsize.  If only one of the two was given, the other is still zero
+  // here, so give it the neutral value of 1.
+  if ((windowsize != 0) || (skipsize != 0)) {
+    if (windowsize == 0)
+      windowsize = 1;
+    if (skipsize == 0)
+      skipsize = 1;
+  }
+
+  existDB   *E = new existDB(merylFile, merSize, existDBcounts | existDBcompressCounts | existDBcompressBuckets, loCount, hiCount);
+  seqCache  *F = new seqCache(fastaFile);
+
+  for (uint32 Sid=0; Sid < F->getNumberOfSequences(); Sid++) {
+    seqInCore  *S  = F->getSequenceInCore(Sid);
+    merStream  *MS = new merStream(new kMerBuilder(merSize),
+                                   new seqStream(S->sequence(), S->sequenceLength()),
+                                   true, true);
+
+    // Two entries per mer: one where it begins, one where it ends.
+    uint32                         idlen = 0;
+    intervalDepthRegions<uint64>  *id    = new intervalDepthRegions<uint64> [S->sequenceLength() * 2 + 2];
+
+    while (MS->nextMer()) {
+      int32   cnt = (int32)E->count(MS->theFMer()) + (int32)E->count(MS->theRMer());
+
+      //  Old intervalDepth was to add 'cnt' in the first and subtract 'cnt' in the second.
+      //  Then to use the 'ct' field below.
+      //  New intervalDepth is the same, but uses the value field.
+      //  Count is now the number of intervals that are represented in this block.
+
+      id[idlen].pos    = MS->thePositionInSequence();
+      id[idlen].change = cnt;
+      id[idlen].open   = true;
+      idlen++;
+
+      id[idlen].pos    = MS->thePositionInSequence() + merSize;
+      id[idlen].change = cnt;
+      id[idlen].open   = false;
+      idlen++;
+    }
+
+    intervalList<uint64>  ID(id, idlen);
+    uint32                x = 0;
+
+    uint32                len = S->sequenceLength();
+
+    // Default case, report un-averaged depth at every single location.
+    //
+    if ((windowsize == 0) && (skipsize == 0)) {
+      for (uint32 i=0; i < ID.numberOfIntervals(); i++) {
+        for (; x < ID.lo(i); x++)
+          fprintf(stdout, uint32FMTW(7) "\t" uint32FMTW(6) "\n", x, 0);
+        for (; x < ID.hi(i); x++)
+          fprintf(stdout, uint32FMTW(7) "\t" uint32FMTW(6) "\n", x, ID.value(i));
+      }
+      for (; x < len; x++)
+        fprintf(stdout, uint32FMTW(7) "\t" uint32FMTW(6) "\n", x, 0);
+
+    } else {
+      uint32  *depth = new uint32 [len];
+      for (x=0; x < len; x++)
+        depth[x] = 0;
+
+      // NOTE(review): this branch reads ID.count(i) while the branch above
+      // reads ID.value(i); per the comment in the mer loop, value is the
+      // depth and count is the number of intervals.  Left unchanged; confirm
+      // which one is intended.
+      for (uint32 i=0; i < ID.numberOfIntervals(); i++)
+        for (x=ID.lo(i); x < ID.hi(i); x++)
+          depth[x] = ID.count(i);
+
+      uint32   avedepth = 0;
+
+      // Sum the first window, but never read past the end of depth[] when
+      // the sequence is shorter than the window.  In that case x stops at
+      // len and the reporting loop below does not run.
+      for (x=0; (x < windowsize) && (x < len); x++)
+        avedepth += depth[x];
+
+      // Slide the window one position at a time.
+      while (x < len) {
+        uint32  avepos = (x - 1) - (windowsize - 1) / 2;
+        if ((avepos % skipsize) == 0)
+          fprintf(stdout, uint32FMT "\t%.4f\n",
+                  avepos,
+                  (double)avedepth / (double)windowsize);
+
+        avedepth = avedepth + depth[x] - depth[x-windowsize];
+
+        x++;
+      }
+
+      delete [] depth;
+    }
+
+    delete [] id;
+
+    delete MS;
+    delete S;
+  }
+
+
+  delete F;
+  delete E;
+}
diff --git a/meryl/mapMers.C b/meryl/mapMers.C
new file mode 100644
index 0000000..98b3edd
--- /dev/null
+++ b/meryl/mapMers.C
@@ -0,0 +1,210 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "seqCache.H"
+#include "merStream.H"
+#include "libmeryl.H"
+#include "existDB.H"
+
+#define OP_NONE 0
+#define OP_STATS 1
+#define OP_REGIONS 2
+#define OP_DETAILS 3
+
+// Look up every mer of every sequence in a fasta file in a meryl database
+// and report on stdout, per sequence, one of:
+//
+//   -stats    mean, min and max of the non-zero counts, followed by a
+//             16-bucket histogram indexed by logBaseTwo64(count)
+//   -regions  the stretches covered by mers present in the database, plus
+//             an "uncovered" line for every mer that is absent
+//   -details  one line per mer with its position and its forward, reverse
+//             and combined counts
+//
+// Exits with status 1 if no operation, -mers or -seq is given.
+//
+int
+main(int argc, char **argv) {
+  uint32    merSize    = 16;
+  char     *merylFile  = 0L;
+  char     *fastaFile  = 0L;
+  bool      beVerbose  = false;
+  uint32    loCount    = 0;
+  uint32    hiCount    = ~uint32ZERO;
+  uint32    operation  = OP_NONE;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-m") == 0) {
+      merSize = strtouint32(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-mers") == 0) {
+      merylFile = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-seq") == 0) {
+      fastaFile = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+
+    } else if (strcmp(argv[arg], "-lo") == 0) {
+      loCount = strtouint32(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-hi") == 0) {
+      hiCount = strtouint32(argv[++arg], 0L);
+
+    } else if (strcmp(argv[arg], "-stats") == 0) {
+      operation = OP_STATS;
+
+    } else if (strcmp(argv[arg], "-regions") == 0) {
+      operation = OP_REGIONS;
+
+    } else if (strcmp(argv[arg], "-details") == 0) {
+      operation = OP_DETAILS;
+
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if ((operation == OP_NONE) || (merylFile == 0L) || (fastaFile == 0L)) {
+    fprintf(stderr, "usage: %s [-stats | -regions | -details] -m mersize -mers mers -seq fasta > output\n", argv[0]);
+    exit(1);
+  }
+
+#if 0
+  existDB       *E = NULL;
+
+  if (fileExists("junk.existDB")) {
+    fprintf(stderr, "loading from junk.existDB\n");
+    E = new existDB("junk.existDB");
+    fprintf(stderr, "loaded\n");
+  } else {
+    exit(1);
+    E = new existDB(merylFile, merSize, existDBcounts, loCount, hiCount);
+    E->saveState("junk.existDB");
+  }
+#endif
+
+  existDB       *E = new existDB(merylFile, merSize, existDBcounts, loCount, hiCount);
+  seqCache      *F = new seqCache(fastaFile);
+
+  fprintf(stderr, "Begin.\n");
+
+
+  for (uint32 Sid=0; Sid < F->getNumberOfSequences(); Sid++) {
+    seqInCore  *S  = F->getSequenceInCore(Sid);
+    merStream  *MS = new merStream(new kMerBuilder(merSize),
+                                   new seqStream(S->sequence(), S->sequenceLength()),
+                                   true, true);
+
+    // with counts, report mean, mode, median, min, max for each frag.
+    //
+    // Everything reported here can be accumulated while the mers stream
+    // by, so the counts are no longer saved in a fixed-size array (which
+    // could be overrun by a long sequence and was cleared in full for
+    // every sequence).
+    if (operation == OP_STATS) {
+      uint64  numCounted = 0;
+      uint64  mean       = uint64ZERO;
+      uint64  min        = ~uint64ZERO;
+      uint64  max        = uint64ZERO;
+      uint64  hist[16]   = { 0 };
+
+      while (MS->nextMer()) {
+        uint64   cnt = E->count(MS->theFMer()) + E->count(MS->theRMer());
+
+        // Mers that are not in the database do not take part in the stats.
+        if (cnt == 0)
+          continue;
+
+        numCounted++;
+        mean += cnt;
+
+        // Counts of one are deliberately ignored for the minimum; if no
+        // count is above one, min stays at its all-ones initial value.
+        if ((min > cnt) && (cnt > 1))
+          min = cnt;
+        if (max < cnt)
+          max = cnt;
+
+        // Bucket b collects the counts whose logBaseTwo64() is b.  Anything
+        // beyond the last bucket is folded into it rather than written past
+        // the end of hist[].
+        uint64  bucket = logBaseTwo64(cnt);
+        if (bucket > 15)
+          bucket = 15;
+
+        hist[bucket]++;
+      }
+
+      // A sequence without any counted mer reports a mean of zero.
+      if (numCounted > 0)
+        mean /= numCounted;
+
+      fprintf(stdout,
+              "%s\t"
+              uint64FMT "\t" uint64FMT "\t" uint64FMT "\t"
+              uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t"
+              uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\n",
+              S->header(),
+              mean, min, max,
+              hist[ 0], hist[ 1], hist[ 2], hist[ 3], hist[ 4], hist[ 5], hist[ 6], hist[ 7],
+              hist[ 8], hist[ 9], hist[10], hist[11], hist[12], hist[13], hist[14], hist[15]);
+    }
+
+
+    // without counts, reports regions with mer coverage.
+    // Orientation tells us nothing, since the mers are probably canonical
+    if (operation == OP_REGIONS) {
+      uint64    beg = ~uint64ZERO;
+      uint64    end = ~uint64ZERO;
+      uint64    pos = ~uint64ZERO;
+
+      uint64    numCovReg = 0;
+      uint64    lenCovReg = 0;
+
+      while (MS->nextMer()) {
+        if (E->exists(MS->theFMer()) || E->exists(MS->theRMer())) {
+          pos = MS->thePositionInSequence();
+
+          // First covered mer of this sequence opens the first region.
+          if (beg == ~uint64ZERO)
+            beg = end = pos;
+
+          // Extend the region while this mer touches or overlaps it;
+          // otherwise report the region and open a new one here.
+          if (pos <= end + merSize) {
+            end = pos;
+          } else {
+            fprintf(stdout, "%s\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\n", S->header(), beg, end+merSize, end+merSize - beg);
+            numCovReg++;
+            lenCovReg += end+merSize - beg;
+            beg = end = pos;
+          }
+        } else {
+          fprintf(stdout, "%s\t" uint64FMT "\tuncovered\n", S->header(), MS->thePositionInSequence());
+        }
+      }
+
+      // The last open region is printed but not added to the totals below.
+      if (beg != ~uint64ZERO)
+        fprintf(stdout, "%s\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\n", S->header(), beg, end+merSize, end+merSize - beg);
+
+      fprintf(stderr, "numCovReg: " uint64FMT "\n", numCovReg);
+      fprintf(stderr, "lenCovReg: " uint64FMT "\n", lenCovReg);
+    }
+
+
+
+    if (operation == OP_DETAILS) {
+      char  merString[256];
+
+      while (MS->nextMer()) {
+        uint64 beg = MS->thePositionInSequence();
+        uint64 end = beg + merSize;
+        uint64 fnt = E->count(MS->theFMer());
+        uint64 rnt = E->count(MS->theRMer());
+
+        fprintf(stdout, "%s\t%s\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\t" uint64FMT "\n",
+                S->header(),
+                MS->theFMer().merToString(merString),
+                beg,
+                end,
+                fnt,
+                rnt,
+                fnt + rnt);
+      }
+    }
+
+
+    delete MS;
+    delete S;
+  }
+
+
+  delete F;
+  delete E;
+}
diff --git a/meryl/maskMers.C b/meryl/maskMers.C
new file mode 100644
index 0000000..a717054
--- /dev/null
+++ b/meryl/maskMers.C
@@ -0,0 +1,591 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "bio++.H"
+#include "seqStream.H"
+#include "libmeryl.H"
+
+#include <algorithm>
+
+#define MAX_COVERAGE 51
+
+//  Insert-size model for one mate-pair library: the mean and standard
+//  deviation of the insert size, the X-coverage of the library, and a
+//  table of the normal density evaluated at integer offsets within
+//  three standard deviations of zero.
+//
+//  The object owns the density table, so copying is disabled.
+//
+class mateRescueData {
+public:
+  mateRescueData() {
+    _mean       = 0;
+    _stddev     = 0;
+    _coverage   = 0;
+    _normal     = 0L;
+    _normalZero = 0;
+  };
+
+  //  Set the library parameters and (re)build the density table.
+  //
+  //  Requires stddev_ >= 0 and mean_ > 3 * stddev_ (both asserted).
+  //  The table has one slot per integer offset in [-3*stddev, 3*stddev];
+  //  the last slot (offset +3*stddev) is left at zero.
+  //
+  void      init(int32 mean_, int32 stddev_, uint32 coverage_) {
+    _mean     = mean_;
+    _stddev   = stddev_;
+    _coverage = coverage_;
+
+    //  A negative stddev would give a negative table size below.
+    assert(_stddev >= 0);
+    assert(_mean > 3 * _stddev);
+
+    double  a = 1.0 / (_stddev * sqrt(2 * M_PI));
+    double  c = 2 * _stddev * _stddev;
+
+    int32  b1l = -3 * _stddev;
+    int32  b1h =  3 * _stddev;
+
+    //  Release any table left over from a previous init().
+    delete [] _normal;
+
+    _normal     = new double [b1h - b1l + 1];
+    _normalZero = -b1l;
+
+    for (int32 l=0; l<b1h - b1l + 1; l++)
+      _normal[l] = 0.0;
+
+    for (int32 l=b1l; l<b1h; l++)
+      _normal[l + _normalZero] = a * exp(- l*l / c);
+  };
+
+  ~mateRescueData() {
+    delete [] _normal;
+  };
+
+  int32     mean(void)      { return(_mean);     };
+  int32     stddev(void)    { return(_stddev);   };
+  uint32    coverage(void)  { return(_coverage); };
+
+  //  Density at integer offset p from the mean.  No bounds check is done;
+  //  p must lie in [-3*stddev, 3*stddev] and init() must have been called.
+  //
+  double    normal(int32 p) { return(_normal[p + _normalZero]); };
+
+private:
+  //  Not copyable: the destructor frees _normal.  Declared, never defined.
+  mateRescueData(const mateRescueData &);
+  mateRescueData &operator=(const mateRescueData &);
+
+  int32     _mean;
+  int32     _stddev;
+  uint32    _coverage;
+
+  double   *_normal;      //  Density table, owned.
+  int32     _normalZero;  //  Index in _normal of offset zero.
+};
+
+
+//  Per-base masking of a set of sequences, derived from a meryl mer
+//  database.  Each position is marked 'g' (gap), 'u' (unique mer) or
+//  'r' (repeat mer); repeat positions may also carry a repeat ID.
+//
+//  The masking is cached in a file named '<merylName>.maskMers'.  If that
+//  file exists it is loaded, otherwise the masking is built from the
+//  inputs (which also writes the cache).
+//
+class merMaskedSequence {
+public:
+  //  fastaName_   - sequence file
+  //  merylName_   - meryl database name, also the prefix of the cache file
+  //  onlySeqIID_  - when loading from the cache, load only this sequence;
+  //                 any value >= the number of sequences loads them all
+  //
+  merMaskedSequence(char *fastaName_, char *merylName_, uint32 onlySeqIID_=~uint32ZERO) {
+    _numSeq        = 0;
+    _seqLen        = 0L;
+    _masking       = 0L;
+    _repeatID      = 0L;
+    _merSize       = 0;
+
+    //  Bounded copies; an over-long name is a fatal error instead of a
+    //  buffer overflow.
+    setName(_fastaName,    fastaName_, "");
+    setName(_merylName,    merylName_, "");
+    setName(_maskMersName, merylName_, ".maskMers");
+
+    if (fileExists(_maskMersName))
+      loadMasking(onlySeqIID_);
+    else
+      buildMasking();
+  };
+  ~merMaskedSequence() {
+    delete [] _seqLen;
+    for (uint32 i=0; i<_numSeq; i++) {
+      delete [] _masking[i];
+      delete [] _repeatID[i];
+    }
+    delete [] _masking;
+    delete [] _repeatID;
+  };
+
+public:
+  uint32     numSeq(void)                  { return(_numSeq); };
+
+  //  Length of sequence i, or zero if that sequence was not loaded.
+  int32      seqLen(uint32 i)              { return(_seqLen[i]); };
+
+  //  Accessors do no bounds checking.
+  char       masking(uint32 s, uint32 p)   { return(_masking[s][p]); };
+  uint32     repeatID(uint32 s, uint32 p)  { return(_repeatID[s][p]); };
+
+  uint32     merSize(void)                 { return(_merSize); };
+
+private:
+  //  Store src followed by suffix into dst (FILENAME_MAX bytes); exit if
+  //  the result does not fit.
+  static void setName(char *dst, const char *src, const char *suffix) {
+    int n = snprintf(dst, FILENAME_MAX, "%s%s", src, suffix);
+
+    if ((n < 0) || (n >= FILENAME_MAX)) {
+      fprintf(stderr, "ERROR: file name '%s%s' is too long.\n", src, suffix);
+      exit(1);
+    }
+  };
+
+  void       loadMasking(uint32 onlySeqIID_=~uint32ZERO);  //  Read the masking from the saved file
+  void       saveMasking(void);                            //  Write the masking to a file
+  void       buildMasking(void);                           //  Read the mers to build the masking
+
+  uint32     _numSeq;
+  int32     *_seqLen;   //  signed just for convenience later (positions are signed for same reason)
+  char     **_masking;
+  uint32   **_repeatID;
+
+  uint32     _merSize;
+
+  char       _fastaName[FILENAME_MAX];
+  char       _merylName[FILENAME_MAX];
+  char       _maskMersName[FILENAME_MAX];
+};
+
+
+//  Load the masking from the cache file _maskMersName.
+//
+//  File layout, as read here: numSeq (uint32), merSize (uint32), numSeq
+//  sequence lengths, then for each sequence its masking bytes followed
+//  by its repeat IDs.
+//
+//  If onlySeqIID_ is a valid sequence index, only that sequence is
+//  loaded; every other sequence is skipped and its length is set to
+//  zero.  Any I/O failure or malformed length is fatal.
+//
+void
+merMaskedSequence::loadMasking(uint32 onlySeqIID_) {
+  FILE  *maskMersFile = fopen(_maskMersName, "r");
+
+  if (maskMersFile == 0L) {
+    fprintf(stderr, "ERROR: failed to open '%s' for reading.\n", _maskMersName);
+    exit(1);
+  }
+
+  if ((fread(&_numSeq,  sizeof(uint32), 1, maskMersFile) != 1) ||
+      (fread(&_merSize, sizeof(uint32), 1, maskMersFile) != 1)) {
+    fprintf(stderr, "ERROR: failed to read the header of '%s'.\n", _maskMersName);
+    exit(1);
+  }
+
+  _seqLen   = new int32    [_numSeq];
+  _masking  = new char   * [_numSeq];
+  _repeatID = new uint32 * [_numSeq];
+
+  //  Clear the pointer tables first so the destructor is safe for any
+  //  entry we never reach.
+  for (uint32 i=0; i<_numSeq; i++) {
+    _masking[i]  = 0L;
+    _repeatID[i] = 0L;
+  }
+
+  fprintf(stderr, uint32FMT " sequences in '%s'\n", _numSeq, _fastaName);
+
+  if (fread(_seqLen, sizeof(uint32), _numSeq, maskMersFile) != _numSeq) {
+    fprintf(stderr, "ERROR: failed to read the sequence lengths from '%s'.\n", _maskMersName);
+    exit(1);
+  }
+
+  for (uint32 i=0; i<_numSeq; i++) {
+    //  Lengths are stored unsigned but held signed; a negative value
+    //  means the file is corrupt or the sequence is too long for us.
+    if (_seqLen[i] < 0) {
+      fprintf(stderr, "ERROR: invalid length for sequence " uint32FMT " in '%s'.\n", i, _maskMersName);
+      exit(1);
+    }
+
+    size_t  len = (size_t)_seqLen[i];
+
+    if ((onlySeqIID_ >= _numSeq) || (onlySeqIID_ == i)) {
+      fprintf(stderr, "Loading sequence " uint32FMT " of length " uint32FMT "\n", i, (uint32)_seqLen[i]);
+
+      _masking[i]  = new char   [len];
+      _repeatID[i] = new uint32 [len];
+
+      if ((fread(_masking[i],  sizeof(char),   len, maskMersFile) != len) ||
+          (fread(_repeatID[i], sizeof(uint32), len, maskMersFile) != len)) {
+        fprintf(stderr, "ERROR: failed to read sequence " uint32FMT " from '%s'.\n", i, _maskMersName);
+        exit(1);
+      }
+    } else {
+      //  Not wanted; skip over its masking and repeat IDs.
+      if ((fseek(maskMersFile, sizeof(char)   * len, SEEK_CUR) != 0) ||
+          (fseek(maskMersFile, sizeof(uint32) * len, SEEK_CUR) != 0)) {
+        fprintf(stderr, "ERROR: failed to skip sequence " uint32FMT " in '%s'.\n", i, _maskMersName);
+        exit(1);
+      }
+
+      //  A zero length marks the sequence as not loaded.
+      _seqLen[i] = 0;
+    }
+  }
+
+  fclose(maskMersFile);
+}
+
+
+//  Write the masking to the cache file _maskMersName: numSeq, merSize,
+//  the sequence lengths, then for each sequence its masking bytes
+//  followed by its repeat IDs.  Any I/O failure is fatal, so a truncated
+//  cache is never left behind silently.
+//
+void
+merMaskedSequence::saveMasking(void) {
+  FILE  *maskMersFile = fopen(_maskMersName, "w");
+  bool   ok           = true;
+
+  if (maskMersFile == 0L) {
+    fprintf(stderr, "ERROR: failed to open '%s' for writing.\n", _maskMersName);
+    exit(1);
+  }
+
+  ok &= (fwrite(&_numSeq,  sizeof(uint32), 1,       maskMersFile) == 1);
+  ok &= (fwrite(&_merSize, sizeof(uint32), 1,       maskMersFile) == 1);
+  ok &= (fwrite( _seqLen,  sizeof(uint32), _numSeq, maskMersFile) == _numSeq);
+
+  for (uint32 i=0; ok && (i<_numSeq); i++) {
+    size_t  len = (size_t)_seqLen[i];
+
+    ok &= (fwrite(_masking[i],  sizeof(char),   len, maskMersFile) == len);
+    ok &= (fwrite(_repeatID[i], sizeof(uint32), len, maskMersFile) == len);
+  }
+
+  //  fclose() flushes, so a full disk can show up only here.
+  if (fclose(maskMersFile) != 0)
+    ok = false;
+
+  if (ok == false) {
+    fprintf(stderr, "ERROR: failed to write '%s'.\n", _maskMersName);
+    exit(1);
+  }
+}
+
+
+//  Build the masking from the sequence file and the meryl database,
+//  then save it to the cache file.
+//
+//  Every position starts as 'g' (gap in sequence).  The start position
+//  of a mer seen exactly once becomes 'u' (unique); the start position
+//  of every copy of a mer seen more than once becomes 'r' (repeat).
+//
+//  Each repeated mer gets its own repeat ID.  The ID is recorded only on
+//  copies that have another copy of the same mer nearby in the same
+//  sequence (less than 40000 positions after the previous copy).
+//
+void
+merMaskedSequence::buildMasking(void) {
+  seqStream  *sequences = new seqStream(_fastaName);
+
+  _numSeq   = sequences->numberOfSequences();
+  _merSize  = 0;
+
+  _seqLen   = new int32    [_numSeq];
+  _masking  = new char   * [_numSeq];
+  _repeatID = new uint32 * [_numSeq];
+
+  fprintf(stderr, uint32FMT " sequences in '%s'\n", _numSeq, _fastaName);
+
+  //  Everything is a gap with no repeat ID until a mer says otherwise.
+  for (uint32 sq=0; sq<_numSeq; sq++) {
+    _seqLen[sq]   = sequences->lengthOf(sq);
+
+    _masking[sq]  = new char   [_seqLen[sq]];
+    _repeatID[sq] = new uint32 [_seqLen[sq]];
+
+    memset(_masking[sq],  'g', sizeof(char)   * _seqLen[sq]);
+    memset(_repeatID[sq],  0,  sizeof(uint32) * _seqLen[sq]);
+  }
+
+  merylStreamReader  *mers     = new merylStreamReader(_merylName);
+  speedCounter       *progress = new speedCounter(" Masking mers in sequence: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, true);
+
+  uint32  repeatCounter = 0;
+
+  _merSize = mers->merSize();
+
+  for (; mers->nextMer(); progress->tick()) {
+
+    if (mers->theCount() != 1) {
+      //  A repeat.  Visit its copies in position order so that
+      //  neighbouring copies are adjacent in the scan.
+      std::sort(mers->thePositions(), mers->thePositions() + mers->theCount());
+
+      uint32  prevSeq = ~uint32ZERO;
+      uint32  prevPos = 0;
+
+      repeatCounter++;
+
+      for (uint32 c=0; c<mers->theCount(); c++) {
+        uint32  pos = mers->getPosition(c);
+        uint32  seq = sequences->sequenceNumberOfPosition(pos);
+
+        //  Convert the stream position to an offset within the sequence.
+        pos -= sequences->startOf(seq);
+
+        _masking[seq][pos] = 'r';
+
+        //  Tag both this copy and the previous one when they are close.
+        if ((prevSeq == seq) && (pos < prevPos + 40000)) {
+          _repeatID[seq][prevPos] = repeatCounter;
+          _repeatID[seq][pos]     = repeatCounter;
+        }
+
+        prevSeq = seq;
+        prevPos = pos;
+      }
+    } else {
+      //  A unique mer; only its single position is marked.
+      uint32  pos = mers->getPosition(0);
+      uint32  seq = sequences->sequenceNumberOfPosition(pos);
+
+      pos -= sequences->startOf(seq);
+
+      _masking[seq][pos] = 'u';
+    }
+  }
+
+  delete progress;
+  delete mers;
+  delete sequences;
+
+  saveMasking();
+}
+
+
+//  For every loaded sequence, write '<outputPrefix>.density.seqNN' with
+//  one line per window of up to 10000 positions: the window start and
+//  the fraction of positions in it that are unique, repeat and gap.
+//
+//  Sequences with length zero (not loaded) are skipped.  Failure to
+//  create an output file is fatal.
+//
+void
+computeDensity(merMaskedSequence *S, char *outputPrefix) {
+  char    outputName[FILENAME_MAX];
+  FILE   *outputFile;
+  uint32  windowSizeMax = 10000;
+
+  for (uint32 s=0; s<S->numSeq(); s++) {
+
+    //  seqLen == 0 iff that sequence is not loaded.
+    if (S->seqLen(s) == 0)
+      continue;
+
+    //  Bounded formatting; a long prefix cannot overflow outputName.
+    snprintf(outputName, FILENAME_MAX, "%s.density.seq" uint32FMTW(02), outputPrefix, s);
+
+    outputFile = fopen(outputName, "w");
+
+    if (outputFile == 0L) {
+      fprintf(stderr, "ERROR: failed to open '%s' for writing.\n", outputName);
+      exit(1);
+    }
+
+    fprintf(stderr, "Starting '%s'\n", outputName);
+
+    fprintf(outputFile, "#window\tunique\trepeat\tgaps\n");
+
+    //  Not the most efficient, but good enough for us right now.
+
+    for (int32 p=0; p<S->seqLen(s); ) {
+      uint32  windowSize = 0;
+      uint32  uniqueSum  = 0;
+      uint32  repeatSum  = 0;
+      uint32  gapSum     = 0;
+
+      //  Consume one window.  The outer loop guarantees at least one
+      //  position, so windowSize is never zero in the divisions below.
+      while ((windowSize < windowSizeMax) &&
+             (p < S->seqLen(s))) {
+        char m = S->masking(s, p);
+
+        if (m == 'u')  uniqueSum++;
+        if (m == 'g')  gapSum++;
+        if (m == 'r')  repeatSum++;
+
+        windowSize++;
+        p++;
+      }
+
+      fprintf(outputFile, uint32FMT "\t%f\t%f\t%f\n",
+              p - windowSize,
+              (double)uniqueSum / windowSize,
+              (double)repeatSum / windowSize,
+              (double)gapSum    / windowSize);
+    }
+
+    fclose(outputFile);
+  }
+}
+
+
+// For each 'r' mer, compute the number of 'u' mers
+// that are within some mean +- stddev range.
+//
+// We count for two blocks:
+//
+// | <- mean -> | <- mean -> |
+// ---[block1]---------------mer---------------[block2]---
+//
+// Once we know that, we can compute the probability that
+// a repeat mer can be rescued.
+//
+// p1 = uniq/total -- for 1 X coverage
+// pn = 1 - (1-p1)^n -- for n X coverage
+
+
+//  Estimate, for every repeat position of every loaded sequence, the
+//  probability that it can be rescued by a uniquely placed mate.
+//
+//  S             - masked sequences
+//  outputPrefix  - prefix for '<prefix>.mateRescue.seqNN.out' (summary
+//                  per X of coverage) and '.dat' (per repeat position)
+//  lib, libLen   - the mate-pair libraries; init() must have been called
+//                  on each
+//
+//  The summed coverage of all libraries must be below MAX_COVERAGE,
+//  because the per-coverage accumulators are indexed from 1 up to that
+//  sum.  Violating this, or failing to create an output file, is fatal.
+//
+void
+computeMateRescue(merMaskedSequence *S, char *outputPrefix, mateRescueData *lib, uint32 libLen) {
+  char     outputName[FILENAME_MAX];
+  FILE    *outputFile;
+  FILE    *outputData;
+
+  uint32   totalDepth      = 0;
+  uint32   closeRepeatsLen = 0;
+  uint32   closeRepeatsMax = 80000;
+
+  for (uint32 l=0; l<libLen; l++) {
+    totalDepth += lib[l].coverage();
+
+    //  The close-repeat scan covers at most 6 * stddev positions; make
+    //  sure the list can hold all of them.
+    if (lib[l].stddev() > 0) {
+      uint32  need = 6 * (uint32)lib[l].stddev() + 2;
+
+      if (closeRepeatsMax < need)
+        closeRepeatsMax = need;
+    }
+  }
+
+  //  numRR[] and numNR[] are indexed 1..totalDepth.
+  if (totalDepth >= MAX_COVERAGE) {
+    fprintf(stderr, "ERROR: total coverage " uint32FMT " is too large; at most %d is supported.\n",
+            totalDepth, MAX_COVERAGE - 1);
+    exit(1);
+  }
+
+  int32         *closeRepeats = new int32 [closeRepeatsMax];
+  speedCounter  *CT           = new speedCounter(" Examining repeats: %7.2f Kbases -- %5.2f Kbases/second\r", 1000.0, 0x1ffff, true);
+
+  for (uint32 s=0; s<S->numSeq(); s++) {
+
+    //  seqLen == 0 iff that sequence is not loaded.
+    if (S->seqLen(s) == 0)
+      continue;
+
+    fprintf(stderr, "Starting sequence " uint32FMT "\n", s);
+
+    snprintf(outputName, FILENAME_MAX, "%s.mateRescue.seq" uint32FMTW(02) ".out", outputPrefix, s);
+    outputFile = fopen(outputName, "w");
+
+    if (outputFile == 0L) {
+      fprintf(stderr, "ERROR: failed to open '%s' for writing.\n", outputName);
+      exit(1);
+    }
+
+    snprintf(outputName, FILENAME_MAX, "%s.mateRescue.seq" uint32FMTW(02) ".dat", outputPrefix, s);
+    outputData = fopen(outputName, "w");
+
+    if (outputData == 0L) {
+      fprintf(stderr, "ERROR: failed to open '%s' for writing.\n", outputName);
+      exit(1);
+    }
+
+    double  numRR[MAX_COVERAGE] = {0};  //  num repeats rescued (expected) for [] X coverage
+    double  numNR[MAX_COVERAGE] = {0};  //  num repeats nonrescuable (expected) for [] X coverage
+
+    uint32  numRT = 0;  //  num repeats total
+
+    for (int32 p=0; p<S->seqLen(s); p++) {
+      CT->tick();
+
+      double pRtot = 0.0;
+      double pFtot = 0.0;
+
+      if ((S->masking(s, p) != 'g') &&
+          (S->masking(s, p) != 'u') &&
+          (S->masking(s, p) != 'r'))
+        fprintf(stderr, "INVALID MASKING - got %d = %c\n", S->masking(s, p), S->masking(s, p));
+
+
+      if (S->masking(s, p) == 'r') {
+        numRT++;
+
+        //  Index over x-coverage in libraries.  MUST BE 1.
+        uint32  ridx = 1;
+
+        for (uint32 l=0; l<libLen; l++) {
+          int32  mean   = lib[l].mean();
+          int32  stddev = lib[l].stddev();
+
+          //  Build a list of the same repeat close to this guy.
+          closeRepeatsLen = 0;
+
+          if (S->repeatID(s, p) > 0) {
+            int32 pl = p - 3 * stddev;
+            int32 ph = p + 3 * stddev;
+
+            if (pl < 0)             pl = 0;
+            if (ph > S->seqLen(s))  ph = S->seqLen(s);
+
+            for (int32 pi=pl; pi<ph; pi++)
+              if ((S->repeatID(s, pi) == S->repeatID(s, p)) && (pi != p))
+                closeRepeats[closeRepeatsLen++] = pi;
+          }
+
+
+          //  The two blocks, one mean to the left and one mean to the
+          //  right, each clipped to the sequence.
+          int32 b1l = p - mean - 3 * stddev;
+          int32 b1h = p - mean + 3 * stddev;
+
+          int32 b2l = p + mean - 3 * stddev;
+          int32 b2h = p + mean + 3 * stddev;
+
+          if (b1l < 0)             b1l = 0;
+          if (b1h < 0)             b1h = 0;
+          if (b1h > S->seqLen(s))  b1h = S->seqLen(s);
+
+          if (b2l < 0)             b2l = 0;
+          if (b2h > S->seqLen(s))  b2h = S->seqLen(s);
+          if (b2l > S->seqLen(s))  b2l = S->seqLen(s);
+
+          //fprintf(stderr, "b1: %d-%d  b2:%d-%d\n", b1l, b1h, b2l, b2h);
+
+          //  probability we can rescue this repeat with this mate pair
+          double  pRescue = 0.0;
+          double  pFailed = 0.0;
+
+          if (closeRepeatsLen == 0) {
+            //  No close repeats, use the fast method.
+            for (int32 b=b1l; b<b1h; b++) {
+              if (S->masking(s, b) == 'u')
+                pRescue += lib[l].normal(b - p + mean);
+            }
+
+            for (int32 b=b2l; b<b2h; b++) {
+              if (S->masking(s, b) == 'u')
+                pRescue += lib[l].normal(b - p - mean);
+            }
+          } else {
+            //  Close repeats, gotta be slow.  A unique mate does not
+            //  rescue if another copy of this repeat also falls in the
+            //  range that mate implies.
+            for (int32 b=b1l; b<b1h; b++) {
+              if (S->masking(s, b) == 'u') {
+                int32  mrl = b + mean - 3 * stddev;
+                int32  mrh = b + mean + 3 * stddev;
+
+                bool   rescuable = true;
+
+                for (uint32 cri=0; rescuable && cri<closeRepeatsLen; cri++)
+                  if ((mrl <= closeRepeats[cri]) && (closeRepeats[cri] <= mrh))
+                    rescuable = false;
+
+                if (rescuable)
+                  pRescue += lib[l].normal(b - p + mean);
+                else
+                  pFailed += lib[l].normal(b - p + mean);
+              }
+            }
+
+            for (int32 b=b2l; b<b2h; b++) {
+              if (S->masking(s, b) == 'u') {
+                int32  mrl = b - mean - 3 * stddev;
+                int32  mrh = b - mean + 3 * stddev;
+
+                bool   rescuable = true;
+
+                for (uint32 cri=0; rescuable && cri<closeRepeatsLen; cri++)
+                  if ((mrl <= closeRepeats[cri]) && (closeRepeats[cri] <= mrh))
+                    rescuable = false;
+
+                if (rescuable)
+                  pRescue += lib[l].normal(b - p - mean);
+                else
+                  pFailed += lib[l].normal(b - p - mean);
+              }
+            }
+          }
+
+          //  We're summing over two distributions.
+          pRescue /= 2.0;
+          pFailed /= 2.0;
+
+          //  Compute probability of rescuing with libraries we've
+          //  seen already, and the expected number of repeats
+          //  rescued.
+          //
+          //  We keep track of the probability we rescue this repeat
+          //  with additional coverage of libraries.  First 1x of the
+          //  first lib, then 2x of the first, etc, etc.
+          //
+          {
+            double  pR = 1.0;
+            double  pF = 1.0;
+            for (uint32 x=0; x<lib[l].coverage(); x++) {
+              //  Makes it here.  pRescue != 1.0
+              pR    *= (1.0 - pRescue);
+              numRR[ridx] += 1 - pR;
+              pRtot       += 1 - pR;
+
+              pF    *= (1.0 - pFailed);
+              numNR[ridx] += 1 - pF;
+              pFtot       += 1 - pF;
+
+              ridx++;
+            }
+          }
+        }  // over all libs
+
+        fprintf(outputData, int32FMT "\t%f\t%f\n", p, pRtot / totalDepth, pFtot / totalDepth);
+
+      }  // if masking is r
+    }  // over all positions
+
+    fprintf(outputFile, "seqIID\tmerSize\ttRepeat\teRescue\teFailed\tXcov\tmean\tstddev\n");
+
+    //  One line per X of coverage; n counts the coverage used so far
+    //  within library l.
+    for (uint32 x=1, l=0, n=0; l<libLen; x++) {
+      fprintf(outputFile, uint32FMT "\t" uint32FMT "\t" uint32FMT "\t%.0f\t%.0f\t" uint32FMT "\t" int32FMT "\t" int32FMT "\n",
+              s, S->merSize(), numRT, numRR[x], numNR[x], x, lib[l].mean(), lib[l].stddev());
+      n++;
+      if (n >= lib[l].coverage()) {
+        l++;
+        n = 0;
+      }
+    }
+
+    fclose(outputFile);
+    fclose(outputData);
+  }
+
+  delete    CT;
+  delete [] closeRepeats;
+}
+
+
+
+//  Command line driver.
+//
+//    -mers M      meryl database (required)
+//    -seq F       sequence file (required)
+//    -output P    output prefix (required)
+//    -only I      load only sequence I from a saved masking
+//    -d           compute the density of unique/repeat/gap positions
+//    -r m s c     add a mate library with mean m, stddev s and coverage c;
+//                 may be repeated; a library with c <= 0 is ignored
+//
+int
+main(int argc, char **argv) {
+  char     *merylName    = 0L;
+  char     *fastaName    = 0L;
+  char     *outputPrefix = 0L;
+
+  uint32    onlySeqIID = ~uint32ZERO;
+
+  bool      doDensity = false;
+  bool      doRescue  = false;
+
+  mateRescueData  lib[MAX_COVERAGE];
+  uint32          libLen  = 0;
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-mers") == 0) {
+      //  A missing value leaves this null (argv[argc]), which is caught
+      //  by the usage check below.  Same for -seq and -output.
+      merylName = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-seq") == 0) {
+      fastaName = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-only") == 0) {
+      //  atoi() must not be handed the terminating null pointer.
+      if (++arg < argc) {
+        onlySeqIID = atoi(argv[arg]);
+      } else {
+        fprintf(stderr, "option '-only' needs a value\n");
+        err++;
+      }
+
+    } else if (strcmp(argv[arg], "-output") == 0) {
+      outputPrefix = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-d") == 0) {
+      doDensity = true;
+
+    } else if (strcmp(argv[arg], "-r") == 0) {
+      if (arg + 3 >= argc) {
+        //  Not enough values left; stop parsing.
+        fprintf(stderr, "option '-r' needs three values: mean stddev coverage\n");
+        err++;
+        arg = argc;
+      } else {
+        if (atoi(argv[arg+3]) > 0) {
+          if (libLen >= MAX_COVERAGE) {
+            fprintf(stderr, "too many '-r' libraries; at most %d are supported\n", MAX_COVERAGE);
+            err++;
+          } else {
+            doRescue = true;
+            lib[libLen++].init(atoi(argv[arg+1]), atoi(argv[arg+2]), atoi(argv[arg+3]));
+          }
+        }
+        arg += 3;
+      }
+
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+      err++;
+    }
+    arg++;
+  }
+  if ((err) || (merylName == 0L) || (fastaName == 0L) || (outputPrefix == 0L)) {
+    fprintf(stderr, "usage: %s -mers mers -seq fasta -output prefix [-d] [-r mean stddev coverage]\n", argv[0]);
+    exit(1);
+  }
+
+  merMaskedSequence *S = new merMaskedSequence(fastaName, merylName, onlySeqIID);
+
+  if (doDensity)
+    computeDensity(S, outputPrefix);
+
+  if (doRescue)
+    computeMateRescue(S, outputPrefix, lib, libLen);
+
+  delete S;
+
+  return(0);
+}
diff --git a/meryl/merge.C b/meryl/merge.C
new file mode 100644
index 0000000..1931117
--- /dev/null
+++ b/meryl/merge.C
@@ -0,0 +1,240 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+
+
+//  Combine two or more meryl databases into one, according to
+//  args->personality (merge, min, minexist, max, maxexist, add, and,
+//  nand, or, xor).
+//
+//  The inputs are read in parallel.  Each pass picks the smallest mer at
+//  the head of any input, folds its count into the running result for
+//  that mer, and advances that input.  When the smallest mer changes (or
+//  all inputs are exhausted) the finished mer is written out.
+//
+//  All inputs must have the same mer size and compression level.  The
+//  output uses the largest prefix size of any input.
+//
+void
+multipleOperations(merylArgs *args) {
+
+  if (args->mergeFilesLen < 2) {
+    fprintf(stderr, "ERROR - must have at least two databases (you gave " uint32FMT ")!\n", args->mergeFilesLen);
+    exit(1);
+  }
+  if (args->outputFile == 0L) {
+    fprintf(stderr, "ERROR - no output file specified.\n");
+    exit(1);
+  }
+  if ((args->personality != PERSONALITY_MERGE) &&
+      (args->personality != PERSONALITY_MIN) &&
+      (args->personality != PERSONALITY_MINEXIST) &&
+      (args->personality != PERSONALITY_MAX) &&
+      (args->personality != PERSONALITY_MAXEXIST) &&
+      (args->personality != PERSONALITY_ADD) &&
+      (args->personality != PERSONALITY_AND) &&
+      (args->personality != PERSONALITY_NAND) &&
+      (args->personality != PERSONALITY_OR) &&
+      (args->personality != PERSONALITY_XOR)) {
+    fprintf(stderr, "ERROR - only personalities min, minexist, max, maxexist, add, and, nand, or, xor\n");
+    fprintf(stderr, "ERROR - are supported in multipleOperations().  (%d)\n", args->personality);
+    fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+    exit(1);
+  }
+
+  merylStreamReader  **R = new merylStreamReader* [args->mergeFilesLen];
+  merylStreamWriter   *W = 0L;
+
+  //  Open the input files, read in the first mer
+  //
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    R[i] = new merylStreamReader(args->mergeFiles[i]);
+    R[i]->nextMer();
+  }
+
+  //  Verify that the mersizes are all the same
+  //
+  bool    fail       = false;
+  uint32  merSize    = R[0]->merSize();
+  uint32  merComp    = R[0]->merCompression();
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    fail |= (merSize != R[i]->merSize());
+    fail |= (merComp != R[i]->merCompression());
+  }
+
+  if (fail)
+    fprintf(stderr, "ERROR:  mer sizes (or compression level) differ.\n"), exit(1);
+
+  //  Open the output file, using the largest prefix size found in the
+  //  input/mask files.
+  //
+  uint32  prefixSize = 0;
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    if (prefixSize < R[i]->prefixSize())
+      prefixSize = R[i]->prefixSize();
+
+  W = new merylStreamWriter(args->outputFile, merSize, merComp, prefixSize, args->positionsEnabled);
+
+  //  We will find the smallest mer in any file, and count the number of times
+  //  it is present in the input files.
+
+  bool     moreInput        = true;
+
+  kMer     currentMer;                      //  The current mer we're operating on
+  uint32   currentCount     = uint32ZERO;   //  The count (operation dependent) of this mer
+  uint32   currentTimes     = uint32ZERO;   //  Number of files it's in
+
+  uint32   currentPositionsMax = 0;
+  uint32  *currentPositions    = 0L;
+
+  kMer     thisMer;                         //  The mer we just read
+  uint32   thisFile         = ~uint32ZERO;  //  The file we read it from
+  uint32   thisCount        = uint32ZERO;   //  The count of the mer we just read
+
+  speedCounter *C = new speedCounter("    %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+
+  currentMer.setMerSize(merSize);
+  thisMer.setMerSize(merSize);
+
+  while (moreInput) {
+
+    //  Find the smallest mer present in any input file.
+    //
+    moreInput     = false;
+    thisMer.clear();
+    thisFile      = ~uint32ZERO;
+    thisCount     = uint32ZERO;
+
+    //  Load thisMer with the first valid mer
+    for (uint32 i=0; i<args->mergeFilesLen && !moreInput; i++)
+      if (R[i]->validMer()) {
+        moreInput     = true;
+        thisCount     = R[i]->theCount();
+        thisFile      = i;
+        thisMer       = R[i]->theFMer();
+      }
+
+    //  Now find the smallest one
+    if (moreInput) {
+      for (uint32 i=thisFile+1; i<args->mergeFilesLen; i++)
+        if ((R[i]->validMer()) && (R[i]->theFMer()) < thisMer) {
+          moreInput     = true;
+          thisCount     = R[i]->theCount();
+          thisFile      = i;
+          thisMer       = R[i]->theFMer();
+        }
+    }
+
+    //  If we've hit a different mer, write out the last one
+    //
+    //  NOTE(review): on the very first pass currentMer is still the
+    //  initial mer with currentCount and currentTimes both zero, so the
+    //  unconditional personalities call addMer() with a zero count.
+    //  Presumably the writer ignores that; confirm in merylStreamWriter.
+    //
+    if ((moreInput == false) || (thisMer != currentMer)) {
+      switch (args->personality) {
+        case PERSONALITY_MIN:
+        case PERSONALITY_MAX:
+          if (currentTimes == args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_MERGE:
+        case PERSONALITY_MINEXIST:
+        case PERSONALITY_MAXEXIST:
+        case PERSONALITY_ADD:
+          W->addMer(currentMer, currentCount, currentPositions);
+          break;
+        case PERSONALITY_AND:
+          if (currentTimes == args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_NAND:
+          if (currentTimes != args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_OR:
+          W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_XOR:
+          if ((currentTimes % 2) == 1)
+            W->addMer(currentMer, currentCount);
+          break;
+        default:
+          fprintf(stderr, "ERROR - invalid personality in multipleOperations::write\n");
+          fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+          exit(1);
+          break;
+      }
+
+      currentMer = thisMer;
+
+      currentCount = uint32ZERO;
+      currentTimes = uint32ZERO;
+
+      C->tick();
+    }
+
+    //  All done?  Exit.
+    if (moreInput == false)
+      continue;
+
+    //  Perform the operation
+    switch (args->personality) {
+      case PERSONALITY_MERGE:
+        if (R[thisFile]->thePositions()) {
+          if (currentPositionsMax == 0) {
+            currentPositionsMax = 1048576;
+            currentPositions    = new uint32 [currentPositionsMax];
+          }
+
+          //  Grow by doubling until the combined positions fit.
+          if (currentPositionsMax < currentCount + thisCount) {
+            while (currentPositionsMax < currentCount + thisCount)
+              currentPositionsMax *= 2;
+
+            uint32 *t = new uint32 [currentPositionsMax];
+            memcpy(t, currentPositions, sizeof(uint32) * currentCount);
+            delete [] currentPositions;
+            currentPositions = t;
+          }
+
+          if (thisCount < 16) {
+            uint32 *p = R[thisFile]->thePositions();
+            for (uint32 i=0; i<thisCount; i++)
+              currentPositions[currentCount + i] = p[i];
+          } else {
+            memcpy(currentPositions + currentCount, R[thisFile]->thePositions(), sizeof(uint32) * thisCount);
+          }
+        }
+        //  Otherwise, we're the same as ADD.
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_MIN:
+      case PERSONALITY_MINEXIST:
+        if (currentTimes == 0) {
+          currentCount = thisCount;
+        } else {
+          if (currentCount > thisCount)
+            currentCount = thisCount;
+        }
+        break;
+      case PERSONALITY_MAX:
+      case PERSONALITY_MAXEXIST:
+        if (currentCount < thisCount)
+          currentCount = thisCount;
+        break;
+      case PERSONALITY_ADD:
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_AND:
+      case PERSONALITY_NAND:
+      case PERSONALITY_OR:
+      case PERSONALITY_XOR:
+        currentCount = 1;
+        break;
+      default:
+        fprintf(stderr, "ERROR - invalid personality in multipleOperations::operate\n");
+        fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+        exit(1);
+        break;
+    }
+
+    currentTimes++;
+
+    //  Move the file we just read from to the next mer
+    R[thisFile]->nextMer();
+  }
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    delete R[i];
+
+  //  R came from new[], so it needs delete[]; the position buffer is
+  //  ours to release as well.
+  delete [] R;
+  delete [] currentPositions;
+
+  delete W;
+  delete C;
+}
diff --git a/meryl/merge.listmerge.C b/meryl/merge.listmerge.C
new file mode 100644
index 0000000..47424c5
--- /dev/null
+++ b/meryl/merge.listmerge.C
@@ -0,0 +1,447 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+
+using namespace std;
+
+#include <algorithm>
+
+// One node of the index-linked mer list kept by mMerList.
+struct mMer {
+ kMer _mer; // the mer itself
+ uint32 _cnt; // count of the mer, as reported by the reader
+ uint32 _off; // offset of this mer's positions in the shared position array, or ~uint32ZERO if it has none
+ uint32 _nxt; // index of the next node in the list this node is on (sorted list or free list); ~uint32ZERO ends a list
+ uint32 _stp; // set to 1 on the node that marks where more mers must be loaded
+};
+
+
+// A list of mers kept in sorted order, stored as an array of nodes that
+// are linked by index. Nodes not on the sorted list are chained on a
+// free list. Positions of all mers share one array, _pos.
+//
+// An index of ~uint32ZERO (anything >= _mmmMax) means "no node".
+//
+class mMerList {
+public:
+ // Allocate room for maxSize mers and 2 * maxSize positions. The sorted
+ // list starts empty; every node starts on the free list.
+ mMerList(uint32 maxSize) {
+ _posLen = 0;
+ _posMax = 2 * maxSize;
+ _pos = new uint32 [_posMax];
+
+ _mmmLen = 0;
+ _mmmMax = maxSize;
+ _mmm = new mMer [_mmmMax];
+
+ _tip = ~uint32ZERO;
+ _fre = 0;
+
+ // Chain every node to its successor to form the free list.
+ for (uint32 i=0; i<_mmmMax; i++) {
+ _mmm[i]._cnt = 0;
+ _mmm[i]._off = 0;
+ _mmm[i]._nxt = i+1;
+ _mmm[i]._stp = 0;
+ }
+
+ _mmm[_mmmMax-1]._nxt = ~uint32ZERO;
+ };
+ ~mMerList() {
+ delete [] _pos;
+ delete [] _mmm;
+ };
+
+ // True if the sorted list is empty, or if the node at the head of the
+ // list carries the stop mark set by read().
+ bool loadMore(void) { return((_mmmMax < _tip) || (_mmm[_tip]._stp == 1)); };
+
+ // Number of mers currently on the sorted list.
+ uint32 length(void) { return(_mmmLen); };
+
+ // Remove the head of the sorted list. Returns a pointer to its mer and
+ // sets cnt to its count and pos to its positions (0L if it has none).
+ // Returns 0L, leaving cnt and pos untouched, if the list is empty.
+ //
+ // The node is moved to the free list, so the returned pointer refers
+ // to storage that a later read() will reuse.
+ //
+ kMer *pop(uint32 &cnt, uint32* &pos) {
+ kMer *ret = 0L;
+
+ //fprintf(stderr, "POP tip="uint32FMT"\n", _tip);
+
+ if (_tip < _mmmMax) {
+ uint32 f = _tip;
+
+ ret = &_mmm[f]._mer;
+ cnt = _mmm[f]._cnt;
+ pos = (_mmm[f]._off != ~uint32ZERO) ? _pos + _mmm[f]._off : 0L;
+
+ // Move tip to the next thing
+ _tip = _mmm[f]._nxt;
+
+ // And push this one onto the front of the free list.
+ _mmm[f]._nxt = _fre;
+ _fre = f;
+
+ _mmmLen--;
+
+ //fprintf(stderr, "POP f="uint32FMT" tip="uint32FMT" len="uint32FMT"\n", f, _tip, _mmmLen);
+ }
+
+ return(ret);
+ };
+
+
+ // rebuild the position list, squeezes out empty items
+ //
+ // As written this aborts via assert(0) whenever any positions are
+ // stored (unless asserts are compiled out).
+ //
+ // NOTE(review): the code after the assert looks unfinished. It walks
+ // array slots 0.._mmmLen-1 rather than following the list links, it
+ // stores the new offset into _mmm[_mmmLen] rather than into m, and it
+ // copies from _pos[p] without adding the mer's old offset. Confirm
+ // the intent before removing the assert.
+ //
+ void rebuild(void) {
+ if (_posLen > 0) {
+ assert(0);
+ uint32 *np = new uint32 [_posMax];
+
+ _posLen = 0;
+
+ for (uint32 i=0; i<_mmmLen; i++) {
+ mMer *m = _mmm + i;
+
+ if (m->_off != ~uint32ZERO) {
+ _mmm[_mmmLen]._off = _posLen;
+
+ for (uint32 p=0; p<m->_cnt; p++, _posLen++)
+ np[_posLen] = _pos[p];
+ }
+ }
+
+ delete [] _pos;
+ _pos = np;
+ }
+ };
+
+
+
+ // Read more mers from the file
+ //
+ // Mers are read from R and inserted into the sorted list, after any
+ // equal mer already present.
+ //
+ // With loadAll true, up to num mers are read and the stop mark is then
+ // set on one node (see the end of this function).
+ //
+ // With loadAll false, num is ignored once the capacity assert has
+ // passed, and reading continues until a mer is appended past the end
+ // of the list or the input runs out.
+ //
+ void read(merylStreamReader *R, uint32 num, bool loadAll) {
+ uint32 xxx = 0;
+ uint32 las = ~uint32ZERO; // node before the insertion point, if any
+ uint32 pos = _tip; // node at the insertion point, if any
+ bool stop = false;
+
+ //fprintf(stderr, "read()- loading "uint32FMT"\n", num);
+
+ assert(_mmmLen + num < _mmmMax);
+
+ // Load until we hit the sentinel.
+ // NOTE(review): nothing below checks that the free list still has a
+ // node when num is unbounded - confirm it cannot run out.
+ if (loadAll == false)
+ num = ~uint32ZERO;
+
+ for (xxx=0; (xxx < num) && (stop == false) && (R->nextMer()); xxx++) {
+
+ // Insert into a free node
+ uint32 fre = _fre;
+ _fre = _mmm[fre]._nxt;
+
+ _mmm[fre]._mer = R->theFMer();
+ _mmm[fre]._cnt = R->theCount();
+ _mmm[fre]._off = ~uint32ZERO;
+ _mmm[fre]._stp = 0;
+
+ uint32 *ppp = R->thePositions();
+ if (ppp) {
+ _mmm[fre]._off = _posLen;
+
+ // NOTE(review): the array is doubled only once, which is not
+ // enough if this mer's count exceeds the old _posMax.
+ if (_posMax <= _posLen + _mmm[fre]._cnt) {
+ fprintf(stderr, "Reallocate _pos\n");
+ _posMax *= 2;
+ uint32 *tmp = new uint32 [_posMax];
+ memcpy(tmp, _pos, sizeof(uint32) * _posLen);
+ delete [] _pos;
+ _pos = tmp;
+ }
+
+ for (uint32 i=0; i<_mmm[fre]._cnt; i++, _posLen++)
+ _pos[_posLen] = ppp[i];
+ }
+
+ // Keep count
+ _mmmLen++;
+
+ // Figure out where to put it in the list. New duplicates must
+ // go AFTER the existing -- that's the job of <=.
+ //
+ // The scan resumes from where the previous insertion ended rather
+ // than from the head of the list.
+
+ while ((pos < _mmmMax) && (_mmm[pos]._mer <= R->theFMer())) {
+ las = pos;
+ pos = _mmm[pos]._nxt;
+ }
+
+ if (_mmmMax < _tip) {
+ // No tip, make new list.
+ _mmm[fre]._nxt = _tip;
+ _tip = fre;
+ las = ~uint32ZERO;
+ pos = _tip;
+ } else if (_mmmMax < las) {
+ // Valid list, but we want to insert before the start
+ _mmm[fre]._nxt = _tip;
+ _tip = fre;
+ las = ~uint32ZERO;
+ pos = _tip;
+ } else if (pos < _mmmMax) {
+ // Valid pos, insert in the middle (after las, before pos)
+ _mmm[fre]._nxt = _mmm[las]._nxt;
+ _mmm[las]._nxt = fre;
+ las = fre;
+ //pos = _mmm[las]._nxt;
+ } else {
+ // Have a list, but we ran off the end, append (after las)
+ _mmm[fre]._nxt = ~uint32ZERO;
+ _mmm[las]._nxt = fre;
+ pos = fre;
+
+ if (loadAll == false)
+ stop = true;
+ }
+ }
+
+ // Set the sentinel. This forces us to load more mers.
+ //
+ // NOTE(review): las is still ~uint32ZERO here if no mer was read, or
+ // if the last insertion was at the head of the list; the write below
+ // would then be out of bounds. Confirm callers cannot reach this.
+ //
+ if (loadAll == true) {
+ //fprintf(stderr, "read()-- stop on tip = "uint32FMT"\n", las);
+ _mmm[las]._stp = 1;
+ }
+
+ //fprintf(stderr, "read()-- now up to "uint32FMT" mers ("uint32FMT" pos); loaded "uint32FMT" out of "uint32FMT" requested.\n", _mmmLen, _posLen, xxx, num);
+ };
+
+private:
+ uint32 _posLen; // number of entries used in _pos
+ uint32 _posMax; // number of entries allocated in _pos
+ uint32 *_pos; // positions of all mers, addressed by mMer::_off
+
+ uint32 _mmmLen; // number of nodes on the sorted list
+ uint32 _mmmMax; // number of nodes allocated in _mmm
+ mMer *_mmm; // the nodes
+
+ uint32 _tip; // index of the head of the sorted list
+ uint32 _fre; // index of the head of the free list
+};
+
+
+
+
+//  Combine two or more meryl databases into one, according to
+//  args->personality (merge, min, minexist, max, add, and, nand, or,
+//  xor).
+//
+//  This variant loads mers from all inputs into one sorted mMerList and
+//  pops them in order.  Equal mers popped in succession are folded into
+//  a running result, which is written out when the mer changes or the
+//  list is exhausted.
+//
+//  All inputs must have the same mer size and compression level.  The
+//  output uses the largest prefix size of any input.
+//
+void
+multipleOperations(merylArgs *args) {
+
+  if (args->mergeFilesLen < 2) {
+    fprintf(stderr, "ERROR - must have at least two databases (you gave " uint32FMT ")!\n", args->mergeFilesLen);
+    exit(1);
+  }
+  if (args->outputFile == 0L) {
+    fprintf(stderr, "ERROR - no output file specified.\n");
+    exit(1);
+  }
+  if ((args->personality != PERSONALITY_MERGE) &&
+      (args->personality != PERSONALITY_MIN) &&
+      (args->personality != PERSONALITY_MINEXIST) &&
+      (args->personality != PERSONALITY_MAX) &&
+      (args->personality != PERSONALITY_ADD) &&
+      (args->personality != PERSONALITY_AND) &&
+      (args->personality != PERSONALITY_NAND) &&
+      (args->personality != PERSONALITY_OR) &&
+      (args->personality != PERSONALITY_XOR)) {
+    fprintf(stderr, "ERROR - only personalities min, minexist, max, add, and, nand, or, xor\n");
+    fprintf(stderr, "ERROR - are supported in multipleOperations().\n");
+    fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+    exit(1);
+  }
+
+  uint32               maxSize = 64 * 1024 * 1024;
+
+  merylStreamReader  **R = new merylStreamReader* [args->mergeFilesLen];
+  merylStreamWriter   *W = 0L;
+  mMerList            *M = new mMerList(maxSize + maxSize / 4);
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    R[i] = new merylStreamReader(args->mergeFiles[i]);
+
+  //  Verify that the mersizes are all the same
+  //
+  bool    fail       = false;
+  uint32  merSize    = R[0]->merSize();
+  uint32  merComp    = R[0]->merCompression();
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    fail |= (merSize != R[i]->merSize());
+    fail |= (merComp != R[i]->merCompression());
+  }
+
+  if (fail)
+    fprintf(stderr, "ERROR:  mer size or compression level differ.\n"), exit(1);
+
+  //  Open the output file, using the largest prefix size found in the
+  //  input/mask files.
+  //
+  uint32  prefixSize = 0;
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    if (prefixSize < R[i]->prefixSize())
+      prefixSize = R[i]->prefixSize();
+
+  W = new merylStreamWriter(args->outputFile, merSize, merComp, prefixSize);
+
+  //  Load mers from all files, remember the largest mer we load.
+  //
+  bool     loadAll = true;
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    M->read(R[i], maxSize / args->mergeFilesLen, loadAll);
+    loadAll = false;
+  }
+
+  fprintf(stderr, "Initial load:  length=" uint32FMT "\n", M->length());
+
+  bool     moreStuff = true;
+
+  kMer     currentMer;                      //  The current mer we're operating on
+  uint32   currentCount     = uint32ZERO;   //  The count (operation dependent) of this mer
+  uint32   currentTimes     = uint32ZERO;   //  Number of files it's in
+
+  uint32   currentPositionsMax = 0;
+  uint32  *currentPositions    = 0L;
+
+  kMer    *thisMer          = 0L;           //  The mer we just read
+  uint32   thisCount        = uint32ZERO;   //  The count of the mer we just read
+  uint32  *thisPositions    = 0L;
+
+  speedCounter *C = new speedCounter("    %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+
+  currentMer.setMerSize(merSize);
+
+  while (moreStuff) {
+
+    //  Load more stuff if needed.
+    //
+    if (M->loadMore() == true) {
+      M->rebuild();
+
+      uint32 additionalLoading = 8192;
+
+      if (maxSize / args->mergeFilesLen > M->length())
+        additionalLoading = maxSize / args->mergeFilesLen - M->length();
+
+      loadAll = true;
+
+      for (uint32 i=0; i<args->mergeFilesLen; i++) {
+        if (R[i]->validMer()) {
+          M->read(R[i], additionalLoading, loadAll);
+          loadAll = false;
+        }
+      }
+    }
+
+    //  Take the smallest mer.  pop() returns null only when the list is
+    //  empty, and that is the one reliable "all done" signal: testing
+    //  length() after the pop would also fire on the last real mer.
+    //
+    thisMer = M->pop(thisCount, thisPositions);
+
+    if (thisMer == 0L)
+      moreStuff = false;
+
+    //  If we've hit a different mer, write out the last one
+    if ((moreStuff == false) || (*thisMer != currentMer)) {
+      switch (args->personality) {
+        case PERSONALITY_MIN:
+          if (currentTimes == args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_MERGE:
+        case PERSONALITY_MINEXIST:
+        case PERSONALITY_MAX:
+        case PERSONALITY_ADD:
+          W->addMer(currentMer, currentCount, currentPositions);
+          break;
+        case PERSONALITY_AND:
+          if (currentTimes == args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_NAND:
+          if (currentTimes != args->mergeFilesLen)
+            W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_OR:
+          W->addMer(currentMer, currentCount);
+          break;
+        case PERSONALITY_XOR:
+          if ((currentTimes % 2) == 1)
+            W->addMer(currentMer, currentCount);
+          break;
+        default:
+          fprintf(stderr, "ERROR - invalid personality in multipleOperations::write\n");
+          fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+          exit(1);
+          break;
+      }
+
+      //  thisMer is null once the list is exhausted; do not dereference.
+      if (moreStuff)
+        currentMer = *thisMer;
+
+      currentCount = uint32ZERO;
+      currentTimes = uint32ZERO;
+
+      C->tick();
+    }
+
+
+    //  All done?  Exit.
+    if (moreStuff == false)
+      break;
+
+
+    //  Perform the operation
+    switch (args->personality) {
+      case PERSONALITY_MERGE:
+        if (thisPositions) {
+
+          if (currentPositionsMax == 0) {
+            currentPositionsMax = 1048576;
+            currentPositions    = new uint32 [currentPositionsMax];
+          }
+
+          //  Grow by doubling until the combined positions fit.
+          if (currentPositionsMax < currentCount + thisCount) {
+            while (currentPositionsMax < currentCount + thisCount)
+              currentPositionsMax *= 2;
+
+            uint32 *t = new uint32 [currentPositionsMax];
+            memcpy(t, currentPositions, sizeof(uint32) * currentCount);
+            delete [] currentPositions;
+            currentPositions = t;
+          }
+
+          if (thisCount < 16) {
+            for (uint32 i=0; i<thisCount; i++)
+              currentPositions[currentCount + i] = thisPositions[i];
+          } else {
+            memcpy(currentPositions + currentCount, thisPositions, sizeof(uint32) * thisCount);
+          }
+        }
+        //  Otherwise, we're the same as ADD.
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_MIN:
+      case PERSONALITY_MINEXIST:
+        if (currentTimes == 0) {
+          currentCount = thisCount;
+        } else {
+          if (currentCount > thisCount)
+            currentCount = thisCount;
+        }
+        break;
+      case PERSONALITY_MAX:
+        if (currentCount < thisCount)
+          currentCount = thisCount;
+        break;
+      case PERSONALITY_ADD:
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_AND:
+      case PERSONALITY_NAND:
+      case PERSONALITY_OR:
+      case PERSONALITY_XOR:
+        currentCount = 1;
+        break;
+      default:
+        fprintf(stderr, "ERROR - invalid personality in multipleOperations::operate\n");
+        fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+        exit(1);
+        break;
+    }
+
+    currentTimes++;
+  }
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    delete R[i];
+
+  //  R came from new[], so it needs delete[]; the position buffer is
+  //  ours to release as well.
+  delete [] R;
+  delete [] currentPositions;
+
+  delete W;
+  delete M;
+  delete C;
+}
diff --git a/meryl/merge.qsort.C b/meryl/merge.qsort.C
new file mode 100644
index 0000000..78654cb
--- /dev/null
+++ b/meryl/merge.qsort.C
@@ -0,0 +1,471 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+
+using namespace std;
+
+#include <algorithm>
+
+// One k-mer record as stored by mMerList.
+struct mMer {
+ kMer _mer; // the k-mer itself
+ uint32 _cnt; // count read alongside the k-mer
+ uint32 _off; // index of this k-mer's first entry in mMerList's position array, or ~uint32ZERO when it has no positions
+};
+
+// qsort-style comparator over mMer records.  The result is whatever
+// kMer::qsort_less() reports when asked about (b's mer, a's mer), i.e. the
+// arguments are deliberately swapped relative to the call order.
+static
+int
+mMerGreaterThan(void const *a, void const *b) {
+  mMer const *lhs = static_cast<mMer const *>(a);
+  mMer const *rhs = static_cast<mMer const *>(b);
+
+  return(rhs->_mer.qsort_less(lhs->_mer));
+}
+
+
+
+// Growable in-memory list of k-mer records, with an optional side array of
+// positions.  Records are appended by read(), ordered by sort(), and then
+// consumed from the END of the array by pop(); rebuild() compacts the
+// position array so that it only holds positions of records still in the list.
+class mMerList {
+public:
+  // maxSize is the initial record capacity; the position array starts at
+  // twice that.  Both grow on demand.
+  mMerList(uint32 maxSize) {
+    //  Nothing is loaded yet, so nothing is out of order.  (This flag used
+    //  to be left uninitialized.)
+    _sorted = true;
+
+    _posLen = 0;
+    _posMax = 2 * maxSize;
+    _pos    = new uint32 [_posMax];
+
+    _mmmLen = 0;
+    _mmmMax = maxSize;
+    _mmm    = new mMer [_mmmMax];
+  };
+  ~mMerList() {
+    delete [] _pos;
+    delete [] _mmm;
+  };
+
+  uint32 length(void) { return(_mmmLen); };
+
+  //  Until we sort, first() is the last thing loaded.
+  //  After we sort, first() is the lowest mer in the set.
+  //  The list must not be empty.
+
+  kMer &first(void) { return(_mmm[_mmmLen-1]._mer); };
+  //kMer &last(void) { return(_mmm[0]._mer); };
+  //kMer &get(uint32 i) { return(_mmm[i]._mer); };
+
+  //  Return the first (sorted order) thing in the list -- it's the last on the list.
+  //  Returns 0L when the list is empty; otherwise cnt receives the record's
+  //  count and pos either its positions or 0L if it has none.  The returned
+  //  pointers refer to storage owned by this list.
+  kMer *pop(uint32 &cnt, uint32* &pos) {
+    if (_mmmLen == 0)
+      return(0L);
+
+    _mmmLen--;
+
+    assert(_sorted);
+
+    cnt = _mmm[_mmmLen]._cnt;
+    pos = 0L;
+
+    if (_mmm[_mmmLen]._off != ~uint32ZERO)
+      pos = _pos + _mmm[_mmmLen]._off;
+
+    return(&_mmm[_mmmLen]._mer);
+  }
+
+
+  //  Rebuild the position list, squeezing out the positions of records that
+  //  have already been popped.
+  void rebuild(void) {
+    if (_posLen == 0)
+      return;
+
+    uint32 *np    = new uint32 [_posMax];
+    uint32  npLen = 0;
+
+    for (uint32 i=0; i<_mmmLen; i++) {
+      mMer *m = _mmm + i;
+
+      if (m->_off == ~uint32ZERO)
+        continue;
+
+      //  Copy from the record's OLD offset and store the NEW offset in the
+      //  record itself.  (The previous code copied from the start of _pos
+      //  and wrote the new offset into the slot past the end of the list.)
+      uint32 const oldOff = m->_off;
+
+      m->_off = npLen;
+
+      for (uint32 p=0; p<m->_cnt; p++)
+        np[npLen++] = _pos[oldOff + p];
+    }
+
+    delete [] _pos;
+    _pos    = np;
+    _posLen = npLen;
+  };
+
+
+
+  //  Read up to num more mers from the file, appending them to the list.
+  //  The list is marked unsorted.
+  //
+  //  NOTE(review): nextMer() is called once before the loop and again after
+  //  every record, so on return the reader may already sit on a record that
+  //  was not stored; the next call then advances past it.  Whether that
+  //  loses a mer depends on merylStreamReader semantics not visible here --
+  //  confirm.
+  void read(merylStreamReader *R, uint32 num) {
+
+    if (_mmmLen + num >= _mmmMax)
+      growMers(_mmmMax + 2 * num);
+
+    _sorted = false;
+
+    R->nextMer();
+
+    for (uint32 xxx=0; (xxx < num) && (R->validMer()); xxx++) {
+      if (_mmmMax <= _mmmLen)
+        growMers((_mmmMax == 0) ? 1 : 2 * _mmmMax);
+
+      _mmm[_mmmLen]._mer = R->theFMer();
+      _mmm[_mmmLen]._cnt = R->theCount();
+      _mmm[_mmmLen]._off = ~uint32ZERO;
+
+      uint32 *pos = R->thePositions();
+      if (pos) {
+        uint32 const cnt = _mmm[_mmmLen]._cnt;
+
+        _mmm[_mmmLen]._off = _posLen;
+
+        if (_posMax <= _posLen + cnt) {
+          fprintf(stderr, "Reallocate _pos\n");
+
+          //  Keep doubling until this record fits; one doubling is not
+          //  enough when a single mer carries a long position list.
+          if (_posMax == 0)
+            _posMax = 1;
+          while (_posMax <= _posLen + cnt)
+            _posMax *= 2;
+
+          uint32 *tmp = new uint32 [_posMax];
+          memcpy(tmp, _pos, sizeof(uint32) * _posLen);
+          delete [] _pos;
+          _pos = tmp;
+        }
+
+        for (uint32 i=0; i<cnt; i++, _posLen++)
+          _pos[_posLen] = pos[i];
+      }
+
+      _mmmLen++;
+
+      R->nextMer();
+    }
+
+    //fprintf(stderr, "read()-- now up to "uint32FMT" mers ("uint32FMT" pos).\n", _mmmLen, _posLen);
+  };
+
+
+  //  Sort our list of mers (no-op if nothing was read since the last sort).
+  void sort(void) {
+    if (_sorted == false) {
+      //fprintf(stderr, "SORT BEG\n");
+      qsort_mt(_mmm, _mmmLen, sizeof(mMer), mMerGreaterThan, 8, 32 * 1024);
+      _sorted = true;
+      //fprintf(stderr, "SORT END\n");
+    }
+  };
+
+
+private:
+  //  Move the records into a fresh array of newMax entries.
+  void growMers(uint32 newMax) {
+    fprintf(stderr, "Reallocate _mmm\n");
+    _mmmMax = newMax;
+    mMer *tmp = new mMer [_mmmMax];
+    memcpy(tmp, _mmm, sizeof(mMer) * _mmmLen);
+    delete [] _mmm;
+    _mmm = tmp;
+  };
+
+  bool     _sorted;   // true when _mmm[0.._mmmLen) is in sort() order
+
+  uint32   _posLen;   // entries used in _pos
+  uint32   _posMax;   // entries allocated in _pos
+  uint32  *_pos;      // positions of all records, addressed via mMer::_off
+
+  uint32   _mmmLen;   // records used in _mmm
+  uint32   _mmmMax;   // records allocated in _mmm
+  mMer    *_mmm;
+};
+
+
+
+
+// Combine two or more meryl databases (args->mergeFiles) into
+// args->outputFile.  All inputs are streamed through one mMerList, which is
+// refilled in chunks and sorted; equal mers coming from different inputs are
+// folded together according to args->personality and written once.
+//
+// Exits the process on invalid arguments or mismatched mer size/compression.
+void
+multipleOperations(merylArgs *args) {
+
+  char  debugstring[256];
+
+  if (args->mergeFilesLen < 2) {
+    fprintf(stderr, "ERROR - must have at least two databases (you gave " uint32FMT ")!\n", args->mergeFilesLen);
+    exit(1);
+  }
+  if (args->outputFile == 0L) {
+    fprintf(stderr, "ERROR - no output file specified.\n");
+    exit(1);
+  }
+  if ((args->personality != PERSONALITY_MERGE) &&
+      (args->personality != PERSONALITY_MIN) &&
+      (args->personality != PERSONALITY_MINEXIST) &&
+      (args->personality != PERSONALITY_MAX) &&
+      (args->personality != PERSONALITY_ADD) &&
+      (args->personality != PERSONALITY_AND) &&
+      (args->personality != PERSONALITY_NAND) &&
+      (args->personality != PERSONALITY_OR) &&
+      (args->personality != PERSONALITY_XOR)) {
+    fprintf(stderr, "ERROR - only personalities merge, min, minexist, max, add, and, nand, or, xor\n");
+    fprintf(stderr, "ERROR - are supported in multipleOperations().\n");
+    fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+    exit(1);
+  }
+
+  merylStreamReader  **R = new merylStreamReader* [args->mergeFilesLen];
+  merylStreamWriter   *W = 0L;
+
+  uint32               maxSize = 512 * 1024;
+
+  mMerList            *M = new mMerList(maxSize + maxSize / 4);
+
+
+  //  Open the input files and load some mers - we need to do this
+  //  just so we can check the mersizes/compression next.
+  //
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    R[i] = new merylStreamReader(args->mergeFiles[i]);
+    M->read(R[i], 1 + i);
+  }
+
+  //  Verify that the mersizes are all the same
+  //
+  bool    fail    = false;
+  uint32  merSize = R[0]->merSize();
+  uint32  merComp = R[0]->merCompression();
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    fail |= (merSize != R[i]->merSize());
+    fail |= (merComp != R[i]->merCompression());
+  }
+
+  if (fail) {
+    fprintf(stderr, "ERROR:  mer sizes (or compression level) differ.\n");
+    exit(1);
+  }
+
+  //  Open the output file, using the largest prefix size found in the
+  //  input/mask files.
+  //
+  uint32  prefixSize = 0;
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    if (prefixSize < R[i]->prefixSize())
+      prefixSize = R[i]->prefixSize();
+
+  W = new merylStreamWriter(args->outputFile, merSize, merComp, prefixSize);
+
+
+  kMer  lastLoaded;
+
+  lastLoaded.setMerSize(merSize);
+  lastLoaded.smallest();
+
+  //  Load mers from all files, remember the largest mer we load.
+  //
+  for (uint32 i=0; i<args->mergeFilesLen; i++) {
+    M->read(R[i], maxSize / args->mergeFilesLen);
+    if (lastLoaded < M->first())
+      lastLoaded = M->first();
+  }
+
+  //  Make sure all files have at least that largest mer loaded.
+  //
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    while (R[i]->validMer() && (R[i]->theFMer() <= lastLoaded))
+      M->read(R[i], 2 * 1024);
+
+  fprintf(stderr, "Initial load:  length=" uint32FMT " lastLoaded=%s\n",
+          M->length(), lastLoaded.merToString(debugstring));
+
+  M->sort();
+
+  bool     allLoaded = false;
+  bool     moreStuff = true;
+
+  kMer     currentMer;                      //  The current mer we're operating on
+  uint32   currentCount = uint32ZERO;       //  The count (operation dependent) of this mer
+  uint32   currentTimes = uint32ZERO;       //  Number of files it's in
+
+  uint32   currentPositionsMax = 0;
+  uint32  *currentPositions    = 0L;
+
+  kMer    *thisMer;                         //  The mer we just read
+  uint32   thisCount     = uint32ZERO;      //  The count of the mer we just read
+  uint32  *thisPositions = 0L;
+
+  speedCounter *C = new speedCounter("    %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, args->beVerbose);
+
+  currentMer.setMerSize(merSize);
+
+  while (moreStuff) {
+
+    //  Load more stuff if needed.  M is sorted, so first() is the
+    //  smallest mer in the set - we're good up to and including
+    //  lastLoaded.
+    //
+    if ((allLoaded == false) &&
+        ((M->length() == 0) || (lastLoaded < M->first()))) {
+
+      //  We need to copy all the mers currently loaded into fresh
+      //  storage, so we can deallocate the position storage.  Yucky.
+      //
+      M->rebuild();
+
+      allLoaded = true;
+
+      //  Load more stuff to give us a large collection of mers
+      //
+      uint32  additionalLoading = 8192;
+
+      if (maxSize / args->mergeFilesLen > M->length())
+        additionalLoading = maxSize / args->mergeFilesLen - M->length();
+
+      lastLoaded.setMerSize(merSize);
+      lastLoaded.smallest();
+
+      for (uint32 i=0; i<args->mergeFilesLen; i++) {
+        if (R[i]->validMer()) {
+          M->read(R[i], additionalLoading);
+          if (lastLoaded < M->first())
+            lastLoaded = M->first();
+          allLoaded = false;
+        }
+      }
+
+      //  Make sure all files have at least that largest mer loaded.
+      //
+      for (uint32 i=0; i<args->mergeFilesLen; i++)
+        while (R[i]->validMer() && (R[i]->theFMer() <= lastLoaded))
+          M->read(R[i], 2 * 1024);
+
+      M->sort();
+    }
+
+
+    //  All done?  Exit (after flushing the mer being accumulated).
+    if (M->length() == 0)
+      moreStuff = false;
+
+    //  pop() returns 0L exactly when the list was empty.
+    thisMer = M->pop(thisCount, thisPositions);
+
+    //  If we've hit a different mer, or ran out of mers, write out the last
+    //  one.  The test is on thisMer, not on the list being empty AFTER the
+    //  pop: the last buffered mer may still equal currentMer and must be
+    //  folded into it rather than trigger an early, partial write.
+    if ((thisMer == 0L) || (*thisMer != currentMer)) {
+
+      //  Nothing has been accumulated before the very first mer.
+      if (currentTimes > 0) {
+        switch (args->personality) {
+          case PERSONALITY_MIN:
+            if (currentTimes == args->mergeFilesLen)
+              W->addMer(currentMer, currentCount);
+            break;
+          case PERSONALITY_MERGE:
+          case PERSONALITY_MINEXIST:
+          case PERSONALITY_MAX:
+          case PERSONALITY_ADD:
+            W->addMer(currentMer, currentCount, currentPositions);
+            break;
+          case PERSONALITY_AND:
+            if (currentTimes == args->mergeFilesLen)
+              W->addMer(currentMer, currentCount);
+            break;
+          case PERSONALITY_NAND:
+            if (currentTimes != args->mergeFilesLen)
+              W->addMer(currentMer, currentCount);
+            break;
+          case PERSONALITY_OR:
+            W->addMer(currentMer, currentCount);
+            break;
+          case PERSONALITY_XOR:
+            if ((currentTimes % 2) == 1)
+              W->addMer(currentMer, currentCount);
+            break;
+          default:
+            fprintf(stderr, "ERROR - invalid personality in multipleOperations::write\n");
+            fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+            exit(1);
+            break;
+        }
+      }
+
+      //  thisMer is 0L on the final pass; there is nothing to copy then.
+      if (thisMer != 0L)
+        currentMer = *thisMer;
+
+      currentCount = uint32ZERO;
+      currentTimes = uint32ZERO;
+
+      C->tick();
+    }
+
+
+    if (moreStuff == false)
+      break;
+
+
+    //  Perform the operation
+    switch (args->personality) {
+      case PERSONALITY_MERGE:
+        if (thisPositions) {
+
+          if (currentPositionsMax == 0) {
+            currentPositionsMax = 1048576;
+            currentPositions    = new uint32 [currentPositionsMax];
+          }
+
+          if (currentPositionsMax < currentCount + thisCount) {
+            while (currentPositionsMax < currentCount + thisCount)
+              currentPositionsMax *= 2;
+
+            uint32 *t = new uint32 [currentPositionsMax];
+            memcpy(t, currentPositions, sizeof(uint32) * currentCount);
+            delete [] currentPositions;
+            currentPositions = t;
+          }
+
+          if (thisCount < 16) {
+            for (uint32 i=0; i<thisCount; i++)
+              currentPositions[currentCount + i] = thisPositions[i];
+          } else {
+            memcpy(currentPositions + currentCount, thisPositions, sizeof(uint32) * thisCount);
+          }
+        }
+        //  Otherwise, we're the same as ADD.
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_MIN:
+      case PERSONALITY_MINEXIST:
+        if (currentTimes == 0) {
+          currentCount = thisCount;
+        } else {
+          if (currentCount > thisCount)
+            currentCount = thisCount;
+        }
+        break;
+      case PERSONALITY_MAX:
+        if (currentCount < thisCount)
+          currentCount = thisCount;
+        break;
+      case PERSONALITY_ADD:
+        currentCount += thisCount;
+        break;
+      case PERSONALITY_AND:
+      case PERSONALITY_NAND:
+      case PERSONALITY_OR:
+      case PERSONALITY_XOR:
+        currentCount = 1;
+        break;
+      default:
+        fprintf(stderr, "ERROR - invalid personality in multipleOperations::operate\n");
+        fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+        exit(1);
+        break;
+    }
+
+    currentTimes++;
+  }
+
+  //  currentPositions used to be leaked, and R (a new[] array) was released
+  //  with scalar delete.
+  delete [] currentPositions;
+
+  for (uint32 i=0; i<args->mergeFilesLen; i++)
+    delete R[i];
+  delete [] R;
+  delete W;
+  delete M;
+  delete C;
+}
diff --git a/meryl/mervin.C b/meryl/mervin.C
new file mode 100644
index 0000000..bd35994
--- /dev/null
+++ b/meryl/mervin.C
@@ -0,0 +1,793 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "sweatShop.H"
+
+#include "libmeryl.H"
+
+#include <algorithm>
+
+using namespace std;
+
+// var, old, new -- returns true if "(var == old) and var <- new"
+//
+// CAS - #elif (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40100
+
+// Number of k-mer words one kmerPile can hold (size of kmerPile::pileDat).
+const uint32 pileMax = 32768;
+
+// k-mer length in bases; each base takes two bits.
+const uint32 kmerSize = 22;
+const uint32 kmerBits = 2 * kmerSize;
+
+// Leading bases (and bits) used to choose a pile; there are 1 << pilePreBits piles.
+const uint32 pilePreSize = 6;
+const uint32 pilePreBits = 2 * pilePreSize;
+
+// Leading bases (and bits) used to choose a sorter; there are 1 << sortPreBits sorters.
+const uint32 sortPreSize = 10;
+const uint32 sortPreBits = 2 * sortPreSize;
+
+
+// A fixed-capacity bucket of k-mer words that all share one pile prefix.
+// Callers are responsible for not adding more than pileMax entries.
+class kmerPile {
+public:
+  kmerPile(uint32 prefix) {
+    initialize(prefix);
+  };
+  ~kmerPile() {
+  };
+
+  //  Empty the pile and label it with a (possibly new) prefix.
+  void initialize(uint32 prefix) {
+    pilePrefix = prefix;
+    pileLen    = 0;
+  };
+
+  //  Append one word; no capacity check is made here.
+  void addMer(uint64 mer) {
+    pileDat[pileLen] = mer;
+    pileLen++;
+  };
+
+  //  Order the stored words ascending.
+  void sort(void) {
+    uint64 *bgn = pileDat;
+    uint64 *end = pileDat + pileLen;
+
+    std::sort(bgn, end);
+  };
+
+  uint32 pileLen;              //  words currently stored
+  uint32 pilePrefix;           //  prefix shared by every stored word
+
+  uint64 pileDat[pileMax];
+};
+
+
+
+
+// Sorted, de-duplicated list of k-mer suffixes with counts, for one sort
+// prefix.  merge() folds a sorted run of suffixes into the list; write()
+// emits every (prefix + suffix, count) pair.
+//
+// sorterLocked is a spin-lock word managed by the callers of merge().
+class kmerSorter {
+public:
+  kmerSorter() {
+    sorterLocked = 0;
+    sorterLen    = 0;
+    sorterMax    = 4;
+    sorterMer    = new uint64 [sorterMax];
+    sorterCnt    = new uint32 [sorterMax];
+  };
+  ~kmerSorter() {
+    delete [] sorterMer;
+    delete [] sorterCnt;
+  };
+
+  //  Merge pileLen ascending suffixes from pileDat into the sorted list.
+  //  Each pile entry counts once; equal suffixes have their counts summed.
+  //  At least one of the list and the pile must be non-empty.
+  void merge(uint64 *pileDat, uint32 pileLen) {
+    uint32   nmax = MAX(16, sorterLen + pileLen / 4);
+    uint64  *nmer = new uint64 [nmax];
+    uint32  *ncnt = new uint32 [nmax];
+    uint32   npos = 0;    //  index of the LAST valid output entry
+
+    assert(nmax > 0);
+
+    uint32   spos = 0;    //  next unread entry of the existing list
+    uint32   ppos = 0;    //  next unread entry of the pile
+
+    //  Seed the output with the smaller of the two heads.
+    bool     useSorterFirst = false;
+
+    if ((sorterLen > 0) && (pileLen > 0)) {
+      useSorterFirst = (sorterMer[0] < pileDat[0]);
+
+    } else if (sorterLen > 0) {
+      useSorterFirst = true;
+
+    } else if (pileLen > 0) {
+      useSorterFirst = false;
+
+    } else {
+      assert(0);
+    }
+
+    if (useSorterFirst) {
+      nmer[0] = sorterMer[spos];
+      ncnt[0] = sorterCnt[spos];
+      spos++;
+    } else {
+      nmer[0] = pileDat[ppos];
+      ncnt[0] = 1;
+      ppos++;
+    }
+
+    //  Both inputs still have data.
+    while ((spos < sorterLen) && (ppos < pileLen)) {
+
+      if (nmax <= npos + 1) {
+        nmax += (pileLen - ppos) + (sorterLen - spos) + 1;
+        grow(nmer, ncnt, npos + 1, nmax);
+      }
+
+      if (nmer[npos] == sorterMer[spos]) {
+        ncnt[npos] += sorterCnt[spos];
+        spos++;
+
+      } else if (nmer[npos] == pileDat[ppos]) {
+        ncnt[npos] += 1;
+        ppos++;
+
+      } else if (sorterMer[spos] < pileDat[ppos]) {
+        npos++;
+        nmer[npos] = sorterMer[spos];
+        ncnt[npos] = sorterCnt[spos];
+        spos++;
+
+      } else {
+        npos++;
+        nmer[npos] = pileDat[ppos];
+        ncnt[npos] = 1;
+        ppos++;
+      }
+    }
+
+    //  One input is exhausted; make room for everything left in the other.
+    uint32 remain = (sorterLen - spos) + (pileLen - ppos);
+
+    if (nmax < npos + 1 + remain) {
+      nmax = npos + 1 + remain;
+      grow(nmer, ncnt, npos + 1, nmax);
+    }
+
+    while (spos < sorterLen) {
+      if (nmer[npos] == sorterMer[spos]) {
+        ncnt[npos] += sorterCnt[spos];
+      } else {
+        npos++;
+        nmer[npos] = sorterMer[spos];
+        ncnt[npos] = sorterCnt[spos];
+      }
+
+      spos++;
+    }
+
+    while (ppos < pileLen) {
+      if (nmer[npos] == pileDat[ppos]) {
+        ncnt[npos] += 1;
+      } else {
+        npos++;
+        nmer[npos] = pileDat[ppos];
+        ncnt[npos] = 1;
+      }
+
+      ppos++;
+    }
+
+    delete [] sorterMer;
+    delete [] sorterCnt;
+
+    sorterMer = nmer;
+    sorterCnt = ncnt;
+    sorterLen = npos + 1;
+    sorterMax = nmax;
+
+#if 1
+    //  Sanity check: the result must be strictly increasing.
+    for (uint32 i=1; i<sorterLen; i++)
+      assert(sorterMer[i-1] < sorterMer[i]);
+#endif
+
+  };
+
+  //  Write every k-mer of this sorter to F as ">count\nsequence\n" and, if W
+  //  is given, to the meryl writer as well.
+  //
+  //  prefix is this sorter's index, i.e. the leading sortPreSize bases; the
+  //  stored words hold the remaining kmerSize - sortPreSize bases.  (The
+  //  previous code decoded with the PILE prefix width, which does not match
+  //  how sorters are indexed and filled.)
+  void write(uint32 prefix, FILE *F, merylStreamWriter *W) {
+    char    km[64] = {0};
+
+    //  Leading bases, taken from the prefix; written once.
+    {
+      uint32  pre = prefix;
+
+      for (uint32 kp=sortPreSize; kp > 0; ) {
+        km[--kp] = bitsToLetter[pre & 0x03];
+        pre >>= 2;
+      }
+    }
+
+    for (uint32 ii=0; ii<sorterLen; ii++) {
+      uint64  mer = sorterMer[ii];
+
+      //  Trailing bases, taken from the stored suffix.
+      for (uint32 kp=kmerSize; kp > sortPreSize; ) {
+        km[--kp] = bitsToLetter[mer & 0x03];
+        mer >>= 2;
+      }
+
+      fprintf(F, ">" uint32FMT "\n%s\n", sorterCnt[ii], km);
+
+      if (W)
+        W->addMer(prefix,        sortPreBits,
+                  sorterMer[ii], kmerBits - sortPreBits,
+                  sorterCnt[ii],
+                  0L);
+    }
+  };
+
+private:
+  //  Move the first 'used' entries of both arrays into new arrays of newMax.
+  static void grow(uint64 *&mer, uint32 *&cnt, uint32 used, uint32 newMax) {
+    uint64 *mermore = new uint64 [newMax];
+    uint32 *cntmore = new uint32 [newMax];
+
+    memcpy(mermore, mer, sizeof(uint64) * used);
+    memcpy(cntmore, cnt, sizeof(uint32) * used);
+
+    delete [] mer;  mer = mermore;
+    delete [] cnt;  cnt = cntmore;
+  };
+
+public:
+  volatile uint32  sorterLocked;
+  uint32           sorterLen;
+  uint32           sorterMax;
+
+  uint64          *sorterMer;
+  uint32          *sorterCnt;
+};
+
+
+
+
+// Shared state of the mervin pipeline: the mapped input, the k-mer being
+// built, one open pile per pile prefix, a pool of spare piles, the queue of
+// full piles waiting to be sorted, and one kmerSorter per sort prefix.
+//
+// The spare-pile pool (pilesFree*) is guarded by the pilesFreeLock spin lock.
+class kmerGlobal {
+public:
+  kmerGlobal() {
+    inName   = NULL;
+    inFile   = NULL;
+    inBuffer = NULL;
+
+    inputBufferMax = 0;
+    inputBufferLen = 0;
+    inputBufferPos = 0;
+    inputBuffer    = NULL;
+
+    outPrefix = NULL;
+    outFile   = NULL;
+
+    fkPre = 0;
+    fkMer = 0;
+
+    rkPre = 0;
+    rkMer = 0;
+
+    kLen = 0;
+
+    naptime.tv_sec  = 0;
+    naptime.tv_nsec = 250000ULL;
+
+    pilesFreeLock = 0;
+    pilesFreeLen  = 2048;
+    pilesFreeMax  = 2 << pilePreBits;
+    pilesFree     = new kmerPile * [pilesFreeMax];
+
+    memset(pilesFree, 0, sizeof(kmerPile *) * pilesFreeMax);
+
+    piles   = new kmerPile * [1 << pilePreBits];
+    sorters = new kmerSorter [1 << sortPreBits];
+
+    memset(piles, 0, sizeof(kmerPile *) * (1 << pilePreBits));
+
+    for (uint32 i=0; i<pilesFreeLen; i++)
+      pilesFree[i] = new kmerPile(0);
+
+    for (uint32 i=0; i< (1 << pilePreBits); i++)
+      piles[i] = new kmerPile(i);
+
+    pilesToSortLen = 0;
+    pilesToSortMax = 2 * (1 << pilePreBits);
+    pilesToSort    = new kmerPile * [pilesToSortMax];
+  };
+  ~kmerGlobal() {
+    //  Release the piles we still own, not just the arrays pointing at them.
+    for (uint32 i=0; i< (1 << pilePreBits); i++)
+      delete piles[i];
+    for (uint32 i=0; i<pilesFreeLen; i++)
+      delete pilesFree[i];
+
+    delete [] piles;
+    delete [] pilesFree;
+    delete [] sorters;
+    delete [] pilesToSort;
+    //  inputBuffer comes from mapFile() and is not released here.
+  };
+
+
+  //  Map the input file (inName must be set) and set the spin-lock nap time.
+  void initialize(void) {
+    inputBufferMax = 0;
+    inputBufferLen = 0;
+    inputBufferPos = 0;
+    inputBuffer    = (char *)mapFile(inName, &inputBufferLen, 'r');
+
+    naptime.tv_sec  = 0;
+    naptime.tv_nsec = 250000ULL;
+  };
+
+  //  Take a pile from the spare pool (or allocate one if the pool is empty)
+  //  and reset it for the given prefix.
+  kmerPile *getFreePile(uint32 prefix) {
+    kmerPile *pp = NULL;
+
+    lockFreePiles();
+
+    if (pilesFreeLen > 0)
+      pp = pilesFree[--pilesFreeLen];
+
+    unlockFreePiles();
+
+    //  Allocate outside the lock.
+    if (pp == NULL)
+      pp = new kmerPile(prefix);
+
+    pp->initialize(prefix);
+
+    return(pp);
+  };
+
+  //  Return a pile to the spare pool, or delete it if the pool is full.
+  //  The fullness test is made while holding the lock; it used to be made
+  //  before taking it, racing with getFreePile() and other releasers.
+  void releasePile(kmerPile *pile) {
+    bool  pooled = false;
+
+    lockFreePiles();
+
+    if (pilesFreeLen < pilesFreeMax) {
+      pilesFree[pilesFreeLen++] = pile;
+      pooled = true;
+    }
+
+    unlockFreePiles();
+
+    if (pooled == false)
+      delete pile;
+  };
+
+
+  //  Add one word to the pile for prefix pre.  A full pile is queued for
+  //  sorting and replaced by a spare one.
+  void addToPile(uint64 pre, uint64 mer) {
+
+    assert(piles[pre] != NULL);
+
+    if (piles[pre]->pileLen < pileMax) {
+      piles[pre]->addMer(mer);
+      return;
+    }
+
+    if (pilesToSortMax <= pilesToSortLen) {
+      fprintf(stderr, "realloc\n");
+      exit(1);
+    }
+
+    pilesToSort[pilesToSortLen++] = piles[pre];
+
+    piles[pre] = getFreePile(pre);
+    piles[pre]->addMer(mer);
+  };
+
+
+  //  Next pile waiting to be sorted, or NULL if none is queued.
+  kmerPile *getFullPile(void) {
+    if (pilesToSortLen == 0)
+      return(NULL);
+
+    return(pilesToSort[--pilesToSortLen]);
+  };
+
+
+  //  Queue every non-empty open pile for sorting, delete the empty ones,
+  //  and return the first queued pile (NULL if there is none).  After this
+  //  call no pile is open, so it is safe to call again.
+  kmerPile *allDataLoaded(void) {
+
+    for (uint32 pp=0; pp < (1 << pilePreBits); pp++) {
+      if ((piles[pp] != NULL) &&
+          (piles[pp]->pileLen > 0)) {
+        pilesToSort[pilesToSortLen++] = piles[pp];
+      } else {
+        delete piles[pp];
+      }
+
+      piles[pp] = NULL;
+    }
+
+    fprintf(stderr, "allDataLoaded()-- pilesToSortLen = " uint32FMT "\n", pilesToSortLen);
+
+    return(getFullPile());
+  };
+
+
+  //  Feed len input bytes starting at offset bgn of the mapped input through
+  //  addBaseToKmer().  Offsets are 64-bit so inputs beyond 4 GiB are not
+  //  truncated.
+  void addBases(uint64 bgn, uint64 len) {
+    for (uint64 pos=0; pos<len; pos++)
+      addBaseToKmer(inputBuffer[bgn + pos]);
+  }
+
+  //  Shift one input character into the forward and reverse k-mers.  Returns
+  //  true if a full k-mer was available and both strands were added to their
+  //  piles; false if the character is not a base (the k-mer restarts) or
+  //  fewer than kmerSize bases have been seen.
+  bool addBaseToKmer(char base) {
+    //  Index as unsigned: a plain char may be negative for bytes >= 0x80.
+    uint64  bt = letterToBits[(unsigned char)base];
+
+    if (bt > 4) {
+      kLen = 0;
+      return(false);
+    }
+
+    uint64  tm = 0;
+
+    //  Forward: the top base of the suffix moves into the prefix.
+    tm   = fkMer >> (kmerBits - pilePreBits - 2);
+    tm  &= 0x00000003;
+
+    fkPre <<= 2;
+    fkPre  |= tm;
+
+    fkMer <<= 2;
+    fkMer  |= bt;
+
+    //  Reverse: the bottom base of the prefix... er, suffix moves the other way.
+    tm   = rkMer & 0x00000003;
+
+    rkPre >>= 2;
+    rkPre  |= tm << (pilePreBits - 2);
+
+    rkMer >>= 2;
+    rkMer  |= bt << (kmerBits - pilePreBits - 2);
+
+    kLen++;
+
+    if (kLen < kmerSize)
+      return(false);
+
+    kLen = kmerSize;
+
+    fkPre &= uint64MASK(pilePreBits);
+    fkMer &= uint64MASK(kmerBits - pilePreBits);
+
+    rkPre &= uint64MASK(pilePreBits);
+    rkMer &= uint64MASK(kmerBits - pilePreBits);
+
+    addToPile(fkPre, fkMer);
+    addToPile(rkPre, rkMer);
+
+    return(true);
+  };
+
+
+  //  Write all sorters, in prefix order, to "<outPrefix>.fasta".
+  void write(void) {
+    char   outName[FILENAME_MAX];
+
+    snprintf(outName, FILENAME_MAX, "%s.fasta", outPrefix);
+
+    errno = 0;
+    FILE *F = fopen(outName, "w");
+    if (F == NULL) {
+      fprintf(stderr, "Failed to open '%s' for writing: %s\n", outName, strerror(errno));
+      exit(1);
+    }
+
+    //merylStreamWriter *W = new merylStreamWriter(outPrefix, kmerSize, 0, sortPreBits, false);
+
+    for (uint32 ss=0; ss < (1 << sortPreBits); ss++)
+      sorters[ss].write(ss, F, NULL);
+
+    fclose(F);
+    //delete W;
+  }
+
+private:
+  //  Spin (napping between tries) until we own the spare-pile pool.
+  void lockFreePiles(void) {
+    while (__sync_bool_compare_and_swap(&pilesFreeLock, 0, 1) == false)
+      nanosleep(&naptime, 0L);
+  };
+
+  //  Release the pool with release semantics so pool updates are visible
+  //  before the lock word is cleared.
+  void unlockFreePiles(void) {
+    __sync_lock_release(&pilesFreeLock);
+  };
+
+public:
+  char         *inName;
+  FILE         *inFile;
+
+  readBuffer   *inBuffer;
+
+  uint64        inputBufferMax;
+  uint64        inputBufferLen;
+  uint64        inputBufferPos;
+  char         *inputBuffer;
+
+  char         *outPrefix;
+  FILE         *outFile;
+
+  uint64        fkPre;  //  Forward loaded kmer
+  uint64        fkMer;
+
+  uint64        rkPre;  //  Reverse loaded kmer
+  uint64        rkMer;
+
+  uint32        kLen;
+
+  uint32        pilesFreeLock;
+  uint32        pilesFreeLen;
+  uint32        pilesFreeMax;
+  kmerPile    **pilesFree;
+
+  kmerPile    **piles;
+  kmerSorter   *sorters;
+
+  struct timespec naptime;
+
+  uint32        pilesToSortLen;
+  uint32        pilesToSortMax;
+  kmerPile    **pilesToSort;
+};
+
+
+
+
+
+// Progress bookkeeping for the loader; only 'bytes' is updated by the
+// active code (the counters are referenced from commented-out lines).
+uint64 bytesLoaded = 0;
+uint64 basesLoaded = 0;
+speedCounter bytes(" bytes %8.0f Mbytes (%8.5f Mbytes/sec\r", 1048576, 1048576, true);
+
+//  Loader: reads input, constructs kmers, adds kmers to piles of kmers.
+//
+//  Returns the next full pile to be sorted.  Input is consumed from the
+//  mapped file in chunks of at most 16 MiB until a pile fills up.  Once the
+//  input is exhausted the partly filled piles are flushed through
+//  allDataLoaded() -- previously they were silently dropped -- and NULL is
+//  returned after the last of them has been handed out.
+void*
+sifterThread(void *global) {
+  kmerGlobal  *glob = (kmerGlobal *)global;
+
+  for (;;) {
+    kmerPile *pile = glob->getFullPile();
+
+    if (pile)
+      return(pile);
+
+    uint64 len = glob->inputBufferLen - glob->inputBufferPos;
+
+    //  End of input: queue whatever is left in the open piles.  On later
+    //  calls no pile is open any more and this returns NULL.
+    if (len == 0)
+      return(glob->allDataLoaded());
+
+    if (len > 16 * 1048576)
+      len = 16 * 1048576;
+
+    //fprintf(stderr, "Add "uint64FMT" bases.\n", len);
+
+    glob->addBases(glob->inputBufferPos, len);
+    bytes.tick(len);
+
+    glob->inputBufferPos += len;
+  }
+}
+
+
+
+//  Worker: takes a pile of kmers, sorts it, and then merges it into the
+//  appropriate kmerSorter objects.
+//
+//  A sorted pile is a sequence of runs, each run sharing the bits that extend
+//  the pile prefix to a sort prefix.  Every run is reduced to its suffix bits
+//  and merged, under that sorter's spin lock, into sorters[sort prefix].
+void
+sorterThread(void *global, void *thread, void *thing) {
+  kmerGlobal  *glob = (kmerGlobal *)global;
+  kmerPile    *pile = (kmerPile   *)thing;
+
+  struct timespec naptime;
+  naptime.tv_sec  = 0;
+  naptime.tv_nsec = 250000ULL;
+
+  if (pile->pileLen == 0)
+    //  Nothing to add.
+    return;
+
+  pile->sort();
+
+  uint32  pileMaskShift = sortPreBits - pilePreBits;
+  uint32  pileDataShift = kmerBits    - sortPreBits;
+
+  uint64  pileToSortPreMask = uint64MASK(sortPreBits - pilePreBits) << (kmerBits - sortPreBits);
+  uint64  pileToSortMask    = uint64MASK(kmerBits - sortPreBits);
+
+  uint32  pileBgn = 0;
+
+  //  pileEnd must start AT the first element, not after it: the inner loop
+  //  is what strips the prefix bits, and starting at 1 left the first word
+  //  of the first run unmasked (and therefore out of order in merge()).
+  uint32  pileEnd = 0;
+
+  while (pileBgn < pile->pileLen) {
+    //  Both values are taken before the run's first word is masked below.
+    uint32  sortPre    = (pile->pilePrefix << pileMaskShift) | (pile->pileDat[pileBgn] >> pileDataShift);
+    uint64  pileToSort = pile->pileDat[pileBgn] & pileToSortPreMask;
+
+    while ((pileEnd < pile->pileLen) &&
+           ((pile->pileDat[pileEnd] & pileToSortPreMask) == pileToSort)) {
+      pile->pileDat[pileEnd] &= pileToSortMask;
+      pileEnd++;
+    }
+
+    while (__sync_bool_compare_and_swap(&glob->sorters[sortPre].sorterLocked, 0, 1) == false)
+      nanosleep(&naptime, 0L);
+
+    assert(glob->sorters[sortPre].sorterLocked == 1);
+
+    glob->sorters[sortPre].merge(pile->pileDat + pileBgn, pileEnd - pileBgn);
+
+    //  Release with a barrier so the merged data is visible before the lock
+    //  word is cleared.
+    __sync_lock_release(&glob->sorters[sortPre].sorterLocked);
+
+    pileBgn = pileEnd;
+  }
+}
+
+
+
+// Writer stage: there is no per-pile output (everything is written at the
+// end), so the finished pile is simply handed back to the global state.
+void
+nullThread(void *global, void *thing) {
+  kmerPile *finished = static_cast<kmerPile *>(thing);
+
+  static_cast<kmerGlobal *>(global)->releasePile(finished);
+}
+
+
+
+// mervin entry point: parse -i/-o, run the load/sort pipeline over the
+// input, then write the counted k-mers to "<prefix>.fasta".
+int
+main(int argc, char **argv) {
+  kmerGlobal  *kg = new kmerGlobal;
+
+  int arg=1;
+  int err=0;
+
+  while (arg < argc) {
+    //  An option is only accepted if its value is actually present.
+    if        ((strcmp(argv[arg], "-i") == 0) && (arg + 1 < argc)) {
+      kg->inName = argv[++arg];
+
+    } else if ((strcmp(argv[arg], "-o") == 0) && (arg + 1 < argc)) {
+      kg->outPrefix = argv[++arg];
+
+    } else {
+      err++;
+    }
+
+    arg++;
+  }
+  if (kg->inName == NULL)
+    err++;
+  if (kg->outPrefix == NULL)
+    err++;
+  if (err) {
+    //  The output option is -o (the message used to say -i twice).
+    fprintf(stderr, "usage: %s -i in.sequence -o prefix\n", argv[0]);
+    exit(1);
+  }
+
+  kg->initialize();
+
+  sweatShop *ss = new sweatShop(sifterThread,
+                                sorterThread,
+                                nullThread);
+
+  ss->setLoaderBatchSize(512);
+
+  ss->setNumberOfWorkers(1);
+  ss->setWriterQueueSize(16384);
+
+  //for (uint32 i=0; i<config._numSearchThreads; i++)
+  //  ss->setThreadData(i, new searcherState(i));
+
+  ss->run(kg, true);
+
+  delete ss;
+
+  kg->write();
+
+  delete kg;
+
+  exit(0);
+}
diff --git a/meryl/meryl.C b/meryl/meryl.C
new file mode 100644
index 0000000..a2a5f98
--- /dev/null
+++ b/meryl/meryl.C
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "meryl.H"
+
+// meryl entry point: build a merylArgs from the command line and run the
+// routine selected by args->personality.  An unrecognized personality prints
+// the usage and exits with status 1.
+int
+main(int argc, char **argv) {
+  merylArgs *args = new merylArgs(argc, argv);
+
+  switch (args->personality) {
+
+    //  Estimating and building.
+    case 'P':  estimate(args);                 break;
+    case 'B':  build(args);                    break;
+
+    //  Reports on an existing database.
+    case 'd':  dumpDistanceBetweenMers(args);  break;
+    case 't':  dumpThreshold(args);            break;
+    case 'p':  dumpPositions(args);            break;
+    case 'c':  countUnique(args);              break;
+    case 'h':  plotHistogram(args);            break;
+
+    //  Operations over any number of databases.
+    case PERSONALITY_MIN:
+    case PERSONALITY_MINEXIST:
+    case PERSONALITY_MAX:
+    case PERSONALITY_MAXEXIST:
+    case PERSONALITY_ADD:
+    case PERSONALITY_AND:
+    case PERSONALITY_NAND:
+    case PERSONALITY_OR:
+    case PERSONALITY_XOR:
+      multipleOperations(args);
+      break;
+
+    //  Operations over two databases.
+    case PERSONALITY_SUB:
+    case PERSONALITY_ABS:
+    case PERSONALITY_DIVIDE:
+      binaryOperations(args);
+      break;
+
+    //  Operations over one database.
+    case PERSONALITY_LEQ:
+    case PERSONALITY_GEQ:
+    case PERSONALITY_EQ:
+      unaryOperations(args);
+      break;
+
+    default:
+      args->usage();
+      fprintf(stderr, "%s: unknown personality.  Specify -P, -B, -S or -M!\n", args->execName);
+      exit(1);
+      break;
+  }
+
+  delete args;
+
+  return(0);
+}
diff --git a/meryl/meryl.H b/meryl/meryl.H
new file mode 100644
index 0000000..7527df1
--- /dev/null
+++ b/meryl/meryl.H
@@ -0,0 +1,128 @@
+#ifndef MERYL_H
+#define MERYL_H
+
+#include "bio++.H"
+
+// Values of merylArgs::personality that select a database operation.
+// meryl.C's main() also accepts plain character codes ('P', 'B', 'd', ...)
+// in the same field.
+
+// Accepted by multipleOperations(); not dispatched from meryl.C's main().
+#define PERSONALITY_MERGE 0xff
+
+// Arithmetic on counts.  main() sends MIN, MINEXIST, MAX, MAXEXIST and ADD
+// to multipleOperations(), and SUB, DIVIDE and ABS to binaryOperations().
+#define PERSONALITY_MIN 0x01
+#define PERSONALITY_MINEXIST 0x02
+#define PERSONALITY_MAX 0x03
+#define PERSONALITY_MAXEXIST 0x04
+#define PERSONALITY_ADD 0x05
+#define PERSONALITY_SUB 0x06
+#define PERSONALITY_DIVIDE 0x07
+#define PERSONALITY_ABS 0x08
+
+// main() sends AND, NAND, OR and XOR to multipleOperations(), and LEQ, GEQ
+// and EQ to unaryOperations().
+#define PERSONALITY_AND 0x10
+#define PERSONALITY_NAND 0x11
+#define PERSONALITY_OR 0x12
+#define PERSONALITY_XOR 0x13
+#define PERSONALITY_LEQ 0x14
+#define PERSONALITY_GEQ 0x15
+#define PERSONALITY_EQ 0x16
+
+// Command-line configuration shared by all meryl operations.  Only the
+// declaration is visible here; member meanings below are stated where the
+// surrounding code shows them and otherwise left to the implementation.
+class merylArgs {
+public:
+ merylArgs(int argc, char **argv);
+ merylArgs(const char *prefix);
+ ~merylArgs();
+
+ void usage(void);
+ void clear(void);
+
+ // Bucket for a mer: mer.startOfMer(numBuckets_log2).
+ uint64 hash(kMer const &mer) {
+ return(mer.startOfMer(numBuckets_log2));
+ };
+
+ bool writeConfig(void);
+ bool readConfig(const char *prefix);
+public:
+ char *execName; // program name, used in error messages
+ char *options;
+
+ bool beVerbose; // passed to speedCounter by multipleOperations()
+ bool doForward;
+ bool doReverse;
+ bool doCanonical;
+
+ char *inputFile;
+ char *outputFile; // output database; required by multipleOperations()
+ char *queryFile;
+
+ uint32 merSize;
+ uint32 merComp;
+ bool positionsEnabled;
+
+ uint64 numMersEstimated;
+ uint64 numMersActual;
+
+ uint64 numBasesActual;
+
+ uint64 mersPerBatch;
+ uint64 basesPerBatch;
+
+ uint64 numBuckets;
+ uint32 numBuckets_log2;
+ uint32 merDataWidth;
+ uint64 merDataMask;
+ uint32 bucketPointerWidth;
+
+ uint32 numThreads;
+ uint64 memoryLimit;
+ uint64 segmentLimit;
+ bool configBatch;
+ bool countBatch;
+ bool mergeBatch;
+ uint32 batchNumber;
+
+ char *sgeJobName;
+ char *sgeBuildOpt;
+ char *sgeMergeOpt;
+ bool isOnGrid;
+
+ uint32 lowCount;
+ uint32 highCount;
+ uint32 desiredCount;
+
+ bool outputCount;
+ bool outputAll;
+ bool outputPosition;
+
+ bool includeDefLine;
+ bool includeMer;
+
+ // Input databases for multipleOperations(): mergeFilesLen names in mergeFiles.
+ uint32 mergeFilesMax;
+ uint32 mergeFilesLen;
+ char **mergeFiles;
+
+ // Selected operation: a PERSONALITY_* value or a character code (see meryl.C main()).
+ uint32 personality;
+};
+
+
+uint64
+estimateNumMersInMemorySize(uint32 merSize,
+ uint32 mem,
+ bool positionsEnabled,
+ bool beVerbose);
+
+uint32
+optimalNumberOfBuckets(uint32 merSize,
+ uint64 numMers,
+ bool positionsEnabled);
+
+// Entry points for the individual personalities; meryl.C's main() selects
+// one of them from args->personality.
+void estimate(merylArgs *args);
+void build(merylArgs *args);
+
+// Operations over several, two, or one database respectively.
+void multipleOperations(merylArgs *args);
+void binaryOperations(merylArgs *args);
+void unaryOperations(merylArgs *args);
+
+// Reports.  NOTE(review): dump() is declared here but not dispatched from meryl.C's main().
+void dump(merylArgs *args);
+void dumpThreshold(merylArgs *args);
+void dumpPositions(merylArgs *args);
+void countUnique(merylArgs *args);
+void dumpDistanceBetweenMers(merylArgs *args);
+void plotHistogram(merylArgs *args);
+
+#endif // MERYL_H
diff --git a/meryl/simple.C b/meryl/simple.C
new file mode 100644
index 0000000..365d74d
--- /dev/null
+++ b/meryl/simple.C
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "bio++.H"
+#include "meryl.H"
+
+#include "libmeryl.H"
+#include "seqStream.H"
+#include "merStream.H"
+
+using namespace std;
+
+// A very simple mer counter.  Allocates a gigantic array of 64-bit words
+// (one per mer), populates the array with mers, sorts, writes output.
+
+// Options:
+//   -i file   input sequence file (required)
+//   -o name   output name handed to merylStreamWriter (required)
+//   -m size   mer size (default 22)
+//   -c comp   mer compression (default 1)
+//   -f|-r|-C  store forward, reverse or canonical mers (default forward)
+//
+// Exits with status 1 on a bad command line or when the input holds more
+// mers than were allocated for; exits with status 0 otherwise.
+//
+int
+main(int argc, char **argv) {
+  char    *inName         = 0L;
+  char    *otName         = 0L;
+  uint32   merSize        = 22;
+  uint32   merCompression = 1;
+
+  bool     doForward      = true;
+  bool     doReverse      = false;
+  bool     doCanonical    = false;
+
+  speedCounter        *C = 0L;
+  merStream           *M = 0L;
+  merylStreamWriter   *W = 0L;
+
+  uint64   numMers        = 0;
+
+  uint64  *theMers        = 0L;
+  uint64   theMersMax     = 0;
+  uint64   theMersLen     = 0;
+
+  int arg = 1;
+  int err = 0;
+  while (arg < argc) {
+    //  These options consume the argument that follows them.
+    bool  needsValue = ((strcmp(argv[arg], "-i") == 0) ||
+                        (strcmp(argv[arg], "-o") == 0) ||
+                        (strcmp(argv[arg], "-m") == 0) ||
+                        (strcmp(argv[arg], "-c") == 0));
+
+    if (needsValue && (arg + 1 >= argc)) {
+      //  Otherwise atoi() below would be handed the null argv[argc].
+      fprintf(stderr, "option '%s' requires a value\n", argv[arg]);
+      err++;
+
+    } else if (strcmp(argv[arg], "-i") == 0) {
+      inName = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-o") == 0) {
+      otName = argv[++arg];
+
+    } else if (strcmp(argv[arg], "-m") == 0) {
+      merSize = atoi(argv[++arg]);
+
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      doForward   = true;
+      doReverse   = false;
+      doCanonical = false;
+
+    } else if (strcmp(argv[arg], "-r") == 0) {
+      doForward   = false;
+      doReverse   = true;
+      doCanonical = false;
+
+    } else if (strcmp(argv[arg], "-C") == 0) {
+      doForward   = false;
+      doReverse   = false;
+      doCanonical = true;
+
+    } else if (strcmp(argv[arg], "-c") == 0) {
+      merCompression = atoi(argv[++arg]);
+
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+      err++;
+    }
+
+    arg++;
+  }
+  if (inName == 0L) {
+    fprintf(stderr, "no input given with '-i'\n");
+    err++;
+  }
+  if (otName == 0L) {
+    fprintf(stderr, "no output given with '-o'\n");
+    err++;
+  }
+  if (err)
+    exit(1);
+
+
+  //  First pass over the input: only to size the array.
+  {
+    M = new merStream(new kMerBuilder(merSize, merCompression),
+                      new seqStream(inName),
+                      true, true);
+    numMers = M->approximateNumberOfMers();
+    delete M;
+  }
+
+  fprintf(stderr, "Guessing "uint64FMT" mers in input '%s'\n", numMers, inName);
+  fprintf(stderr, "Allocating "uint64FMT"MB for mer storage.\n", numMers * 8 >> 20);
+
+  theMers    = new uint64 [numMers];
+  theMersLen = 0;
+  theMersMax = numMers;
+
+  ////////////////////////////////////////
+
+  C = new speedCounter(" Counting mers in buckets: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, 1);
+  M = new merStream(new kMerBuilder(merSize, merCompression),
+                    new seqStream(inName),
+                    true, true);
+  //M->setRange(args->mersPerBatch * segment, args->mersPerBatch * segment + args->mersPerBatch);
+
+  while (M->nextMer()) {
+    //  numMers came from approximateNumberOfMers(); nothing here guarantees
+    //  it is an upper bound, so check for room BEFORE storing.  The option
+    //  parsing above leaves exactly one of the three modes set, so each mer
+    //  stores at most one word.
+    if (theMersLen >= theMersMax)
+      fprintf(stderr, "ERROR: too many mers in input!\n"), exit(1);
+
+    if (doForward)
+      theMers[theMersLen++] = M->theFMer();
+
+    if (doReverse)
+      theMers[theMersLen++] = M->theRMer();
+
+    if (doCanonical)
+      theMers[theMersLen++] = (M->theFMer() <= M->theRMer()) ? M->theFMer() : M->theRMer();
+
+    C->tick();
+  }
+
+  delete C;
+  delete M;
+
+  fprintf(stderr, "Found "uint64FMT" mers in input '%s'\n", theMersLen, inName);
+
+  ////////////////////////////////////////
+
+  fprintf(stderr, "sorting\n");
+
+  sort(theMers, theMers + theMersLen);
+
+  ////////////////////////////////////////
+
+  C = new speedCounter(" Writing output: %7.2f Mmers -- %5.2f Mmers/second\r", 1000000.0, 0x1fffff, 1);
+  W = new merylStreamWriter(otName,
+                            merSize, merCompression,
+                            16,
+                            false);
+
+  kMer mer(merSize);
+
+  //  Every array entry is written with a count of one; only word 0 of the
+  //  kMer is ever set here.
+  for (uint64 i=0; i<theMersLen; i++) {
+    mer.setWord(0, theMers[i]);
+    W->addMer(mer, 1, 0L);
+    C->tick();
+  }
+
+  delete C;
+  delete W;
+
+  ////////////////////////////////////////
+
+  delete [] theMers;
+
+  exit(0);
+}
diff --git a/meryl/test/Makefile b/meryl/test/Makefile
new file mode 100644
index 0000000..567f874
--- /dev/null
+++ b/meryl/test/Makefile
@@ -0,0 +1,37 @@
+PROG = stupidcount exhaustive
+INCLUDE = -I.. -I../../libutil -I../../libbio -I../../libmeryl
+LIBS = -L.. -L../../libutil -L../../libbio -L../../libmeryl -lmeryl -lbio -lutil -lm
+
+MERSIZE = 26
+
+include ../../Make.compilers
+
+all: $(PROG) test-reduce
+
+stupidcount: stupidcount.C
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o stupidcount.o stupidcount.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o stupidcount stupidcount.o $(LIBS)
+
+exhaustive: exhaustive.C kmerlite.H
+ $(CXX) $(CXXFLAGS_COMPILE) -c -o exhaustive.o exhaustive.C $(INCLUDE)
+ $(CXX) $(CXXLDFLAGS) -o exhaustive exhaustive.o $(LIBS)
+
+test-exhaustive: exhaustive ../meryl ../../leaff/leaff
+ ../../leaff/leaff -G 1000 10000 40000 > g.fasta
+ ../meryl -B -s g.fasta -o s -m $(MERSIZE) -threads 7
+ ./exhaustive -m s -f g.fasta
+
+test-reduce: ../meryl
+ ../meryl -B -f -m 20 -s test-seq1.fasta -o 1 # Build the initial table
+ ../meryl -Dt -n 0 -s 1 > 2.reduce.fasta # Dump the initial table as fasta
+ ../meryl -B -f -m 20 -s 2.reduce.fasta -o 2 # Build a new table on the dumped fasta
+ ../meryl -M sub -s 1 -s 2 -o 3 # Remove one copy of each mer
+ ../meryl -Dt -n 1 -s 3 # Dump the resulting file
+ echo 1 10 9 1 is correct
+ touch test-reduce
+
+test:
+ ../meryl -B -s test-seq1.fasta -o t -m 20
+
+clean:
+ rm -f $(PROG) *.o *.mc??? test-reduce *.seqStore* g.fasta 2.reduce.fasta *.fastaidx
diff --git a/meryl/test/exhaustive.C b/meryl/test/exhaustive.C
new file mode 100644
index 0000000..1ca0b34
--- /dev/null
+++ b/meryl/test/exhaustive.C
@@ -0,0 +1,171 @@
+#include <stdint.h>
+
+#include "bio++.H"
+#include "libmeryl.H"
+#include "kmerlite.H"
+
+// This tests that all the mers in an input fasta file are counted
+// properly. It does not test that the meryl output contains exactly
+// those mers, just that those mers are there.
+//
+// If you can fit into one batch, then it _will_ verify that the
+// meryl output is exactly correct.
+//
+// Reads a meryl-format kmer count in chunks. Each chunk is stored
+// in a searchable structure (we should be using, say, an extended
+// existDB, but we're using a balanced binary tree). The entire
+// source fasta file is then streamed against the kmer chunk,
+// decrementing the count for each mer. When the whole file is
+// streamed, any kmers with positive count are reported.
+
+
+// NB: My hacked kazlib returns a pointer to whatever we give it.
+// Since we gave it a pointer to an object, it gives us back a
+// pointer to "a pointer to an object". Hence, this ugliness.
+//
+int
+kMerLiteSort(void const *a, void const *b) {
+  //  Each argument is the address of a kMerLite pointer, so two levels of
+  //  indirection are needed to reach the mers being compared.
+  kMerLite const &lhs = **((kMerLite * const *)a);
+  kMerLite const &rhs = **((kMerLite * const *)b);
+
+  //  Three-way result: -1 when lhs sorts first, 1 when rhs does, else 0.
+  if (lhs < rhs)
+    return(-1);
+
+  return((lhs > rhs) ? 1 : 0);
+}
+
+
+
+// Verify a meryl count against the fasta file it was built from.
+//
+//   -m prefix   meryl database to check (required)
+//   -f file     fasta file the database was built from (required)
+//
+// The database is read in batches.  Each batch is loaded into a dictionary
+// keyed by mer with the count as the value, the whole fasta file is streamed
+// against it decrementing counts, and any mer left with a non-zero count is
+// reported on stderr.
+//
+int
+main(int argc, char **argv) {
+  char   *merylCount = 0L;
+  char   *fastaName  = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-m") == 0) {
+      merylCount = argv[++arg];
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      fastaName = argv[++arg];
+    } else {
+      fprintf(stderr, "unknown option '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if ((merylCount == 0L) || (fastaName == 0L)) {
+    fprintf(stderr, "usage: %s -m <meryl-name-prefix> -f <fasta-file>\n", argv[0]);
+    exit(1);
+  }
+
+
+  //  Open the count files
+  //
+  merylStreamReader  *MSR = new merylStreamReader(merylCount);
+
+  fprintf(stderr, "Mers are "uint32FMT" bases.\n", MSR->merSize());
+  fprintf(stderr, "There are "uint64FMT" unique (copy = 1) mers.\n", MSR->numberOfUniqueMers());
+  fprintf(stderr, "There are "uint64FMT" distinct mers.\n", MSR->numberOfDistinctMers());
+  fprintf(stderr, "There are "uint64FMT" mers total.\n", MSR->numberOfTotalMers());
+
+  //  Guess how many mers (plus their tree nodes) fit into memoryLimit
+  //  (700MB), then report how many chunks we need to do.
+
+  uint32  merSize      = MSR->merSize();
+  uint64  memoryLimit  = 700 * 1024 * 1024;
+  uint64  perMer       = sizeof(kMerLite) + sizeof(dnode_t);
+  uint64  mersPerBatch = memoryLimit / perMer;
+  uint32  numBatches   = MSR->numberOfDistinctMers() / mersPerBatch;
+  uint32  batch        = 0;
+
+  dnode_t   *nodes = new dnode_t  [mersPerBatch];
+  kMerLite  *mers  = new kMerLite [mersPerBatch];
+
+  if (MSR->numberOfDistinctMers() % mersPerBatch)
+    numBatches++;
+
+  fprintf(stderr, "perMer: "uint64FMT" bytes ("uint64FMT" for kMerLite, "uint64FMT" for dnode_t.\n",
+          perMer, (uint64)sizeof(kMerLite), (uint64)sizeof(dnode_t));
+  fprintf(stderr, "We can fit "uint64FMT" mers into "uint64FMT"MB.\n", mersPerBatch, memoryLimit >> 20);
+  fprintf(stderr, "So we need "uint32FMT" batches to verify the count.\n", numBatches);
+
+  while (MSR->validMer()) {
+    uint64    mersRemain = mersPerBatch;
+    dict_t   *merDict    = dict_create(mersPerBatch, kMerLiteSort);
+
+    batch++;
+
+    //  STEP 1:  Insert mersPerBatch into the merDict
+    //
+    //  Capacity is tested BEFORE advancing the reader.  With the operands
+    //  the other way around, the reader steps onto a mer that is never
+    //  inserted once the batch is full, and the next batch steps past it.
+    //
+    fprintf(stderr, "STEP 1 BATCH "uint32FMTW(2)": Insert into merDict\n", batch);
+    while (mersRemain && MSR->nextMer()) {
+      mersRemain--;
+
+      mers[mersRemain] = MSR->theFMer();
+
+      //  initialize the node with the value, then insert the node
+      //  into the tree using the key.  The count rides in the node's
+      //  void* payload, so it is widened through intptr_t.
+
+      int32 val = (int32)MSR->theCount();
+      dnode_init(&nodes[mersRemain], (void *)(intptr_t)val);
+      dict_insert(merDict, &nodes[mersRemain], &mers[mersRemain]);
+    }
+
+    //  The previous batch may have ended exactly on the last mer, in which
+    //  case there is nothing left to load and nothing to verify.
+    if (mersRemain == mersPerBatch) {
+      dict_destroy(merDict);
+      break;
+    }
+
+    //  STEP 2:  Stream the original file, decrementing the count
+    //
+    fprintf(stderr, "STEP 2 BATCH "uint32FMTW(2)": Stream fasta\n", batch);
+    seqStream    *CS = new seqStream(fastaName, true);
+    merStream    *MS = new merStream(new kMerBuilder(merSize), CS);
+
+    kMerLite      mer;
+    dnode_t      *nod;
+
+    while (MS->nextMer()) {
+      mer = MS->theFMer();
+
+      nod = dict_lookup(merDict, &mer);
+
+      if (nod != 0L) {
+        int32 val = (int32)(intptr_t)dnode_get(nod);
+        val--;
+        dnode_put(nod, (void *)(intptr_t)val);
+      } else {
+        //  Unless the whole meryl file fit into our merDict, we cannot warn if
+        //  we don't find mers.
+        //
+        if (numBatches == 1) {
+          char str[1024];
+          fprintf(stderr, "Didn't find node for mer '%s'\n", mer.merToString(merSize, str));
+        }
+      }
+    }
+
+    delete MS;
+    delete CS;
+
+    //  STEP 3:  Check every node in the tree to make sure that the counts
+    //  are exactly zero.
+    //
+    fprintf(stderr, "STEP 3 BATCH "uint32FMTW(2)": Check\n", batch);
+    nod = dict_first(merDict);
+    while (nod) {
+      int32           val    = (int32)(intptr_t)dnode_get(nod);
+      kMerLite const *nodmer = (kMerLite const *)dnode_getkey(nod);
+
+      if (val != 0) {
+        char str[1024];
+        fprintf(stderr, "Got count "int32FMT" for mer '%s'\n",
+                val,
+                nodmer->merToString(merSize, str));
+      }
+
+      nod = dict_next(merDict, nod);
+    }
+
+
+    //  STEP 4:  Destroy the dictionary.
+    //
+    fprintf(stderr, "STEP 4 BATCH "uint32FMTW(2)": Destroy\n", batch);
+    while ((nod = dict_first(merDict)))
+      dict_delete(merDict, nod);
+    dict_destroy(merDict);
+  }
+
+  //  Every dictionary has been emptied and destroyed by now, so the node
+  //  and key storage can go.
+  delete [] mers;
+  delete [] nodes;
+  delete    MSR;
+
+  return(0);
+}
diff --git a/meryl/test/kmerlite.H b/meryl/test/kmerlite.H
new file mode 100644
index 0000000..b3de1f8
--- /dev/null
+++ b/meryl/test/kmerlite.H
@@ -0,0 +1,133 @@
+#include "bio++.H"
+
+#ifndef KMER_LITE_H
+#define KMER_LITE_H
+
+////////////////////////////////////////
+//
+// This is kMerLite -- derived from kMer.H, removing
+// most of the accessors.
+//
+// Assumes that KMER_WORDS is already defined.
+
+
+class kMerLite {  // A mer held as KMER_WORDS raw 64-bit words; filled by copying the words out of a kMer.
+public:
+
+ // Used by some of the test routines.  Prints every word, in hex, to stderr.
+ void dump(void) const {
+ for (uint32 i=0; i<KMER_WORDS; i++)
+ fprintf(stderr, "kMerLite["uint32FMTW(2)"] = "uint64HEX"\n", i, _wd[i]);
+ };
+
+ // copy() takes each word from a kMer through getWord(); clear() sets every word to zero.
+public:
+ void copy(kMer const &that) {
+ for (uint32 i=0; i<KMER_WORDS; i++)
+ _wd[i] = that.getWord(i);
+ };
+ void clear(void) {
+ for (uint32 i=0; i<KMER_WORDS; i++)
+ _wd[i] = uint64ZERO;
+ };
+ // A default-constructed kMerLite is all zero; constructing or assigning from a kMer copies its words.
+public:
+ kMerLite() {
+ clear();
+ };
+ kMerLite(kMer const &that) {
+ copy(that);
+ };
+ ~kMerLite() {
+ };
+
+ kMerLite &operator=(kMer const &that) {
+ copy(that);
+ return(*this);
+ };
+ // Renders the mer as text into the caller-supplied buffer instr and returns instr.  No buffer length is passed, so the caller must supply enough room.  NOTE(review): no terminator is written here; presumably ::merToString() adds it -- confirm.
+ char *merToString(uint32 merSize, char *instr) const {
+ uint32 lastWord = merSize >> 5;  // merSize / 32: the number of complete 32-base words
+ char *str = instr;
+ // NOTE(review): when merSize is a multiple of 64 this bumps lastWord to (merSize >> 5) + 1, so the loop below starts at _wd[merSize >> 5]; confirm that index is inside _wd[KMER_WORDS].
+ if ((merSize & uint32MASK(6)) == 0)
+ lastWord++;
+
+ // We build the string right to left, print any partial word
+ // first, then print whole words until we run out of words to
+ // print.
+
+ if (merSize & uint32MASK(5)) {  // merSize is not a multiple of 32: emit the leftover bases first
+ ::merToString(merSize & uint32MASK(5), _wd[lastWord], str);
+ str += merSize & uint32MASK(5);
+ }
+
+ while (lastWord > 0) {  // then the complete words, highest index first, 32 bases each
+ lastWord--;
+ ::merToString(32, _wd[lastWord], str);
+ str += 32;
+ }
+
+ return(instr);
+ };
+ // Comparisons.  With a single word, the one word is compared directly.
+#if KMER_WORDS == 1
+
+ bool operator!=(kMerLite const &r) const { return(_wd[0] != r._wd[0]); };
+ bool operator==(kMerLite const &r) const { return(_wd[0] == r._wd[0]); };
+ bool operator<(kMerLite const &r) const { return(_wd[0] < r._wd[0]); };
+ bool operator>(kMerLite const &r) const { return(_wd[0] > r._wd[0]); };
+ bool operator<=(kMerLite const &r) const { return(_wd[0] <= r._wd[0]); };
+ bool operator>=(kMerLite const &r) const { return(_wd[0] >= r._wd[0]); };
+
+#else
+ // With several words: (in)equality ORs together the XOR of every word pair; the ordering operators walk from the highest-indexed word down and decide at the first word that differs.
+ bool operator!=(kMerLite const &r) const {
+ uint64 res = uint64ZERO;
+ for (uint32 i=KMER_WORDS; i--; )
+ res |= _wd[i] ^ r._wd[i];
+ return(res != uint64ZERO);
+ };
+ bool operator==(kMerLite const &r) const {
+ uint64 res = uint64ZERO;
+ for (uint32 i=KMER_WORDS; i--; )
+ res |= _wd[i] ^ r._wd[i];
+ return(res == uint64ZERO);
+ };
+
+ bool operator<(kMerLite const &r) const {
+ for (uint32 i=KMER_WORDS; i--; ) {
+ if (_wd[i] < r._wd[i]) return(true);
+ if (_wd[i] > r._wd[i]) return(false);
+ }
+ return(false);  // all words equal
+ };
+ bool operator>(kMerLite const &r) const {
+ for (uint32 i=KMER_WORDS; i--; ) {
+ if (_wd[i] > r._wd[i]) return(true);
+ if (_wd[i] < r._wd[i]) return(false);
+ }
+ return(false);  // all words equal
+ };
+ bool operator<=(kMerLite const &r) const {
+ for (uint32 i=KMER_WORDS; i--; ) {
+ if (_wd[i] < r._wd[i]) return(true);
+ if (_wd[i] > r._wd[i]) return(false);
+ }
+ return(true);  // all words equal
+ };
+ bool operator>=(kMerLite const &r) const {
+ for (uint32 i=KMER_WORDS; i--; ) {
+ if (_wd[i] > r._wd[i]) return(true);
+ if (_wd[i] < r._wd[i]) return(false);
+ }
+ return(true);  // all words equal
+ };
+#endif
+
+private:
+ uint64 _wd[KMER_WORDS];  // the mer's words; the ordering operators treat the highest index as most significant
+};
+
+
+#endif // KMER_LITE_H
diff --git a/meryl/test/stupidcount.C b/meryl/test/stupidcount.C
new file mode 100644
index 0000000..935096e
--- /dev/null
+++ b/meryl/test/stupidcount.C
@@ -0,0 +1,38 @@
+#include "bio++.H"
+
+// Reads a sequence file, outputs a list of the mers in it. You can
+// then pipe this to unix sort and uniq to do a mercount. You
+// probably don't want to count large things this way...
+
+// Print every mer of a sequence file, one per line, on stdout.
+//
+//   -s file   sequence file to read (required)
+//   -m size   mer size (default 20)
+//
+// Options other than -s and -m are ignored.  Exits with status 1 when no
+// sequence file is given or when -m has no value.
+//
+int
+main(int argc, char **argv) {
+  char    *seqName = 0L;
+  uint32   merSize = 20;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-s") == 0) {
+      //  A trailing -s picks up the null argv[argc] and lands in the
+      //  usage message below.
+      seqName = argv[++arg];
+    } else if (strcmp(argv[arg], "-m") == 0) {
+      //  A trailing -m would hand that null pointer to strtouint32().
+      if (arg + 1 >= argc) {
+        fprintf(stderr, "usage: %s [-m mersize] -s seqfile.fasta\n", argv[0]);
+        exit(1);
+      }
+      merSize = strtouint32(argv[++arg], 0L);
+    }
+    arg++;
+  }
+
+  if (seqName == 0L) {
+    fprintf(stderr, "usage: %s [-m mersize] -s seqfile.fasta\n", argv[0]);
+    exit(1);
+  }
+
+  seqStream   *CS = new seqStream(seqName, true);
+  merStream   *MS = new merStream(new kMerBuilder(merSize), CS);
+  char         str[1024];
+
+  while (MS->nextMer())
+    fprintf(stdout, "%s\n", MS->theFMer().merToString(str));
+
+  delete MS;
+  delete CS;
+
+  exit(0);
+}
diff --git a/meryl/test/test-seq1.fasta b/meryl/test/test-seq1.fasta
new file mode 100644
index 0000000..4e2c60d
--- /dev/null
+++ b/meryl/test/test-seq1.fasta
@@ -0,0 +1,8 @@
+> 1 A 20 CG 0 T
+AAAAAAAAAAAAAAAAAAAAGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCG
+> 1 A 1 CG 0 T ----not-a-mer------ -----is-a-mer-------
+AAAAAAAAAAAAAAAAAAAANGCGCGCGCGCGCGCGCGCGNCGCGCGCGCGCGCGCGCGCG
+> (zero 63 bases)
+ATNCGGATYCGATCGASCHJAGSVHYWERIGHWEEIRVHSDKFVHWIERVHIWRVHKSDFVKS
+> 20 T
+NNNNTTTTTTTTTTTTTTTTTTTTNNNNTTTTTTTTTTTTTTTTTTTTNNNN
diff --git a/meryl/test/test-seq2.fasta b/meryl/test/test-seq2.fasta
new file mode 100644
index 0000000..4d53253
--- /dev/null
+++ b/meryl/test/test-seq2.fasta
@@ -0,0 +1,18 @@
+> 2 A 20 CG 0 T
+AAAAAAAAAAAAAAAAAAAAN
+AAAAAAAAAAAAAAAAAAAAN
+AAAAAAAAAAAAAAAAAAAAN
+AAAAAAAAAAAAAAAAAAAAN
+AAAAAAAAAAAAAAAAAAAAN
+AAAAAAAAAAAAAAAAAAACN
+AAAAAAAAAAAAAAAAAAACN
+AAAAAAAAAAAAAAAAAAACN
+AAAAAAAAAAAAAAAAAAACN
+AAAAAAAAAAAAAAAAAAACN
+AAAAAAAAAAAAAAAAAAAAAAAGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCG
+> 1 A 1 CG 0 T ----not-a-mer------ -----is-a-mer-------
+AAAAAAAAAAAAAAAAAAAANGCGCGCGCGCGCGCGCGCGNCGCGCGCGCGCGCGCGCGCG
+> (zero 63 bases)
+ATNCGGATYCGATCGASCHJAGSVHYWERIGHWEEIRVHSDKFVHWIERVHIWRVHKSDFVKS
+> 20 T
+NNNNTTTTTTTTTTTTTTTTTTTTNNNNTTTTTTTTTTTTTTTTTTTTNNNN
diff --git a/meryl/test/test-seq3.fasta b/meryl/test/test-seq3.fasta
new file mode 100644
index 0000000..5b42b3e
--- /dev/null
+++ b/meryl/test/test-seq3.fasta
@@ -0,0 +1,2 @@
+>
+ACGCTCAGCTACTACGACTTAGAGAAAATAGCGATATAGCGATCGATCGATTAGAGA
diff --git a/meryl/testPositionBias.C b/meryl/testPositionBias.C
new file mode 100644
index 0000000..b49d8e2
--- /dev/null
+++ b/meryl/testPositionBias.C
@@ -0,0 +1,117 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "bio++.H"
+#include "libmeryl.H"
+
+
+// Tally, over every mer in the meryl database 'name', the three letters at
+// offsets base, base+1 and base+2 of both the mer and its reverse
+// complement.  Each triple is packed two bits per letter into an index in
+// 0..63 and the mer's count is added to that slot.
+//
+// Returns a new[]-allocated array of 64 totals; the caller owns it.
+//
+// NOTE(review): the text buffer is 32 chars and the letters are read without
+// checking base against the mer length -- assumes mers shorter than 32 bases
+// and base+2 inside the mer; confirm against merToString().
+//
+uint32 *
+collectCounts(char *name, uint32 base) {
+  merylStreamReader  *reader = new merylStreamReader(name);
+  uint32             *totals = new uint32 [4*4*4];
+  char                text[32];
+  uint32              code   = 0;
+
+  for (uint32 slot=0; slot<4*4*4; slot++)
+    totals[slot] = 0;
+
+  while (reader->nextMer()) {
+
+    //  The mer as stored.
+    reader->theFMer().merToString(text);
+
+    code = 0;
+    for (uint32 k=0; k<3; k++) {
+      code <<= 2;
+      code  |= letterToBits[text[base + k]];
+    }
+
+    totals[code] += reader->theCount();
+
+    //  And its reverse complement, credited with the same count.
+    kMer rc = reader->theFMer();
+    rc.reverseComplement();
+    rc.merToString(text);
+
+    code = 0;
+    for (uint32 k=0; k<3; k++) {
+      code <<= 2;
+      code  |= letterToBits[text[base + k]];
+    }
+
+    totals[code] += reader->theCount();
+  }
+
+  delete reader;
+
+  return(totals);
+}
+
+
+// Print one line per three-letter code (64 lines) comparing three hard-coded
+// meryl databases at mer offset 'base': the letters, the code, the count in
+// each database, and the ratios B/A and C/A (both 0 when A is 0).
+//
+void
+showBias(uint32 base=5) {
+  uint32 *A = collectCounts("CNPT3", base);
+  uint32 *B = collectCounts("25.errorless", base);
+  uint32 *C = collectCounts("25.errorless.simulated", base);
+
+  for (uint32 i=0; i<4*4*4; i++) {
+    double bp = 0.0;
+    double cp = 0.0;
+
+    if (A[i] > 0) {
+      bp = (double)B[i] / (double)A[i];
+      cp = (double)C[i] / (double)A[i];
+    }
+
+    fprintf(stdout, "%c%c%c "uint32FMTW(3)" A "uint32FMTW(6)" B "uint32FMTW(6)" %.5f C "uint32FMTW(6)" %.5f\n",
+            bitsToLetter[(i >> 4) & 0x00000003],
+            bitsToLetter[(i >> 2) & 0x00000003],
+            bitsToLetter[(i >> 0) & 0x00000003],
+            i,
+            A[i],
+            B[i],
+            bp,
+            C[i],
+            cp);
+  }
+
+  //  collectCounts() hands back new[] storage that nobody else releases.
+  delete [] C;
+  delete [] B;
+  delete [] A;
+}
+
+
+// Root-mean-square difference, over the 64 three-letter codes at mer offset
+// 'base', between the ratios B/A and C/A of three hard-coded meryl
+// databases.  A code with no count in A contributes zero.
+//
+double
+computeRMSD(uint32 base) {
+  uint32 *A = collectCounts("CNPT3", base);
+  uint32 *B = collectCounts("25.errorless", base);
+  uint32 *C = collectCounts("25.errorless.simulated", base);
+
+  double rmsd = 0;
+
+  for (uint32 i=0; i<4*4*4; i++) {
+    double bp = 0.0;
+    double cp = 0.0;
+
+    if (A[i] > 0) {
+      bp = (double)B[i] / (double)A[i];
+      cp = (double)C[i] / (double)A[i];
+    }
+
+    rmsd += (bp - cp) * (bp - cp);
+  }
+
+  //  collectCounts() hands back new[] storage that nobody else releases.
+  delete [] C;
+  delete [] B;
+  delete [] A;
+
+  rmsd /= 4*4*4;
+
+  return(sqrt(rmsd));
+}
+
+
+
+
+
+int
+main(int argc, char **argv) {  // Takes no options: argc and argv are never read.
+ // Print the 64-line bias table for the three letters starting at mer offset 5.
+ showBias(5);
+ // Disabled alternative: print computeRMSD() for each starting offset 0 through 22.
+ //for (uint32 i=0; i<23; i++)
+ // fprintf(stdout, "rmsd "uint32FMTW(2)" %f\n", i, computeRMSD(i));
+}
diff --git a/meryl/unaryOp.C b/meryl/unaryOp.C
new file mode 100644
index 0000000..06c7446
--- /dev/null
+++ b/meryl/unaryOp.C
@@ -0,0 +1,59 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "meryl.H"
+#include "libmeryl.H"
+
+
+// Filter one meryl database by count.  Mers from args->mergeFiles[0] whose
+// count compares as requested (<=, >= or == args->desiredCount, chosen by
+// args->personality) are copied, with count and positions, to
+// args->outputFile.  Exits with status 1 unless there is exactly one input,
+// an output name, and one of the three supported personalities.
+//
+void
+unaryOperations(merylArgs *args) {
+
+  if (args->mergeFilesLen != 1) {
+    fprintf(stderr, "ERROR - must have exactly one file!\n");
+    exit(1);
+  }
+
+  if (args->outputFile == 0L) {
+    fprintf(stderr, "ERROR - no output file specified.\n");
+    exit(1);
+  }
+
+  bool const  supported = ((args->personality == PERSONALITY_LEQ) ||
+                           (args->personality == PERSONALITY_GEQ) ||
+                           (args->personality == PERSONALITY_EQ));
+
+  if (!supported) {
+    fprintf(stderr, "ERROR - only personalities lessthan, lessthanorequal,\n");
+    fprintf(stderr, "ERROR - greaterthan, greaterthanorequal, and equal\n");
+    fprintf(stderr, "ERROR - are supported in unaryOperations().\n");
+    fprintf(stderr, "ERROR - this is a coding error, not a user error.\n");
+    exit(1);
+  }
+
+  //  The output inherits mer size, compression, prefix size and the
+  //  positions flag from the input.
+  //
+  merylStreamReader  *in  = new merylStreamReader(args->mergeFiles[0]);
+  merylStreamWriter  *out = new merylStreamWriter(args->outputFile,
+                                                  in->merSize(),
+                                                  in->merCompression(),
+                                                  in->prefixSize(),
+                                                  in->hasPositions());
+
+  while (in->nextMer()) {
+    bool  keep = false;
+
+    switch (args->personality) {
+      case PERSONALITY_LEQ:
+        keep = (in->theCount() <= args->desiredCount);
+        break;
+      case PERSONALITY_GEQ:
+        keep = (in->theCount() >= args->desiredCount);
+        break;
+      case PERSONALITY_EQ:
+        keep = (in->theCount() == args->desiredCount);
+        break;
+    }
+
+    if (keep)
+      out->addMer(in->theFMer(), in->theCount(), in->thePositions());
+  }
+
+  delete in;
+  delete out;
+}
diff --git a/seagen/Make.include b/seagen/Make.include
new file mode 100644
index 0000000..334a6d5
--- /dev/null
+++ b/seagen/Make.include
@@ -0,0 +1,54 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+
+src := $/searchGENOME.C \
+ $/configuration.C \
+ $/encodedQuery.C \
+ $/thr-deadlock.C \
+ $/thr-loader.C \
+ $/thr-search.C \
+ $/thr-output.C \
+ $/hitMatrix-sort.C \
+ $/aHit.H \
+ $/aHit.C \
+ $/hitMatrix.H \
+ $/posix.H \
+ $/searchGENOME.H
+src_C := $(filter %.C,${src})
+other_C := $/hitConverter.C \
+ $/filterEST.C \
+ $/filterEST-complicated.C \
+ $/filterMRNA.C \
+ $/filterNULL.C \
+ $/sortHits.C \
+ $/filtertest.C \
+ $/hitReader.C
+
+$/.CXX_SRCS := ${src_C} ${other_C} $/hitMatrix.C
+$/.CXX_EXES := $/seagen \
+ $/hitConverter \
+ $/filterEST $/filterMRNA $/filterNULL $/filtertest \
+ $/sortHits $/filterESTsimple
+
+$/.CLEAN :=$/*.o
+
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBKMER/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
+
+$/seagen: $/hitMatrix.o
+
+$/seagen: ${src_C:.C=.o} \
+ ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+$/hitConverter: $/hitConverter.o $/aHit.o ${LIBUTL/}libutil.a
+$/filterEST: $/filterEST.o $/filterEST-complicated.o $/hitReader.o $/aHit.o ${LIBUTL/}libutil.a
+$/filterESTsimple: $/filterESTsimple.o $/hitReader.o $/aHit.o ${LIBUTL/}libutil.a
+$/filterMRNA: $/filterMRNA.o $/hitReader.o $/aHit.o ${LIBUTL/}libutil.a
+$/filterNULL: $/filterNULL.o $/hitReader.o $/aHit.o ${LIBUTL/}libutil.a
+$/sortHits: $/sortHits.o $/aHit.o ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/filtertest: $/filtertest.o
diff --git a/seagen/aHit.C b/seagen/aHit.C
new file mode 100644
index 0000000..c24775a
--- /dev/null
+++ b/seagen/aHit.C
@@ -0,0 +1,119 @@
+#include "aHit.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+// Write the raw bytes of *a to F.  The record is the in-memory struct itself,
+// so it can only be read back by code built with the same aHit layout.
+//
+void ahit_writeBinary(aHit *a, FILE *F) {
+  //  fwrite() returns the number of whole records written; anything short
+  //  of one means this hit did not reach the file.
+  if (fwrite(a, sizeof(aHit), 1, F) != 1)
+    fprintf(stderr, "ahit_writeBinary()-- failed to write hit.\n");
+}
+
+// Read one raw aHit record from F into *a.  On a short read *a is not
+// guaranteed to hold a complete record.
+//
+void ahit_readBinary(aHit *a, FILE *F) {
+  //  Running out of input is left for the caller to detect (presumably with
+  //  feof()); only a genuine stream error is reported here.
+  if ((fread(a, sizeof(aHit), 1, F) != 1) && ferror(F))
+    fprintf(stderr, "ahit_readBinary()-- failed to read hit.\n");
+}
+
+void ahit_readBinary(aHit *a, readBuffer *F) {  // Same raw record read as the FILE* overload, but sourced from a readBuffer.
+ F->read((char *)a, sizeof(aHit));  // requests sizeof(aHit) bytes into *a; whatever read() reports back is not examined
+}
+
+// Write *a to F as one text line:
+//   -<f|r> -e <qsIdx> -D <dsIdx> <dsLo> <dsHi> -M <covered> <matched> <numMers>
+//
+void ahit_printASCII(aHit *a, FILE *F) {
+  char const  orientation = (a->_forward) ? 'f' : 'r';
+
+  fprintf(F, "-%c -e "uint32FMT" -D "uint32FMT" "uint32FMT" "uint32FMT" -M "uint32FMT" "uint32FMT" "uint32FMT"\n",
+          orientation,
+          a->_qsIdx,
+          a->_dsIdx, a->_dsLo, a->_dsHi,
+          a->_covered, a->_matched, a->_numMers);
+}
+
+
+// We don't read the string here so that we can use a static buffer
+// in whatever loop we read with.
+//
+// e.g.,
+//
+// char b[1025];
+// while (!feof(I)) {
+// fgets(b, 1024, I);
+// if (!feof(I))
+// ahit_parseString(a, b);
+// }
+//
+// Note that using sscanf, while easy to implement, and safe
+// (looking, anyways), is terribly slow, and not really that safe.
+//
+// char c;
+// sscanf(b, "-%c -e %d -D %d %d %d -M %d %d %d",
+// &c,
+// &a->_qsIdx,
+// &a->_dsIdx,
+// &a->_dsLo,
+// &a->_dsHi,
+// &a->_covered,
+// &a->_matched,
+// &a->_numMers);
+// a->_forward = (c == 'f');
+//
+// fast: 138.440u 40.500s 4:04.61 73.1% 0+2k 327822+0io 4pf+0w
+// slow: 737.587u 38.652s 13:12.42 97.9% 0+2k 328006+0io 11pf+0w
+//
+void ahit_parseString(aHit *a, char *b) {  // Fill *a from one text line laid out the way ahit_printASCII() writes it; positions are fixed offsets and the input is not validated.
+ char *c = b+1;  // step over the leading '-'; c is now on the direction letter
+ // NOTE: _merged is never assigned anywhere in this function.
+ a->_forward = (*c == 'f');  // any letter other than 'f' is taken as reverse
+
+ c += 1;  // c is on the blank before "-e", so c[2] is the option letter
+
+ if (c[2] != 'e') fprintf(stderr, "'%s' didn't get -e\n", b);  // warning only: parsing carries on regardless
+
+ c += 4;  // step over " -e " to the number
+ a->_qsIdx = (uint32)strtoul(c, &c, 10);  // strtoul leaves c just past the digits it consumed
+
+ // If we get a "-D" next then we are reading search output,
+ // otherwise, we are (hopefully) reading seatac output.
+
+ if (c[2] == 'D') {
+
+ // searchGENOME format here!
+
+ c += 4;  // step over " -D "
+ a->_dsIdx = (uint32)strtoul(c, &c, 10);
+ a->_dsLo = (uint32)strtoul(c, &c, 10);
+ a->_dsHi = (uint32)strtoul(c, &c, 10);
+
+ if (c[2] == 'M') {  // the " -M covered matched numMers" tail is optional
+ c += 4;
+ a->_covered = (uint32)strtoul(c, &c, 10);
+ a->_matched = (uint32)strtoul(c, &c, 10);
+ a->_numMers = (uint32)strtoul(c, &c, 10);
+ } else {
+ a->_covered = 0;
+ a->_matched = 0;
+ a->_numMers = 0;
+ }
+ } else {
+ // With the block below compiled out, this path leaves every field except _forward and _qsIdx untouched.
+ // seatac format here!
+#if 0
+ fprintf(stderr, "seatac?\n");
+
+
+ // We make horrible use of variable names here -- covered and
+ // matched are the regions on the first sequence, and numMers
+ // is the "F" value.
+
+ a->_covered = (uint32)strtoul(c, &c, 10);
+ a->_matched = (uint32)strtoul(c, &c, 10);
+
+ c += 4;
+ a->_dsIdx = (uint32)strtoul(c, &c, 10);
+ a->_dsLo = (uint32)strtoul(c, &c, 10);
+ a->_dsHi = (uint32)strtoul(c, &c, 10);
+
+ c += 4;
+ a->_numMers = (uint32)strtoul(c, &c, 10);
+#endif
+ }
+}
diff --git a/seagen/aHit.H b/seagen/aHit.H
new file mode 100644
index 0000000..f277f74
--- /dev/null
+++ b/seagen/aHit.H
@@ -0,0 +1,30 @@
+#ifndef AHIT_H
+#define AHIT_H
+
+#include "bio.h"
+#include "util++.H"
+#include <stdio.h>
+
+struct aHit {  // One hit record.  ahit_writeBinary()/ahit_readBinary() move it to and from a file as raw bytes, so the field layout is the file format.
+ uint32 _forward : 1;  // set when the text form starts "-f", clear for any other direction letter
+ uint32 _merged : 1;  // NOTE(review): not set or printed by the routines in aHit.C; confirm its meaning with the code that uses it
+ uint32 _qsIdx;  // the number after "-e" in the text form
+ uint32 _dsIdx;  // first number after "-D"
+ uint32 _dsLo;  // second number after "-D"
+ uint32 _dsHi;  // third number after "-D"
+ uint32 _covered;  // first number after "-M"; ahit_parseString() stores 0 when "-M" is absent
+ uint32 _matched;  // second number after "-M"; 0 when "-M" is absent
+ uint32 _numMers;  // third number after "-M"; 0 when "-M" is absent
+};
+
+
+void ahit_writeBinary(aHit *a, FILE *F);
+void ahit_readBinary(aHit *a, FILE *F);
+
+void ahit_readBinary(aHit *a, readBuffer *F);
+
+void ahit_printASCII(aHit *a, FILE *F);
+void ahit_parseString(aHit *a, char *b);
+
+#endif // AHIT_H
+
diff --git a/seagen/analysis/dumpScores.pl b/seagen/analysis/dumpScores.pl
new file mode 100644
index 0000000..5c3bf20
--- /dev/null
+++ b/seagen/analysis/dumpScores.pl
@@ -0,0 +1,50 @@
+#!/usr/local/bin/perl
+
+#
+#
+#
+
+my $currentestid = -1;
+my $outprefix;
+my @outline;
+my $num = 0;
+
+my $maxscore = 0;
+
+# Group consecutive input lines by EST id (third field) and print one line
+# per group: the id, the $sm of the group's last line, the number of lines,
+# then the "s1,s2,i,c" tuples sorted numerically in descending order (the
+# comparison sees the leading s1 of each tuple).
+#
+# Stops when an id of 222439 or more starts a new group, or when STDIN is
+# exhausted.  Without the end-of-input test the loop never terminates on a
+# shorter input: every read returns undef and @outline grows without bound.
+#
+while ($currentestid < 222439) {
+  my $line  = <STDIN>;
+  my $atEOF = !defined($line);
+
+  my ($dir, undef, $estid, undef, $chr, $beg, $end, undef, $s1, $s2, $sm, undef, $i, $c) = $atEOF ? () : split('\s+', $line);
+
+  if (!$atEOF && ($currentestid == $estid)) {
+    push @outline, "$s1,$s2,$i,$c";
+    $maxscore = $sm;
+    $num++;
+  } else {
+    @outline = sort { $b <=> $a } @outline;
+
+    if (defined($outprefix)) {
+      print "$outprefix\t$maxscore\t$num\t";
+
+      # Temporarily print list items space-separated with a trailing newline.
+      my $savedOFS = $,;
+      my $savedORS = $\;
+      $, = " ";
+      $\ = "\n";
+
+      print @outline;
+
+      $, = $savedOFS;
+      $\ = $savedORS;
+    }
+
+    # The pending group has been flushed; nothing more to read.
+    last if ($atEOF);
+
+    $currentestid = $estid;
+    $outprefix = "$estid";
+
+    undef @outline;
+
+    push @outline, "$s1,$s2,$i,$c";
+    $maxscore = $sm;
+    $num = 1;
+  }
+}
+
diff --git a/seagen/analysis/plotScoresSingly.pl b/seagen/analysis/plotScoresSingly.pl
new file mode 100644
index 0000000..6f710fe
--- /dev/null
+++ b/seagen/analysis/plotScoresSingly.pl
@@ -0,0 +1,71 @@
+#!/usr/local/bin/perl
+
+#
+# bzip2 -dc /raid/WORK/EMpaper/run1-nofiltering/all-scored-hits.sorted.bz2 | perl dumpScores.pl | perl plotScoresSingly.pl
+#
+
+use strict;
+$| = 1;
+
+my $tmppath = "/tmp";
+
+# First line was blank. Don't know why.
+#my $junk = <STDIN>;
+
+# For every input line (est id, max score, hit count, then "a,b,i,c" tuples):
+# write three scratch data files and a gnuplot script under $tmppath, render
+# them to <first three digits>/<six-digit id>.gif, and remove the scratch
+# files when the rendering pipeline succeeds.
+#
+while (!eof(STDIN)) {
+  $_ = <STDIN>;
+
+  my ($estid, $maxscore, $numhits, @vals) = split '\s+', $_;
+
+  # An unopened handle would make every print below a silent no-op and leave
+  # gnuplot reading missing or stale files, so stop at the first failure.
+  open(A, "> $tmppath/hits-$estid.dat") or die "Can't write '$tmppath/hits-$estid.dat': $!\n";
+  open(B, "> $tmppath/iden-$estid.dat") or die "Can't write '$tmppath/iden-$estid.dat': $!\n";
+  open(C, "> $tmppath/covr-$estid.dat") or die "Can't write '$tmppath/covr-$estid.dat': $!\n";
+  foreach my $h (@vals) {
+    my ($a, $b, $i, $c) = split ',', $h;
+
+    # Score as a fraction of the line's max score.
+    $a /= $maxscore;
+    print A "$a\n";
+
+    $i /= 100.0;
+    print B "$i\n";
+
+    $c /= 100.0;
+    print C "$c\n";
+  }
+  close(C);
+  close(B);
+  close(A);
+
+  # Zero-padded six-digit id, and its first three digits as the directory.
+  my $output = substr("0000000000$estid", -6, 6);
+  my $direct = substr("0000000000$estid", -6, 3);
+
+  print "$output\r";
+
+  system("mkdir $direct") if (! -d "$direct");
+
+  open(O, "> $tmppath/plot-$estid.gpl") or die "Can't write '$tmppath/plot-$estid.gpl': $!\n";
+  print O "set terminal pbm color\n";
+  print O "set output\n";
+  print O "set pointsize 0.5\n";
+  print O "set xtics 10\n";
+  #print O "set size 1.5,1.5\n";
+  print O "plot [-5:200][0.0:1.2] ";
+  print O " 0.95 notitle lt 0, ";
+  print O " 0.80 notitle lt 0, ";
+  print O " 0.50 notitle lt 0, ";
+  print O " \"$tmppath/hits-$estid.dat\" using 1 notitle with linespoints 1, ";
+  print O " \"$tmppath/covr-$estid.dat\" using 1 notitle with points 3, ";
+  print O " \"$tmppath/iden-$estid.dat\" using 1 notitle with points 2\n";
+  close(O);
+
+  my $cmd = "";
+
+  $cmd = "gnuplot $tmppath/plot-$estid.gpl | ppmtogif -quiet > $direct/$output.gif";
+  $cmd .= " && rm -f";
+  $cmd .= " $tmppath/plot-$estid.gpl";
+  $cmd .= " $tmppath/hits-$estid.dat";
+  $cmd .= " $tmppath/iden-$estid.dat";
+  $cmd .= " $tmppath/covr-$estid.dat";
+  system("$cmd");
+}
+close(STDIN);
diff --git a/seagen/configuration.C b/seagen/configuration.C
new file mode 100644
index 0000000..2cf9143
--- /dev/null
+++ b/seagen/configuration.C
@@ -0,0 +1,345 @@
+#include "searchGENOME.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+//  Set every option to its built-in default.  read() overrides these
+//  from the command line.
+//
+configuration::configuration(void) {
+
+  _beVerbose           = false;
+
+  _merSize             = 20;
+  _merSkip             = 0;
+  _numSearchThreads    = 4;
+
+  //  Both strands are searched unless -forward or -reverse is given.
+  _doReverse           = true;
+  _doForward           = true;
+
+  _maxDiagonal         = 25;
+  _maxGap              = 0;
+  _qsOverlap           = 15;
+  _dsOverlap           = 15;
+
+  //  Alternate match extension scheme
+  _extendWeight        = 0;
+  _extendMinimum       = 0;
+  _extendAlternate     = false;
+
+  _maxIntronLength     = 1000000000;
+
+  _smallSequenceCutoff = 0;
+
+  _minLengthSingle     = 0;
+  _minCoverageSingle   = 0.0;
+  _minLengthMultiple   = 0;
+  _minCoverageMultiple = 0.0;
+
+  _dbFileName          = 0L;
+  _qsFileName          = 0L;
+  _maskFileName        = 0L;
+  _onlyFileName        = 0L;
+  _outputFileName      = 0L;
+  _queryMatchFileName  = 0L;
+
+  //  Output goes to stdout, and no match counts are written, until
+  //  read() opens files for them.
+  _outputFile          = STDOUT_FILENO;
+  _matchCountsFile     = -1;
+
+  _tableTemporaryFileName = 0L;
+  _tableFileName          = 0L;
+  _tableBuildOnly         = false;
+
+  _binaryOutput        = false;
+
+  //  _dbSTREAM was previously left uninitialized; give it a defined
+  //  value like every other pointer member.
+  _dbSTREAM            = 0L;
+
+  _qsFASTA             = 0L;
+  _maskDB              = 0L;
+  _onlyDB              = 0L;
+  _positions           = 0L;
+
+  _numberOfQueries     = 0;
+
+  //  All timers start at "now".
+  _startTime           = getTime();
+  _initTime            = _startTime;
+  _buildTime           = _startTime;
+  _searchTime          = _startTime;
+
+  _loaderQueue         = 16 * 1024;
+  _loaderSleep.tv_sec  = 1;
+  _loaderSleep.tv_nsec = 0;
+  _loaderWarnings      = false;
+
+  //  10 milliseconds
+  _searchSleep.tv_sec  = 0;
+  _searchSleep.tv_nsec = 10000000;
+
+  _writerQueue         = 32 * 1024;
+  _writerSleep.tv_sec  = 1;
+  _writerSleep.tv_nsec = 0;
+  _writerWarnings      = false;
+}
+
+//  Report the search rate (when verbose), close the output
+//  descriptors, and delete the objects this configuration holds.
+//
+configuration::~configuration() {
+
+  //  _qsFASTA is null until something loads the queries; don't
+  //  dereference it just because -verbose was given.
+  //
+  if (_beVerbose && _qsFASTA) {
+    uint32 nq = _qsFASTA->getNumberOfSequences();
+    double tm = _searchTime - _buildTime;
+    fprintf(stderr, "\n"uint32FMTW(7)" sequences in %5.2f seconds, %8.3f per second.\n", nq, tm, nq/tm);
+  }
+
+  //  Close each descriptor only if it is valid, and test the return
+  //  value of each close().  _matchCountsFile stays -1 unless -count
+  //  was used, and close(-1) fails with EBADF -- which used to show up
+  //  as a bogus "couldn't close the output file" message.
+  //
+  if ((_outputFile >= 0) && (close(_outputFile) != 0))
+    fprintf(stderr, "Couldn't close the output file '%s': %s\n",
+            _outputFileName ? _outputFileName : "(stdout)", strerror(errno));
+
+  if ((_matchCountsFile >= 0) && (close(_matchCountsFile) != 0))
+    fprintf(stderr, "Couldn't close the match counts file '%s': %s\n",
+            _queryMatchFileName ? _queryMatchFileName : "(unnamed)", strerror(errno));
+
+  delete _qsFASTA;
+  delete _maskDB;
+  delete _onlyDB;
+  delete _positions;
+}
+
+//  Help text printed by configuration::usage().  It is used as a
+//  printf format; the single %s is the program name.
+//
+static char const *usageString =
+"usage: %s [options]\n"
+"\n"
+"Algorithm Options:\n"
+" -mersize k Use k-mers\n"
+" -merskip j Skip j mers between each mer inserted into table\n"
+" -forward Search only the normal query sequences\n"
+" -reverse Search only the reverse-complemented query sequences\n"
+" -maxdiagonal d\n"
+" -maxgap g\n"
+" -qoverlap q\n"
+" -doverlap d\n"
+" -maxintron m\n"
+" -smallsequence s\n"
+" -singlelength l\n"
+" -singlecoverage c\n"
+" -multiplelength l\n"
+" -multiplecoverage c\n"
+" -extendweight w\n"
+" -extendminimum m\n"
+"\n"
+"Process Options\n"
+" -numthreads n Use n search threads\n"
+"\n"
+" -loaderqueue h Size of the loader queue\n"
+" -loadersleep t Time the loader will sleep when its output queue is full\n"
+" -loaderwarnings Enable warning messages for the loader\n"
+"\n"
+" -searchsleep t Time the searcher will sleep when it has no input\n"
+"\n"
+" -writerqueue h Size of the output queue\n"
+" -writersleep t Time the writer will sleep when it has nothing to write\n"
+" -writerwarnings Enable warning messages for the writer\n"
+"\n"
+"\n"
+" -buildtables datfile If 'datfile' doesn't exist, build the tables, write\n"
+" them to 'datfile' and exit.\n"
+" -usetables datfile Load the tables from 'datfile' file and do the compute.\n"
+" If 'datfile' doesn't exist, an implicit -buildtables is\n"
+" performed.\n"
+"Input Options:\n"
+" -mask f Ignore all mers listed in file f\n"
+" -only f Use only the mers listed in file f\n"
+" -cdna c.fasta Query sequences (the cDNA, the stream)\n"
+" -stream An alias for -cdna\n"
+" -genomic g.fasta Database sequences (the genome, the table)\n"
+" -table An alias for -genomic\n"
+" -use #,#,#,# using only those sequences specified\n"
+" -use file using only those sequences listed in the file\n"
+"\n"
+"Output Options\n"
+" -verbose Entertain the user\n"
+" -binary Write the hits in a binary format\n"
+" -output f Write output to file f\n"
+" -count f Write counts of hits to file f\n";
+ // Print the command line help to stderr; 'name' is the program name to show.
+void
+configuration::usage(char *name) {
+ fprintf(stderr, usageString, name); // usageString is a printf format whose single %s receives 'name'
+}
+
+
+
+//  Return the value that follows option argv[arg], advancing 'arg'
+//  to it.  Exits with a message if the command line ends first,
+//  instead of handing a null pointer to atoi() and friends.
+//
+static
+char *
+optionValue(int argc, char **argv, int &arg) {
+  if (arg + 1 >= argc) {
+    fprintf(stderr, "ERROR: option '%s' requires a value.\n", argv[arg]);
+    exit(1);
+  }
+  return(argv[++arg]);
+}
+
+
+//  Parse the command line into this configuration, check a few
+//  constraints between options, and open the output files.
+//
+//  Prints the usage and exits if no options are given.  Unknown
+//  options are reported on stderr and otherwise ignored.  Exits on a
+//  missing option value, on conflicting options, or if an output file
+//  cannot be opened.
+//
+void
+configuration::read(int argc, char **argv) {
+
+  if (argc < 2) {
+    usage(argv[0]);
+    exit(1);
+  }
+
+  for (int arg=1; arg < argc; arg++) {
+    char *opt = argv[arg];
+
+    if        (strcmp(opt, "-mersize") == 0) {
+      _merSize = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-merskip") == 0) {
+      _merSkip = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-numthreads") == 0) {
+      _numSearchThreads = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-mask") == 0) {
+      _maskFileName = optionValue(argc, argv, arg);
+    } else if (strcmp(opt, "-only") == 0) {
+      _onlyFileName = optionValue(argc, argv, arg);
+    } else if ((strcmp(opt, "-cdna") == 0) ||
+               (strcmp(opt, "-stream") == 0)) {
+      _qsFileName = optionValue(argc, argv, arg);
+    } else if ((strcmp(opt, "-genomic") == 0) ||
+               (strcmp(opt, "-table") == 0)) {
+      _dbFileName = optionValue(argc, argv, arg);
+    } else if (strcmp(opt, "-buildtemporary") == 0) {
+      _tableTemporaryFileName = optionValue(argc, argv, arg);
+    } else if (strcmp(opt, "-buildtables") == 0) {
+      _tableFileName  = optionValue(argc, argv, arg);
+      _tableBuildOnly = true;
+    } else if ((strcmp(opt, "-usetables") == 0) ||
+               (strcmp(opt, "-positions") == 0)) {
+      _tableFileName  = optionValue(argc, argv, arg);
+      _tableBuildOnly = false;
+    } else if (strcmp(opt, "-forward") == 0) {
+      _doForward = true;
+      _doReverse = false;
+    } else if (strcmp(opt, "-reverse") == 0) {
+      _doReverse = true;
+      _doForward = false;
+    } else if (strcmp(opt, "-verbose") == 0) {
+      _beVerbose = true;
+    } else if (strcmp(opt, "-binary") == 0) {
+      _binaryOutput = true;
+    } else if (strcmp(opt, "-output") == 0) {
+      _outputFileName = optionValue(argc, argv, arg);
+    } else if (strcmp(opt, "-count") == 0) {
+      _queryMatchFileName = optionValue(argc, argv, arg);
+    } else if (strcmp(opt, "-maxdiagonal") == 0) {
+      _maxDiagonal = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-maxgap") == 0) {
+      _maxGap = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-qoverlap") == 0) {
+      _qsOverlap = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-doverlap") == 0) {
+      _dsOverlap = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-maxintron") == 0) {
+      _maxIntronLength = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-smallsequence") == 0) {
+      _smallSequenceCutoff = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-singlelength") == 0) {
+      _minLengthSingle = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-multiplelength") == 0) {
+      _minLengthMultiple = atoi(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-singlecoverage") == 0) {
+      _minCoverageSingle = atof(optionValue(argc, argv, arg));
+    } else if (strcmp(opt, "-multiplecoverage") == 0) {
+      _minCoverageMultiple = atof(optionValue(argc, argv, arg));
+
+      //  Both -extend options used to be matched on their first seven
+      //  characters ("-extend"), so -extendminimum always set the
+      //  weight.  Match -extendminimum on eight characters first; the
+      //  seven character test that follows keeps every other
+      //  abbreviation meaning -extendweight, as before.
+      //
+    } else if (strncmp(opt, "-extendminimum", 8) == 0) {
+      _extendMinimum   = atoi(optionValue(argc, argv, arg));
+      _extendAlternate = true;
+    } else if (strncmp(opt, "-extendweight", 7) == 0) {
+      _extendWeight    = atoi(optionValue(argc, argv, arg));
+      _extendAlternate = true;
+
+    } else if (strncmp(opt, "-loaderqueue", 8) == 0) {
+      _loaderQueue = atoi(optionValue(argc, argv, arg));
+    } else if (strncmp(opt, "-loadersleep", 8) == 0) {
+      setTime(&_loaderSleep, atof(optionValue(argc, argv, arg)));
+    } else if (strncmp(opt, "-loaderwarnings", 8) == 0) {
+      _loaderWarnings = true;
+
+    } else if (strncmp(opt, "-searchsleep", 8) == 0) {
+      setTime(&_searchSleep, atof(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(opt, "-writerqueue", 8) == 0) {
+      _writerQueue = atoi(optionValue(argc, argv, arg));
+    } else if (strncmp(opt, "-writersleep", 8) == 0) {
+      setTime(&_writerSleep, atof(optionValue(argc, argv, arg)));
+    } else if (strncmp(opt, "-writerwarnings", 8) == 0) {
+      _writerWarnings = true;
+
+    } else {
+      fprintf(stderr, "Unknown option '%s'\n", opt);
+    }
+  }
+
+  //
+  //  Make sure some constraints are met
+  //
+
+  if (_maskFileName && _onlyFileName) {
+    fprintf(stderr, "ERROR: At most one of -mask and -only may be used.\n");
+    exit(1);
+  }
+
+  //
+  //  Check that the mers are at least adjacent
+  //
+  if (_merSkip >= _merSize) {
+    fprintf(stderr, "ERROR: Mers are not adjacent; make sure merskip < mersize.\n");
+    exit(1);
+  }
+
+  //  Warn (but continue) if we don't get reasonable signal criteria
+  //
+  if (((_minLengthSingle   == 0) && (_minCoverageSingle   == 0.0)) ||
+      ((_minLengthMultiple == 0) && (_minCoverageMultiple == 0.0)))
+    fprintf(stderr, "WARNING: Minimum match lengths not specified. All matches will be reported.\n");
+
+
+  //  Open output file.  Failure is detected from the return value of
+  //  open(); errno is only meaningful after that.
+  //
+  if (_outputFileName) {
+    _outputFile = open(_outputFileName,
+                       O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+                       S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (_outputFile < 0) {
+      fprintf(stderr, "Couldn't open the output file '%s'?\n%s\n", _outputFileName, strerror(errno));
+      exit(1);
+    }
+  }
+
+
+  if (_queryMatchFileName) {
+    _matchCountsFile = open(_queryMatchFileName,
+                            O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+                            S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (_matchCountsFile < 0) {
+      fprintf(stderr, "Couldn't open the match counts file '%s'?\n%s\n", _queryMatchFileName, strerror(errno));
+      exit(1);
+    }
+  }
+
+
+  //  Gotta go somewhere!
+  //
+  _startTime = getTime();
+}
diff --git a/seagen/configuration.H b/seagen/configuration.H
new file mode 100644
index 0000000..b550105
--- /dev/null
+++ b/seagen/configuration.H
@@ -0,0 +1,107 @@
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+
+//
+// A singleton for working with the command line parameters.
+//
+
+#include "bio++.H"
+#include "util++.H"
+
+class configuration { // All command line options plus shared run-time state; one global instance, 'config'.
+public:
+ bool _beVerbose; // -verbose; false by default
+
+ uint32 _merSize; // -mersize; 20 by default
+ uint32 _numSearchThreads; // -numthreads; 4 by default
+
+ uint32 _merSkip; // -merskip; 0 by default, read() requires it to be less than _merSize
+
+ bool _doReverse; // both true by default; -forward clears _doReverse,
+ bool _doForward; // -reverse clears _doForward
+
+ uint32 _maxDiagonal; // -maxdiagonal; 25 by default
+ uint32 _maxGap; // -maxgap; 0 by default
+ uint32 _qsOverlap; // -qoverlap; 15 by default
+ uint32 _dsOverlap; // -doverlap; 15 by default
+
+ uint32 _extendWeight; // -extendweight; 0 by default
+ uint32 _extendMinimum; // -extendminimum; 0 by default
+ bool _extendAlternate; // set when either -extend option is given
+
+ uint32 _maxIntronLength; // -maxintron; 1000000000 by default
+
+ uint32 _smallSequenceCutoff; // -smallsequence; 0 by default
+
+ uint32 _minLengthSingle; // -singlelength; 0 by default
+ double _minCoverageSingle; // -singlecoverage; 0.0 by default
+
+ double _minCoverageMultiple; // -multiplecoverage; 0.0 by default
+ uint32 _minLengthMultiple; // -multiplelength; 0 by default
+ // The file names below point into argv (or are null); they are never freed here.
+ char *_dbFileName; // -genomic or -table
+ char *_qsFileName; // -cdna or -stream
+ char *_maskFileName; // -mask; cannot be combined with -only
+ char *_onlyFileName; // -only; cannot be combined with -mask
+ char *_outputFileName; // -output
+ char *_queryMatchFileName; // -count
+
+ int _outputFile; // file descriptor; STDOUT_FILENO unless -output opened a file
+ int _matchCountsFile; // file descriptor; -1 unless -count opened a file
+
+ char *_tableTemporaryFileName; // -buildtemporary
+
+ char *_tableFileName; // -buildtables, -usetables or -positions
+ bool _tableBuildOnly; // true only when -buildtables was the last table option seen
+
+ bool _binaryOutput; // -binary; false by default
+
+ seqStream *_dbSTREAM; // not deleted by ~configuration()
+
+ // Running state
+ //
+ seqCache *_qsFASTA; // these four start out null and are
+ existDB *_maskDB; // deleted by ~configuration()
+ existDB *_onlyDB;
+ positionDB *_positions;
+
+ uint32 _numberOfQueries;
+
+ // Wall clock times
+ //
+ double _startTime; // set at construction and again at the end of read()
+ double _initTime;
+ double _buildTime;
+ double _searchTime; // ~configuration() reports _searchTime - _buildTime as the search time
+
+ // sweatShop queue sizes; maximum number of sequences pre-loaded,
+ // maximum number of results waiting for output.
+ //
+ uint32 _loaderQueue; // -loaderqueue; 16 * 1024 by default
+ struct timespec _loaderSleep; // -loadersleep, in seconds; 1 second by default
+ bool _loaderWarnings; // -loaderwarnings
+
+ struct timespec _searchSleep; // -searchsleep, in seconds; 10 milliseconds by default
+
+ uint32 _writerQueue; // -writerqueue; 32 * 1024 by default
+ struct timespec _writerSleep; // -writersleep, in seconds; 1 second by default
+ bool _writerWarnings; // -writerwarnings
+
+ configuration();
+ ~configuration();
+
+ void usage(char *name);
+ void read(int argc, char **argv);
+ // Convert 't' seconds (fractions allowed) into a timespec.
+ void setTime(struct timespec *ts, double t) {
+ ts->tv_sec = (time_t)floor(t); // whole seconds
+ ts->tv_nsec = (long)((t - ts->tv_sec) * 1e9); // remaining fraction, in nanoseconds
+ };
+private:
+ void addToUse(uint32 v); // NOTE(review): no definition of these two is visible in configuration.C;
+ void parseUseLine(char *line); // presumably leftovers from the -use option -- confirm before calling.
+};
+
+extern configuration config;
+
+#endif // CONFIGURATION_H
diff --git a/seagen/encodedQuery.C b/seagen/encodedQuery.C
new file mode 100644
index 0000000..e029dd0
--- /dev/null
+++ b/seagen/encodedQuery.C
@@ -0,0 +1,231 @@
+#include <new>
+
+#include "encodedQuery.H"
+
+//  Encode every k-mer of sequence S, in forward orientation.
+//
+//  For each of the (length - k + 1) positions, _mers[] holds the mer
+//  packed two bits per letter, and _skip[] is 1 if the mer contains a
+//  letter that letterToBits[] maps to 0xff (such mers are not usable).
+//  _mersAvail counts the usable mers.
+//
+//  If the sequence is shorter than k, no tables are allocated and
+//  the query has zero mers.
+//
+encodedQuery::encodedQuery(seqInCore  *S,
+                           uint32      k) {
+
+  _iid             = S->getIID();
+  _sequenceLength  = S->sequenceLength();
+  _merSize         = k;
+  _mersTotal       = 0;
+  _mersAvail       = 0;
+  _mers            = 0L;
+  _skip            = 0L;
+  _numberOfResults = 0;
+  _output          = 0L;
+  _outputLen       = 0;
+  _outputMax       = 0;
+
+  if (k > _sequenceLength)
+    return;
+
+  char const  *seq    = S->sequence();
+  uint32       seqLen = S->sequenceLength();
+
+  _mersTotal = seqLen - k + 1;
+  _mersAvail = 0;
+  _mers      = new uint64 [_mersTotal];
+  _skip      = new uint8  [_mersTotal];
+
+  uint64   substring      = uint64ZERO;
+  uint64   mermask        = uint64MASK(2 * k);
+  int32    timeUntilValid = k;   //  letters still needed before the window holds k valid letters
+
+  for (uint32 i=0; i<seqLen; ) {
+
+    //  Index the table with an unsigned value: plain char may be
+    //  signed, and a byte above 0x7f would otherwise be a negative
+    //  index.  (Assumes letterToBits[] covers 0..255 -- TODO confirm.)
+    //
+    unsigned char  letter = (unsigned char)seq[i];
+
+    substring <<= 2;
+    substring  &= mermask;
+
+    if (letterToBits[letter] != 0xff) {
+      substring |= letterToBits[letter];
+      timeUntilValid--;
+    } else {
+      //  Invalid letter; every mer overlapping it is skipped.
+      timeUntilValid = k;
+    }
+
+    i++;
+
+    //  Once k letters have been seen, the window ending at letter
+    //  i-1 is the mer starting at position i-k.
+    //
+    if (i >= k) {
+      _mers[i-k]  = substring;
+      _skip[i-k]  = timeUntilValid > 0;
+      _mersAvail += 1 - _skip[i-k];
+    }
+  }
+}
+
+encodedQuery::~encodedQuery() { // Release the mer tables and the output buffer.
+ delete [] _mers; // null (a no-op to delete) if the sequence was shorter than the mer size
+ delete [] _skip;
+ delete [] _output; // null unless addOutput() was called
+}
+
+
+
+
+//  Self check.  Recompute the reverse-complement mers of S directly
+//  from the sequence, and compare them against what getMer() and
+//  getSkip() return with isReverse set.  Mismatches are reported on
+//  stderr; a mer or skip mismatch also prints the sequence and exits.
+//
+//  S must be the sequence this query was constructed from.
+//
+void
+encodedQuery::test(seqInCore *S) {
+
+  //  We assume we've been initialized with the forward version!
+
+  uint32       k      = _merSize;
+
+  char const  *seq    = S->sequence();
+  uint32       seqLen = S->sequenceLength();
+
+  //  Compute the complement version; we'll iterate through all data
+  //  in us, comparing against what the original method would say.
+
+  uint32   _r_mersAvail = 0;
+  uint64  *_r_mers      = new uint64 [_mersTotal];
+  uint8   *_r_skip      = new uint8  [_mersTotal];
+
+  uint64   substring      = uint64ZERO;
+  uint64   mermask        = uint64MASK(2 * k);
+  int32    timeUntilValid = k;
+
+  for (uint32 i=0; i<seqLen; ) {
+
+    //  Walk the sequence backwards.  The tables are indexed with an
+    //  unsigned value so a byte above 0x7f is not a negative index.
+    //  (Assumes the tables cover 0..255 -- TODO confirm.)
+    //
+    unsigned char  letter = (unsigned char)seq[seqLen - 1 - i];
+
+    substring <<= 2;
+    substring  &= mermask;
+
+    if (letterToBits[letter] != 0xff) {
+      substring |= letterToBits[ (unsigned char)complementSymbol[letter] ];
+      timeUntilValid--;
+    } else {
+      timeUntilValid = k;
+    }
+
+    i++;
+
+    if (i >= k) {
+      _r_mers[i-k]  = substring;
+      _r_skip[i-k]  = timeUntilValid > 0;
+      _r_mersAvail += 1 - _r_skip[i-k];
+    }
+  }
+
+#if 0
+  //  For comparison, this is the original code used to compute the
+  //  reverse complement mers.
+
+  for (uint32 i=0; i<seqLen; ) {
+    substring <<= 2;
+    substring  &= mermask;
+
+    if (validSymbol[seq[seqLen - 1 - i]]) {
+      substring |= compressSymbol[ complementSymbol[ seq[seqLen - 1 - i] ]];
+      timeUntilValid--;
+    } else {
+      timeUntilValid = k;
+    }
+
+    i++;
+
+    if (i >= k) {
+      _mers[i-k]  = substring;
+      _skip[i-k]  = timeUntilValid > 0;
+      _mersAvail += 1 - _skip[i-k];
+    }
+  }
+#endif
+
+  //  CHECK!
+  //
+  if (_r_mersAvail != _mersAvail) {
+    fprintf(stderr, "encodedQuery::test()-- mersAvail incorrect: Recomputed:"uint32FMT" Real:"uint32FMT"\n", _mersAvail, _r_mersAvail);
+  }
+
+  char   mer1[65];
+  char   mer2[65];
+  bool   fail = false;
+
+  for (uint32 i=0; i<_mersTotal; i++) {
+
+    if (getSkip(i, true) != _r_skip[i]) {
+      fprintf(stderr, "encodedQuery::test()-- skip["uint32FMTW(4)"] incorrect: Acc:%d Real:%d\n", i, getSkip(i, true), _r_skip[i]);
+      fail = true;
+    }
+
+    //  Mers are only compared where they are usable.
+    if (getSkip(i, true) == false) {
+      if (getMer(i, true) != _r_mers[i]) {
+        uint64ToMerString(_merSize, getMer(i, true), mer1);
+        uint64ToMerString(_merSize, _r_mers[i], mer2);
+        fprintf(stderr, "encodedQuery::test()-- mers["uint32FMTW(4)"] incorrect: Acc:"uint64HEX" %s Real:"uint64HEX" %s\n",
+                i,
+                getMer(i, true), mer1,
+                _r_mers[i], mer2);
+        fail = true;
+      }
+    }
+
+    if (fail) {
+      //  Size the scratch copy from the sequence.  This used to be a
+      //  fixed char[2048] filled with strcpy(), which overflowed for
+      //  any sequence of 2048 letters or more.
+      //
+      char *rev = new char [seqLen + 1];
+      memcpy(rev, seq, seqLen);
+      rev[seqLen] = 0;
+
+      fprintf(stderr, "seq='%s'\n", seq);
+      fprintf(stderr, "rev='%s'\n", reverseComplementSequence(rev, seqLen));
+      exit(1);
+    }
+  }
+
+  //fprintf(stderr, "encodedQuery::test()-- %s\n", seq);
+  //fprintf(stderr, "encodedQuery::test()-- tested avail:"uint32FMT" total:"uint32FMT"\n", _mersAvail, _mersTotal);
+
+  delete [] _r_mers;
+  delete [] _r_skip;
+}
+
+
+//  Append one result to the output buffer and count it.
+//
+//    newout -- the bytes to append
+//    size   -- number of bytes in newout; if zero, newout is a
+//              NUL-terminated string and is appended as text (the
+//              buffer is kept NUL-terminated, the terminator is not
+//              counted in the output length)
+//
+//  Exits if the buffer cannot be grown.
+//
+void
+encodedQuery::addOutput(void *newout, uint32 size) {
+
+  //  Allocate space for the output -- 1MB should be enough for about
+  //  29000 signals.  Make it 32K -> 900 signals.
+  //
+  //  You probably do not want to move this into the query
+  //  constructor, as that will just waste a lot of memory with
+  //  thousands of these in the input queue.
+  //
+  if (_output == 0L) {
+    _outputLen = 0;
+    _outputMax = 32 * 1024;
+    _output    = new char [_outputMax];
+  }
+
+  uint32  need = (size > 0) ? size : (uint32)strlen((char *)newout);
+
+  //  Grow until the new data fits, keeping the 128 bytes of headroom
+  //  the buffer always had.  Previously the buffer was doubled once,
+  //  and only when fewer than 128 bytes were free, so a result larger
+  //  than the free space was written past the end.
+  //
+  uint64  required = (uint64)_outputLen + need + 128;
+
+  if (required >= _outputMax) {
+    uint64  grown = _outputMax;
+
+    while (required >= grown)
+      grown <<= 1;
+
+    //  Lengths are kept in 32 bits.
+    if (grown > (uint64)0xffffffff) {
+      fprintf(stderr, "encodedQuery::addOutput()-- output string would exceed the 32-bit length limit.\n");
+      exit(1);
+    }
+
+    _outputMax = (uint32)grown;
+
+    char *o = 0L;
+
+    try {
+      o = new char [_outputMax];
+    } catch (const std::bad_alloc &) {
+      fprintf(stderr, "encodedQuery::addOutput()-- out of memory, tried to extend output string\n");
+      fprintf(stderr, "encodedQuery::addOutput()-- from "uint32FMT" to "uint32FMT" bytes.\n",
+              _outputLen, _outputMax);
+      exit(1);
+    }
+
+    memcpy(o, _output, _outputLen);
+    delete [] _output;
+    _output = o;
+  }
+
+  memcpy(_output + _outputLen, newout, need);
+  _outputLen += need;
+
+  //  Text output stays NUL-terminated.
+  if (size == 0)
+    _output[_outputLen] = 0;
+
+  _numberOfResults++;
+}
diff --git a/seagen/encodedQuery.H b/seagen/encodedQuery.H
new file mode 100644
index 0000000..3ef3935
--- /dev/null
+++ b/seagen/encodedQuery.H
@@ -0,0 +1,120 @@
+#ifndef ENCODEDQUERY_H
+#define ENCODEDQUERY_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "bio++.H"
+#include "seqCache.H"
+
+class encodedQuery { // The k-mers of one query sequence, plus the output collected for it.
+public:
+ encodedQuery(seqInCore *S,
+ uint32 k); // encode the mers of S using mer size k
+ ~encodedQuery();
+
+ void test(seqInCore *S); // self check of the reverse-complement accessors against S
+
+ uint32 IID(void) { return(_iid); };
+
+ uint32 numberOfMers(void) { return(_mersTotal); };
+ uint32 numberOfValidMers(void) { return(_mersAvail); };
+
+ // Return either an approximation or the exact number of bp covered by valid mers.
+ //
+ uint32 bpCovered(bool doCompute); // doCompute=false returns the number of valid mers instead
+ uint32 bpTotal(void); // length of the sequence
+ // For the three below, isReverse=true mirrors index i to (_mersTotal - i - 1).
+ uint64 getMer(uint32 i, bool isReverse); // reverse-complemented when isReverse is set
+ bool getSkip(uint32 i, bool isReverse);
+ void setSkip(uint32 i, bool isReverse); // mark mer i as skipped
+
+ void addOutput(void *output, uint32 size); // append to the output buffer; size 0 means a NUL-terminated string
+
+ char *theOutput(void) { return(_output); }; // null until addOutput() has been called
+ uint32 theOutputLength(void) { return(_outputLen); };
+ uint32 numberOfResults(void) { return(_numberOfResults); }; // number of addOutput() calls
+private:
+ uint32 _iid; // S->getIID() of the source sequence
+ uint32 _sequenceLength;
+
+ uint32 _merSize;
+
+ uint32 _mersTotal; // Number of mers possible in the query
+ uint32 _mersAvail; // Number of mers not masked out
+ uint64 *_mers; // List of mers
+ uint8 *_skip; // 1 if the corresponding mer is to be skipped, else 0
+
+ uint32 _numberOfResults;
+ char *_output; // output buffer, allocated on first use
+ uint32 _outputLen; // bytes used in _output
+ uint32 _outputMax; // bytes allocated for _output
+};
+
+
+//  Number of bp covered by the mers that are not skipped.
+//
+//  With doCompute false this is only an estimate: the count of valid
+//  mers.  With doCompute true, every valid forward mer position is fed
+//  to a merCovering and the sum of its lengths is returned.
+//
+inline
+uint32
+encodedQuery::bpCovered(bool doCompute) {
+
+  if (doCompute == false)
+    return(numberOfValidMers());
+
+  merCovering *covering = new merCovering(_merSize);
+
+  uint32 pos = 0;
+  while (pos < numberOfMers()) {
+    if (getSkip(pos, false) == false)
+      covering->addMer(pos);
+    pos++;
+  }
+
+  uint32 covered = covering->sumOfLengths();
+
+  delete covering;
+
+  return(covered);
+}
+
+inline
+uint32
+encodedQuery::bpTotal(void) { // Total length of the query sequence, in bp.
+ return(_sequenceLength); // recorded by the constructor from S->sequenceLength()
+}
+
+
+// getMer(), getSkip() and setSkip() take an isReverse flag.  When it is set,
+// index i is mirrored to (_mersTotal - i - 1) and getMer() reverse-complements the mer.
+
+//  Return mer i.  With isReverse set, i counts from the other end of
+//  the query and the mer is returned reverse-complemented.
+//
+inline
+uint64
+encodedQuery::getMer(uint32 i, bool isReverse) {
+
+  if (!isReverse)
+    return(_mers[i]);
+
+  uint32 mirrored = _mersTotal - i - 1;
+
+  return(reverseComplementMer(_merSize, _mers[mirrored]));
+}
+
+//  Return true if mer i is to be skipped.  With isReverse set, i
+//  counts from the other end of the query.
+//
+inline
+bool
+encodedQuery::getSkip(uint32 i, bool isReverse) {
+  uint32 idx = isReverse ? (_mersTotal - i - 1) : i;
+
+  return(_skip[idx]);
+}
+
+//  Mark mer i as skipped.  With isReverse set, i counts from the
+//  other end of the query.
+//
+inline
+void
+encodedQuery::setSkip(uint32 i, bool isReverse) {
+  uint32 idx = isReverse ? (_mersTotal - i - 1) : i;
+
+  //  (1 - _skip) is one for a mer that was still available and zero
+  //  for one that was already skipped, so the available count drops
+  //  only the first time a mer is marked.
+  //
+  _mersAvail -= 1 - _skip[idx];
+  _skip[idx]  = 1;
+}
+
+
+#endif // ENCODEDQUERY_H
diff --git a/seagen/filterEST-complicated.C b/seagen/filterEST-complicated.C
new file mode 100644
index 0000000..a872e00
--- /dev/null
+++ b/seagen/filterEST-complicated.C
@@ -0,0 +1,279 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+#include "hitReader.H"
+
+
+extern uint32 uniqThresh;
+extern uint32 reptThresh;
+extern FILE *logFile;
+extern bool decided;
+extern const char *label;
+extern uint32 hitsToSave;
+extern double qualToSave;
+
+double difference = 0.1;
+
+//  Filter 1: the query has no more than uniqThresh hits.
+//
+//  Decides (label "unique") and keeps hits by a coverage threshold
+//  instead of by count: hitsToSave is set to all hits, qualToSave to
+//  the coverage found below.  Leaves 'decided' untouched when there
+//  are more than uniqThresh hits.
+//
+//  The hits are expected to be sorted already (main() calls
+//  sortByCoverage() first) -- presumably best coverage first.
+//
+void
+complicatedFilter_1_unique(hitReader &HR) {
+  uint32  numHits = HR.numHits();
+
+  if (numHits > uniqThresh)
+    return;
+
+  decided    = true;
+  label      = "unique";
+  hitsToSave = numHits;
+  qualToSave = 0.0;
+
+  //  Try being a little more aggressive.  Search for the last
+  //  point where the score difference across 10 hits is more than
+  //  difference and use that for a limit.
+
+
+  //  On the 100k test set #1 (ESTmapper paper, 1 Oct 2004) this modification
+  //  results in:
+  //    tp=106564 fp=1255487 fn=56705 tn=52816595
+  //
+  //  compared to saving all hits:
+  //    tp=106579 fp=1914659 fn=56690 tn=52157423
+  //
+  //  That is, we lost 15 true matches and didn't polish 660,000
+  //  matches -- 1.21% of the total, but 50% of what we actually
+  //  need to polish.
+  //
+  uint32 i = 0;
+
+  //  With no hits at all there is nothing to threshold, and
+  //  numHits - 1 would wrap around to a huge index.
+  //
+  if (numHits > 0) {
+    i = numHits - 1;
+    while ((i >= 10) && ((HR[i-10].coverage - HR[i].coverage) < difference))
+      i--;
+
+    qualToSave = HR[i].coverage;
+
+#if 0
+    //  Take the middle hit, not the end.  This doesn't hurt too much
+    //  (20 matches out of 100,000 ESTs, and we missed one EST
+    //  completely) but only gains us 0.08% additional filtering.
+    if (i >= 15)
+      qualToSave = HR[i-5].coverage;
+#endif
+
+    //  Save all hits with this coverage score!  This isn't really needed, but it
+    //  makes the log message correct.
+    //
+    while ((i < numHits) && (qualToSave == HR[i].coverage))
+      i++;
+  }
+
+  if (logFile)
+    fprintf(logFile, uint32FMT"] unique: aggressively filtered to "uint32FMT" hits out of "uint32FMT" hits.\n",
+            HR.iid(), i, numHits);
+}
+
+
+
+
+
+//  Filter 2: look for a "knee" -- walking back from the last hit, the
+//  first place where coverage changes by at least 'difference' across
+//  a window of 10 hits.  If one exists and no more than uniqThresh
+//  hits are at or above the coverage found there, decide (label
+//  "knee") and keep hits by that coverage threshold.
+//
+//  Note that hitsToSave and qualToSave are modified even when the
+//  filter ends up not deciding.
+//
+void
+complicatedFilter_2_knee(hitReader &HR) {
+
+  decided    = false;
+  hitsToSave = 0;
+  qualToSave = 0.0;
+
+#if 0
+  decided = true;
+  hitsToSave = 0;
+  qualToSave = 1.1;
+  return;
+#endif
+
+  //  Same scan as the aggressive part of filter 1.
+  //
+  uint32 knee = HR.numHits() - 1;
+
+  for (; knee >= 10; knee--)
+    if ((HR[knee-10].coverage - HR[knee].coverage) >= difference)
+      break;
+
+  //  Ran out of hits (knee is 9) without finding a knee; this filter
+  //  cannot decide.
+  //
+  if (knee < 10)
+    return;
+
+  hitsToSave = HR.numHits();
+  qualToSave = HR[knee].coverage;
+
+  //  Step past every hit sharing the threshold coverage, so 'knee'
+  //  ends up as the number of hits that will be saved.
+  //
+  for (; knee < HR.numHits(); knee++)
+    if (qualToSave != HR[knee].coverage)
+      break;
+
+  if (knee > uniqThresh)
+    return;
+
+  decided = true;
+  label   = "knee";
+
+  if (logFile)
+    fprintf(logFile, uint32FMT"] knee: filtered "uint32FMT" hits down to "uint32FMT" hits using threshold %f\n",
+            HR.iid(), HR.numHits(), knee, qualToSave);
+}
+
+
+
+//  Filter 3: the best and worst scores differ by less than
+//  'difference'.  Such a query is either a repeat or a pile of
+//  spurious matches, depending on the level of signal; decide (label
+//  "uniform") and keep the first reptThresh hits.
+//
+void
+complicatedFilter_3_uniform(hitReader &HR) {
+
+  decided    = false;
+  hitsToSave = 0;
+  qualToSave = 0.0;
+
+  bool  isUniform = (HR.bestScore() - HR.worstScore()) < difference;
+
+  if (!isUniform)
+    return;
+
+  decided    = true;
+  label      = "uniform";
+  hitsToSave = reptThresh;
+  qualToSave = 0.0;
+
+  if (logFile == 0L)
+    return;
+
+  fprintf(logFile, uint32FMT"] uniform: uniform signal strength, saving the first "uint32FMT" hits out of "uint32FMT" hits, best=%f, worst=%f\n",
+          HR.iid(), hitsToSave, HR.numHits(), HR.bestScore(), HR.worstScore());
+}
+
+
+
+
+//  Filter 4: reached when the query had too many "good" hits for the
+//  threshold filters.  Find the largest drop in coverage between two
+//  adjacent hits; if even that is below 'difference' there is no
+//  clear place to cut, so decide (label "diff") and keep the first
+//  reptThresh hits.
+//
+void
+complicatedFilter_4_largestdifference(hitReader &HR) {
+
+  decided    = false;
+  hitsToSave = 0;
+  qualToSave = 0.0;
+
+  double  largestDifference = 0.0;
+
+  uint32  next = 1;
+  while (next < HR.numHits()) {
+    if (largestDifference < (HR[next-1].coverage - HR[next].coverage))
+      largestDifference = HR[next-1].coverage - HR[next].coverage;
+    next++;
+  }
+
+  //  A largest drop of at least 'difference' (10% coverage by
+  //  default) is left for the next filter.
+  //
+  if (!(largestDifference < difference))
+    return;
+
+  decided    = true;
+  label      = "diff";
+  hitsToSave = reptThresh;
+  qualToSave = 0.0;
+
+  if (logFile)
+    fprintf(logFile, uint32FMT"] diff: has no clear signal knee, saving the first "uint32FMT" hits out of "uint32FMT" hits, best=%f, worst=%f, largestdiff=%f\n",
+            HR.iid(), hitsToSave, HR.numHits(), HR.bestScore(), HR.worstScore(), largestDifference);
+}
+
+
+
+//  Filter 5: look for a spike -- a drop in coverage of more than
+//  'difference' between two adjacent hits -- within the first
+//  uniqThresh hits.  If there is one, decide (label "spike") and keep
+//  at most uniqThresh hits.
+//
+//  To narrow the range, search backwards from hit uniqThresh-1 for
+//  the first window of 10 hits across which coverage drops by more
+//  than 'difference', and cut just after it.  This is a generous
+//  heuristic: consider a spike between hits 3 and 4, but 1=2=3 and
+//  4=5=6=7=8=9=10=11.  We find the spike, then look for a nice place
+//  to cut; if there is none, the top uniqThresh hits are saved.
+//
+//  Reads hits up to index uniqThresh, so it expects more than
+//  uniqThresh hits (true when reached through complicatedFilter()).
+//
+void
+complicatedFilter_5_spikes(hitReader &HR) {
+
+  decided    = false;
+  hitsToSave = 0;
+  qualToSave = 0.0;
+
+  //  Remember the last adjacent pair, in the first uniqThresh hits,
+  //  with a big enough drop.
+  //
+  uint32  spikeFound = 0;
+
+  uint32  h = 1;
+  while (h < uniqThresh) {
+    if ((HR[h-1].coverage - HR[h].coverage) > difference)
+      spikeFound = h;
+    h++;
+  }
+
+  if (spikeFound == 0)
+    return;
+
+  decided    = true;
+  label      = "spike";
+  hitsToSave = uniqThresh;
+  qualToSave = 0.0;
+
+  uint32  w = uniqThresh - 1;
+  while (w > 9) {
+    if ((HR[w-10].coverage - HR[w].coverage) > difference) {
+      hitsToSave = w + 1;
+      break;
+    }
+    w--;
+  }
+
+  //  The threshold is the coverage of the first hit that is not saved.
+  qualToSave = HR[hitsToSave].coverage;
+
+  if (logFile)
+    fprintf(logFile, uint32FMT"] spike: at "uint32FMT", "uint32FMT" hits saved: thresh=%f, "uint32FMT" hits, best=%f, worst=%f\n",
+            HR.iid(), spikeFound, hitsToSave, qualToSave, HR.numHits(), HR.bestScore(), HR.worstScore());
+}
+
+
+
+
+//  Run the five filters in order and stop at the first one that
+//  decides.  If none does, the signal is unclassified (label
+//  "unknown") and the first reptThresh hits -- or all of them, if
+//  there are fewer -- are saved.
+//
+//  The result is left in the globals decided, label, hitsToSave and
+//  qualToSave.
+//
+void
+complicatedFilter(hitReader &HR) {
+  typedef void (*filterStage)(hitReader &);
+
+  static const filterStage stages[5] = {
+    complicatedFilter_1_unique,
+    complicatedFilter_2_knee,
+    complicatedFilter_3_uniform,
+    complicatedFilter_4_largestdifference,
+    complicatedFilter_5_spikes
+  };
+
+  decided    = false;
+  label      = "NOLABELERROR";
+  qualToSave = 1.0;
+  hitsToSave = 0;
+
+  for (uint32 s=0; s < 5; s++) {
+    stages[s](HR);
+
+    if (decided)
+      return;
+  }
+
+  //  Nobody wanted it.
+  //
+  decided    = true;
+  label      = "unknown";
+  qualToSave = 0.0;
+  hitsToSave = (reptThresh > HR.numHits()) ? HR.numHits() : reptThresh;
+
+  if (logFile)
+    fprintf(logFile, uint32FMT"] is an unclassified signal, "uint32FMT" hits saved out of "uint32FMT" hits, best=%f, worst=%f\n",
+            HR.iid(), hitsToSave, HR.numHits(), HR.bestScore(), HR.worstScore());
+}
diff --git a/seagen/filterEST.C b/seagen/filterEST.C
new file mode 100644
index 0000000..25014f7
--- /dev/null
+++ b/seagen/filterEST.C
@@ -0,0 +1,275 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+#include "hitReader.H"
+
+// XXX: Thread the filter! Really cool! Pretty neat hack! Usual
+// thing, a thread to read hits, n threads to do filtering, and a
+// thread to write filtered hits. Not trivial, but maybe a win.
+
+// Global statistics
+//
+uint32 hitsSaved = 0;
+uint32 hitsFiltered = 0;
+uint32 hitsUnknown = 0;
+uint32 hitsTotal = 0;
+
+uint32 seqsMapped = 0; // Sequences that we mapped
+uint32 seqsPartial = 0; // Sequences that we mapped, but missed a few good matches
+uint32 seqsMissed = 0; // Sequences that we failed to map, but should have
+
+uint32 filterTP = 0;
+uint32 filterFP = 0;
+uint32 filterFNfilt = 0; // false negatives from filtering
+uint32 filterFNunk = 0; // false negatives from our failure to classify
+uint32 filterTN = 0;
+
+uint32 goodPercentID = 94;
+uint32 goodCoverage = 50;
+
+// Command line options
+//
+uint32 uniqThresh = 200; // Used to be 100
+uint32 reptThresh = 200; // Used to be 100
+
+FILE *logFile = 0L;
+
+// Filter results -- thread unsafe!
+//
+// bool decided -- true if the filter could decide on how to filter the hits
+// char label -- if decided, the name of the decider
+// uint32 hitsToSave -- the number of hits to save
+// double qualToSave -- the quality threshold to filter at
+//
+bool decided;
+const char *label;
+uint32 hitsToSave;
+double qualToSave;
+
+
+
+
+//  Write a one line progress report to stderr.  The line ends with a
+//  carriage return, not a newline, so successive reports overwrite
+//  each other.
+//
+//    iid -- id of the query most recently processed
+//
+//  When built WITH_ANSWERS the running true/false positive/negative
+//  and sequence counts are passed in and printed too; those
+//  parameters have the same names as (and so hide) the globals.
+//  hitsSaved and hitsTotal are always read from the globals.
+//
+void
+report(uint32 iid
+#ifdef WITH_ANSWERS
+       ,
+       uint32 filterTP,
+       uint32 filterFP,
+       uint32 filterFNfilt,
+       uint32 filterFNunk,
+       uint32 filterTN,
+       uint32 seqsMapped,
+       uint32 seqsPartial,
+       uint32 seqsMissed
+#endif
+       ) {
+
+  //  Before any hits have been read hitsTotal is zero; report 0%
+  //  instead of printing the NaN that 0/0 gives.
+  //
+  double percentSaved = (hitsTotal > 0) ? (100.0 * hitsSaved / hitsTotal) : 0.0;
+
+  fprintf(stderr,
+          uint32FMTW(9)"]"
+#ifdef WITH_ANSWERS
+          " tp="uint32FMTW(7)" fp="uint32FMTW(7)" fnfilt="uint32FMTW(7)" fnunkn="uint32FMTW(7)" tn="uint32FMTW(7)
+          " yea:"uint32FMTW(7)" may:"uint32FMTW(7)" nay:"uint32FMTW(7)
+#endif
+          " hits saved:"uint32FMTW(8)"/"uint32FMTW(8)" = %6.3f%%\r",
+          iid,
+#ifdef WITH_ANSWERS
+          filterTP, filterFP, filterFNfilt, filterFNunk, filterTN,
+          seqsMapped, seqsPartial, seqsMissed,
+#endif
+          hitsSaved, hitsTotal,
+          percentSaved);
+}
+
+
+
+void
+complicatedFilter(hitReader &HR);
+
+
+
+//  The simple filter: always decides (label "simple"), and keeps the
+//  first uniqThresh hits -- or all of them, if there are fewer --
+//  with no coverage threshold.
+//
+void
+simpleFilter(hitReader &HR) {
+  uint32  available = HR.numHits();
+
+  decided    = true;
+  label      = "simple";
+  qualToSave = 0.0;
+  hitsToSave = (available > uniqThresh) ? uniqThresh : available;
+}
+
+
+
+
+//  filterEST: read hits from the files named on the command line,
+//  filter the hits of each query with complicatedFilter(), and write
+//  the hits that survive to stdout.
+//
+//    -uniquethreshold n   sets uniqThresh
+//    -repeatthreshold n   sets reptThresh
+//    -log file            write filter decisions to 'file'
+//
+//  Options are recognized by their first two characters; anything
+//  else is taken as an input file.  Progress goes to stderr.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc == 1) {
+    fprintf(stderr, "ESTmapper utility function -- not for human use.\n");
+    exit(1);
+  }
+
+  hitReader    HR(argc);
+
+  int arg = 1;
+  while (arg < argc) {
+    bool  isUniq = (strncmp(argv[arg], "-uniquethreshold", 2) == 0);
+    bool  isRept = (strncmp(argv[arg], "-repeatthreshold", 2) == 0);
+    bool  isLog  = (strncmp(argv[arg], "-log", 2) == 0);
+
+    //  All three options take a value; make sure it is there instead
+    //  of reading the null pointer past the end of argv.
+    //
+    if ((isUniq || isRept || isLog) && (arg + 1 >= argc)) {
+      fprintf(stderr, "filterEST: ERROR: option '%s' requires a value.\n", argv[arg]);
+      exit(1);
+    }
+
+    if        (isUniq) {
+      uniqThresh = atoi(argv[++arg]);
+    } else if (isRept) {
+      reptThresh = atoi(argv[++arg]);
+    } else if (isLog) {
+      ++arg;
+
+      //  fopen() reports failure with a null return; errno is only
+      //  meaningful after that.
+      //
+      logFile = fopen(argv[arg], "w");
+      if (logFile == 0L) {
+        fprintf(stderr, "filterEST: ERROR: couldn't open logFile '%s' for writing.\n%s\n", argv[arg], strerror(errno));
+        exit(1);
+      }
+    } else {
+      HR.addInputFile(argv[arg]);
+    }
+
+    arg++;
+  }
+
+  while (HR.loadHits()) {
+
+    //  Not every filter we can think of needs the hits sorted, but
+    //  it's nice to guarantee they are sorted.
+    //
+    HR.sortByCoverage();
+
+
+    //simpleFilter(HR);
+    complicatedFilter(HR);
+
+
+#ifdef WITH_ANSWERS
+    int tp=0, tn=0, fn=0, fp=0;
+#endif
+
+    //  If we still haven't figured out what to do, then the EST is
+    //  labeled a repeat.  Otherwise, write the (filtered) hits to the
+    //  file.
+    //
+    if (!decided) {
+      hitsUnknown += HR.numHits();
+
+#ifdef WITH_ANSWERS
+      //  We've failed to classify all these hits, so anything that looks good is a false negative
+      //
+      for (uint32 i=0; i < HR.numHits(); i++)
+        if ((HR[i].mappedCoverage >= goodCoverage) && (HR[i].mappedIdentity >= goodPercentID)) {
+          if (logFile) {
+            fprintf(logFile, "FAILUNKN hit="uint32FMTW(3)" id=%2d cv=%2d COV=%5.3f MUL=%5.3f: ", i, HR[i].mappedIdentity, HR[i].mappedCoverage, HR[i].coverage, HR[i].multiplicity);
+            ahit_printASCII(&HR[i].a, logFile);
+          }
+          filterFNunk++;
+          fn++;
+        } else {
+          tn++;
+        }
+#endif
+
+    } else {
+      for (uint32 i=0; i < HR.numHits(); i++) {
+
+        //  A hit is saved if it is among the first hitsToSave hits
+        //  AND its coverage is at least qualToSave.
+        //
+        if ((i < hitsToSave) && (qualToSave <= HR[i].coverage)) {
+          hitsSaved++;
+          ahit_printASCII(&HR[i].a, stdout);
+
+#ifdef WITH_ANSWERS
+          if ((HR[i].mappedCoverage >= goodCoverage) && (HR[i].mappedIdentity >= goodPercentID)) {
+            tp++;
+          } else {
+            fp++;
+          }
+#endif
+        } else if (HR[i].a._merged) {
+          //  We merged this hit, so scores are incorrect.  Give it
+          //  the benefit of the doubt and report it.
+          //
+          //  NOTE(review): these hits are not counted in tp/fp when
+          //  built WITH_ANSWERS -- confirm that is intended.
+          //
+          hitsSaved++;
+          ahit_printASCII(&HR[i].a, stdout);
+        } else {
+          hitsFiltered++;
+#ifdef WITH_ANSWERS
+          //  Report hits that are false negatives
+          //
+          if ((HR[i].mappedCoverage >= goodCoverage) && (HR[i].mappedIdentity >= goodPercentID)) {
+            if (logFile) {
+              fprintf(logFile, "FAILFILT hit="uint32FMTW(3)" id=%2d cv=%2d COV=%5.3f MUL=%5.3f: ", i, HR[i].mappedIdentity, HR[i].mappedCoverage, HR[i].coverage, HR[i].multiplicity);
+              ahit_printASCII(&HR[i].a, logFile);
+            }
+            filterFNfilt++;
+            fn++;
+          } else {
+            tn++;
+          }
+#endif
+        }
+      }
+    }
+
+#ifdef WITH_ANSWERS
+    filterTP += tp;
+    filterTN += tn;
+    filterFP += fp;
+
+    if (tp > 0)
+      seqsMapped++;
+
+    if (fn > 0)
+      seqsPartial++;
+
+    if ((tp == 0) && (fn > 0))
+      seqsMissed++;
+#endif
+
+    hitsTotal += HR.numHits();
+
+#ifdef WITH_ANSWERS
+    //  Report if we saw falsenegatives (we should have printed FAIL into the log, too)
+    //
+    if (fn > 0)
+      if (logFile)
+        fprintf(logFile, uint32FMT"] %sFALSENEGATIVE %10.10s tp="uint32FMTW(7)" fp="uint32FMTW(7)" fn="uint32FMTW(7)" tn="uint32FMTW(7)"\n",
+                HR.iid(),
+                (tp > 0) ? "partial" : "fatal",
+                label,
+                tp, fp, fn, tn);
+#endif
+
+
+    //  Progress, every 500th query id.
+    //
+    if ((HR.iid() % 500) == 0) {
+      report(HR.iid()
+#ifdef WITH_ANSWERS
+             ,
+             filterTP, filterFP, filterFNfilt, filterFNunk, filterTN,
+             seqsMapped, seqsPartial, seqsMissed
+#endif
+             );
+      fflush(stderr);
+    }
+  }
+
+  if (logFile)
+    fclose(logFile);
+
+
+  //  The final tally.
+  //
+  report(HR.iid()
+#ifdef WITH_ANSWERS
+         ,
+         filterTP, filterFP, filterFNfilt, filterFNunk, filterTN,
+         seqsMapped, seqsPartial, seqsMissed
+#endif
+         );
+  fprintf(stderr, "\n");
+
+  return(0);
+}
diff --git a/seagen/filterESTsimple.C b/seagen/filterESTsimple.C
new file mode 100644
index 0000000..2eefe13
--- /dev/null
+++ b/seagen/filterESTsimple.C
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+
+// A very simple filter.
+//
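+// Output the first 50 hits after sorting by coverage.  The variant that
+// would instead output all hits above 0.2, whichever set is _smaller_,
+// is currently compiled out (#if 0) in main().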
+
+#include "hitReader.H"
+
+#define UNIQ_THRESH 50
+#define QUAL_THRESH 0.2
+
+// Entry point.  Every command line word is handed to the hitReader as an
+// input file.  For each batch of hits the reader loads, the hits are
+// sorted by coverage and the first UNIQ_THRESH of them (or all of them,
+// if there are fewer) are printed to stdout.
+//
+// Running with no arguments prints a notice and exits with status 1.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc == 1) {
+    fprintf(stderr, "ESTmapper utility function -- not for human use.\n");
+    exit(1);
+  }
+
+  hitReader HR(argc);
+
+  // There are no options; each argument names a file of hits.
+  //
+  for (int fileArg = 1; fileArg < argc; fileArg++)
+    HR.addInputFile(argv[fileArg]);
+
+  while (HR.loadHits()) {
+
+    HR.sortByCoverage();
+
+    // Never print more hits than were loaded.
+    //
+    uint32 limit = HR.numHits();
+
+    if (limit > UNIQ_THRESH)
+      limit = UNIQ_THRESH;
+
+    for (uint32 h = 0; h < limit; h++)
+      ahit_printASCII(&HR[h].a, stdout);
+
+    // Disabled: the QUAL_THRESH based selection.
+    //
+#if 0
+    uint32 count = 0;
+
+    for (uint32 i=0; i < HR.numHits(); i++)
+      if (QUAL_THRESH <= HR[i].coverage)
+        count++;
+
+    if ((count > 0) && (count < UNIQ_THRESH)) {
+      // Output all hits above QUAL_THRESH
+      for (uint32 i=0; i < HR.numHits(); i++)
+        if (QUAL_THRESH <= HR[i].coverage)
+          ahit_printASCII(&HR[i].a, stdout);
+    } else {
+      // Output top 'UNIQ_THRESH' hits
+      for (uint32 i=0; i < UNIQ_THRESH; i++)
+        ahit_printASCII(&HR[i].a, stdout);
+    }
+#endif
+
+  }
+
+  return(0);
+}
diff --git a/seagen/filterMRNA.C b/seagen/filterMRNA.C
new file mode 100644
index 0000000..7fcdaf7
--- /dev/null
+++ b/seagen/filterMRNA.C
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+#include "hitReader.H"
+
+
+// Return the value that follows the option at argv[arg], advancing arg
+// onto it.  Exits with status 1 when the option is the last word on the
+// command line, instead of handing a null pointer to atof() / atoi().
+//
+static
+char *
+optionValue(int &arg, int argc, char **argv) {
+  if (arg + 1 >= argc) {
+    fprintf(stderr, "Option '%s' needs a value.\n", argv[arg]);
+    exit(1);
+  }
+
+  return(argv[++arg]);
+}
+
+
+// Entry point.  Reads hits through a hitReader and, for every batch the
+// reader loads, prints to stdout the hits that pass a coverage cutoff
+// derived from the spread of coverage values in that batch.
+//
+// Options (anything else is taken as an input file):
+//   -v...     verbose; any word starting with "-v" turns it on
+//   -l  x     L, spread at or below which p is 1.0
+//   -h  x     H, spread at or above which p is V
+//   -m  x     M, upper limit on the computed cutoff
+//   -mc x     MC, minimum coverage
+//   -ml n     ML, minimum number of covered bases
+//
+// V has no switch of its own: "-v" is consumed by the verbose test, so V
+// always keeps its default.  (The original source had a second "-v"
+// branch meant to set V, but it could never be reached.)
+//
+int
+main(int argc, char **argv) {
+
+  if (argc < 2)
+    fprintf(stderr, "ESTmapper utility function -- not for human use.\n"), exit(1);
+
+  hitReader  HR(argc);
+  double     L         = 0.2;
+  double     H         = 0.6;
+  double     V         = 0.7;
+  double     M         = 0.3;
+  double     MC        = 0.2;
+  uint32     ML        = 150;
+  bool       beVerbose = false;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-v", 2) == 0) {
+      beVerbose = true;
+    } else if (strcmp(argv[arg], "-l") == 0) {
+      L  = atof(optionValue(arg, argc, argv));
+    } else if (strcmp(argv[arg], "-h") == 0) {
+      H  = atof(optionValue(arg, argc, argv));
+    } else if (strcmp(argv[arg], "-m") == 0) {
+      M  = atof(optionValue(arg, argc, argv));
+    } else if (strcmp(argv[arg], "-mc") == 0) {
+      MC = atof(optionValue(arg, argc, argv));
+    } else if (strcmp(argv[arg], "-ml") == 0) {
+      ML = atoi(optionValue(arg, argc, argv));
+    } else {
+      HR.addInputFile(argv[arg]);
+    }
+
+    arg++;
+  }
+
+  if (beVerbose) {
+    fprintf(stderr, "Filtering with:\n");
+    fprintf(stderr, " score difference of %4.2f or less -> 100.0%% of best score\n", L);
+    fprintf(stderr, " score difference of %4.2f or more -> %5.1f%% of best score\n", H, 100*V);
+    fprintf(stderr, " scores at least %4.2f are always output\n", M);
+    fprintf(stderr, " scores at least %4.2f AND at least "uint32FMT" bases covered are always output\n", MC, ML);
+  }
+
+  while (HR.loadHits()) {
+
+    // Nothing to rank, and HR[0] below would not exist.
+    //
+    if (HR.numHits() == 0)
+      continue;
+
+    HR.sortByCoverage();
+
+    // The spread h runs from the coverage of the first hit down to the
+    // lowest coverage seen on a hit that is not merged.
+    //
+    double hiCov = HR[0].coverage;
+    double loCov = HR[0].coverage;
+    for (uint32 i=0; i < HR.numHits(); i++)
+      if ((HR[i].a._merged == false) && (loCov > HR[i].coverage))
+        loCov = HR[i].coverage;
+
+    double h = hiCov - loCov;
+    double p = 0.0;
+
+    // p is 1.0 for small spreads, V for large spreads, and linearly
+    // interpolated in between.
+    //
+    if (h <= L)    p = 1.0;
+    if (h >= H)    p = V;
+    if (p == 0.0)  p = 1.0 - (1.0 - V) * (h - L) / (H - L);
+
+    // check p; it should be between V and 1.0
+    if ((p > 1.0) || (p < V))
+      fprintf(stderr, "error in p; p=%f\n", p);
+
+    // Output the top p% hits, by score.  The cutoff is never allowed
+    // above M.
+    //
+    double cutL = HR[0].coverage - p * h;
+    if (cutL > M)
+      cutL = M;
+
+    // Save the hit if it has good coverage and it's either above
+    // the minimum coverage or long.  Also blindly save merged
+    // hits.
+    //
+    for (uint32 i=0; i < HR.numHits(); i++) {
+      bool  aboveCut   = (cutL <= HR[i].coverage);
+      bool  bigEnough  = (MC <= HR[i].coverage) || (ML <= HR[i].a._covered);
+
+      if ((aboveCut && bigEnough) || (HR[i].a._merged))
+        ahit_printASCII(&HR[i].a, stdout);
+    }
+  }
+
+  return(0);
+}
diff --git a/seagen/filterNULL.C b/seagen/filterNULL.C
new file mode 100644
index 0000000..d399a7f
--- /dev/null
+++ b/seagen/filterNULL.C
@@ -0,0 +1,24 @@
+#include "aHit.H"
+#include "hitReader.H"
+
+// A NULL filter. What comes in, comes out. Seems useless, but the
+// hitReader merges overlapping hits which would otherwise screw up
+// your mapping.
+
+// Entry point.  Every argument is registered with the hitReader as an
+// input file; each hit the reader then returns is printed to stdout.
+// With no arguments, prints a notice and exits with status 1.
+//
+int
+main(int argc, char **argv) {
+  hitReader HR(argc);
+
+  if (argc < 2) {
+    fprintf(stderr, "ESTmapper utility function -- not for human use.\n");
+    exit(1);
+  }
+
+  for (int fileArg = 1; fileArg < argc; fileArg++)
+    HR.addInputFile(argv[fileArg]);
+
+  while (HR.loadHits()) {
+    for (uint32 h = 0; h < HR.numHits(); h++)
+      ahit_printASCII(&HR[h].a, stdout);
+  }
+
+  return(0);
+}
diff --git a/seagen/filtertest.C b/seagen/filtertest.C
new file mode 100644
index 0000000..31c1c0f
--- /dev/null
+++ b/seagen/filtertest.C
@@ -0,0 +1,325 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio.h"
+
+#define MAX_ESTS (16 * 1024 * 1024)
+#define MAX_HITS (18474961) // for 20-03-5000-0.4
+
+//#define SHOW_ONE
+
+////////////////////////////////////////
+
+// One hit record.  ahit_writeBinary() and ahit_readBinary() move the whole
+// struct with fwrite() / fread(), so the member order and types below are
+// the binary format.
+//
+struct aHit {
+ uint32 _forward; // nonzero prints as "-f", zero as "-r"
+ uint32 _qsIdx; // value after "-e"; main() groups hits by it (the EST id)
+ uint32 _dsIdx; // first value after "-D"
+ uint32 _dsLo; // second value after "-D"
+ uint32 _dsHi; // third value after "-D"
+ uint32 _covered; // first value after "-M"; counted in bases
+ uint32 _matched; // second value after "-M"
+ uint32 _numMers; // third value after "-M"; counted in mers
+ uint32 _yesno; // nonzero prints as "-Y", zero as "-N"
+ uint32 _identity; // first value after "-Y"; 0 when not parsed
+ uint32 _coverage; // second value after "-Y"; 0 when not parsed
+
+ // Scores used by the filter; not part of the ASCII form.  main() sets
+ // them after parsing a line.
+ //
+ float scoreCov; // _covered / _numMers, capped at 1.0
+ float scoreMult; // _matched / _covered
+};
+
+
+// Write the raw bytes of one hit to F.  The result of fwrite() is not
+// examined.
+//
+void
+ahit_writeBinary(aHit *a, FILE *F) {
+  fwrite(a, sizeof(*a), 1, F);
+}
+
+// Read the raw bytes of one hit from F into *a.  The result of fread()
+// is not examined, so callers must look at the stream state to learn
+// whether a full record arrived.
+//
+void
+ahit_readBinary(aHit *a, FILE *F) {
+  fread(a, sizeof(*a), 1, F);
+}
+
+// Print one hit to F as a single text line:
+//   -f|-r -e qsIdx -D dsIdx dsLo dsHi -M covered matched numMers -Y|-N identity coverage
+//
+// The conversion specifiers depend on TRUE64BIT; presumably that tracks
+// how uint32 is declared on the platform -- confirm against bio.h.
+//
+void
+ahit_printASCII(aHit *a, FILE *F) {
+  const char   strand = a->_forward ? 'f' : 'r';
+  const char  *answer = a->_yesno   ? "-Y" : "-N";
+
+#ifdef TRUE64BIT
+  fprintf(F, "-%c -e %u -D %u %u %u -M %u %u %u %s %u %u\n",
+          strand,
+          a->_qsIdx,
+          a->_dsIdx, a->_dsLo, a->_dsHi,
+          a->_covered, a->_matched, a->_numMers,
+          answer,
+          a->_identity, a->_coverage);
+#else
+  fprintf(F, "-%c -e %lu -D %lu %lu %lu -M %lu %lu %lu %s %lu %lu\n",
+          strand,
+          a->_qsIdx,
+          a->_dsIdx, a->_dsLo, a->_dsHi,
+          a->_covered, a->_matched, a->_numMers,
+          answer,
+          a->_identity, a->_coverage);
+#endif
+}
+
+
+// Fill *a from one text line b, in the layout ahit_printASCII() writes:
+//   -f|-r -e qsIdx -D dsIdx dsLo dsHi [-M covered matched numMers] [-Y identity coverage]
+//
+// The parser walks a cursor c through b using fixed offsets: whenever c
+// sits on the blank before an option, c[2] is the option letter and
+// c += 4 lands on the first value.  strtoul() then advances c past each
+// number.  A missing -e or -D is reported on stderr but parsing goes on.
+// NOTE(review): nothing checks that b is long enough for these offsets;
+// the code assumes a well-formed line.
+//
+void ahit_parseString(aHit *a, char *b) {
+ char *c = b+1;
+
+ // b[1] is the strand letter; anything other than 'f' counts as reverse.
+ a->_forward = (*c == 'f');
+ c += 1;
+
+ if (c[2] != 'e') fprintf(stderr, "'%s' didn't get -e\n", b);
+
+ c += 4;
+ a->_qsIdx = (uint32)strtoul(c, &c, 10);
+
+ if (c[2] != 'D') fprintf(stderr, "'%s' didn't get -D\n", b);
+
+ c += 4;
+ a->_dsIdx = (uint32)strtoul(c, &c, 10);
+ a->_dsLo = (uint32)strtoul(c, &c, 10);
+ a->_dsHi = (uint32)strtoul(c, &c, 10);
+
+ // The -M group is optional; without it the three counts are zero.
+ if (c[2] == 'M') {
+ c += 4;
+ a->_covered = (uint32)strtoul(c, &c, 10);
+ a->_matched = (uint32)strtoul(c, &c, 10);
+ a->_numMers = (uint32)strtoul(c, &c, 10);
+ } else {
+ //fprintf(stderr, "'%s' didn't get -M\n", b);
+ a->_covered = 0;
+ a->_matched = 0;
+ a->_numMers = 0;
+ }
+
+ // Defaults for lines without a -Y group.
+ a->_yesno = 0;
+ a->_identity = 0;
+ a->_coverage = 0;
+
+ if (c[2] == 'Y') {
+ c += 4;
+ a->_yesno = 1;
+ a->_identity = (uint32)strtoul(c, &c, 10);
+ a->_coverage = (uint32)strtoul(c, &c, 10);
+ }
+
+ // Disabled: with this off, the two values after -N are not read and
+ // _identity / _coverage stay zero for -N lines.
+#if 0
+ if (c[2] == 'N') {
+ c += 4;
+ a->_yesno = 0;
+ a->_identity = (uint32)strtoul(c, &c, 10);
+ a->_coverage = (uint32)strtoul(c, &c, 10);
+ }
+#endif
+}
+
+////////////////////////////////////////
+
+// qsort() comparator on aHit: larger scoreCov sorts first.
+//
+int
+hitCompare(const void *a, const void *b) {
+  const float  scoreA = ((const aHit *)a)->scoreCov;
+  const float  scoreB = ((const aHit *)b)->scoreCov;
+
+  if (scoreA > scoreB)
+    return(-1);
+  if (scoreA < scoreB)
+    return(1);
+  return(0);
+}
+
+// qsort() comparator on aHit: smaller _qsIdx sorts first.
+//
+int
+hitCompareID(const void *a, const void *b) {
+  const uint32  idA = ((const aHit *)a)->_qsIdx;
+  const uint32  idB = ((const aHit *)b)->_qsIdx;
+
+  return((idA > idB) - (idA < idB));
+}
+
+
+
+// Parameter sweep for the L / H / V hit filter.
+//
+// Reads ASCII hits from stdin, groups them by _qsIdx, orders each group by
+// decreasing scoreCov, and then, for a grid of (L, H, V) values, counts how
+// many hits the filter would keep or drop.  A hit counts as "good" when it
+// is marked -Y and its identity and coverage reach minIdentity and
+// minCoverage.  One line per (L, H, V) is written to stdout:
+//   L H V  tp/(tp+fn)  tn/(tn+fp)  tp fp tn fn
+//
+// Exits with status 1 if stdin holds more than MAX_HITS lines.
+//
+int
+main(int argc, char **argv) {
+  aHit    *hits    = new aHit [MAX_HITS];
+  uint32   hitsLen = 0;
+
+
+  // read all the hits from stdin -- assumes ascii format
+  //
+  // The loop is driven by the result of fgets(): that way a final line
+  // without a newline is still used, and a read error ends the loop
+  // instead of re-parsing the previous line forever.
+  //
+  char hitLine[1025];
+
+  while (fgets(hitLine, 1024, stdin) != NULL) {
+
+    // The array is fixed size; stop instead of writing past its end.
+    //
+    if (hitsLen >= MAX_HITS) {
+      fprintf(stderr, "\ntoo many hits; MAX_HITS is %d.\n", MAX_HITS);
+      exit(1);
+    }
+
+    ahit_parseString(hits + hitsLen, hitLine);
+
+    // These are the scores used by the filter
+    //
+    hits[hitsLen].scoreCov  = (float)hits[hitsLen]._covered / (float)hits[hitsLen]._numMers;
+    hits[hitsLen].scoreMult = (float)hits[hitsLen]._matched / (float)hits[hitsLen]._covered;
+
+    // aHit->_covered is in bases, but aHit->_numMers is the
+    // number of mers.  Possible for coverage to be > 1.0.
+    //
+    if (hits[hitsLen].scoreCov > 1.0)
+      hits[hitsLen].scoreCov = 1.0;
+
+    hitsLen++;
+
+    if ((hitsLen & 0xff) == 0) {
+      fprintf(stderr, "reading hits "uint32FMT"\r", hitsLen);
+      fflush(stderr);
+    }
+  }
+
+  fprintf(stderr, "reading hits "uint32FMT"\n", hitsLen);
+
+
+  // Sort the hits by estid
+  //
+  fprintf(stderr, "sorting hits by cDNA\n");
+  qsort(hits, hitsLen, sizeof(aHit), hitCompareID);
+
+
+  // Sort the hits by score (scoreCov), in decreasing order, one
+  // run of equal _qsIdx at a time.
+  //
+  fprintf(stderr, "sorting hits by score\n");
+  for (uint32 currentHit = 0; currentHit < hitsLen; ) {
+    uint32  estOfInterest = hits[currentHit]._qsIdx;
+    uint32  numHits       = 0;
+    for (uint32 t=currentHit; (t < hitsLen) && (hits[t]._qsIdx == estOfInterest); t++)
+      numHits++;
+
+    qsort(hits + currentHit, numHits, sizeof(aHit), hitCompare);
+
+    currentHit += numHits;
+  }
+
+  fprintf(stderr, "filtering hits\n");
+
+  double  L  = 0.0;
+  double  H  = 0.0;
+  double  V  = 0.1;
+  double  M  = 1.0;
+  double  MC = 0.0;
+  uint32  ML = 0;
+
+  double  minIdentity = 98.0;
+  double  minCoverage = 96.0;
+
+  // The loop bounds keep L < H, so (H - L) below is never zero.
+  //
+  for (uint32 Hcnt = 10; Hcnt <= 100; Hcnt += 10) {
+    for (uint32 Lcnt = 10; Lcnt < Hcnt && Lcnt < 60; Lcnt += 10) {
+      for (uint32 Vcnt = 10; Vcnt < 100; Vcnt += 10) {
+#ifdef SHOW_ONE
+        Lcnt = 30;
+        Hcnt = 40;
+        Vcnt = 100;
+#endif
+        L = Lcnt / 100.0;
+        H = Hcnt / 100.0;
+        V = Vcnt / 100.0;
+
+        uint32 truepositive  = 0;
+        uint32 falsepositive = 0;
+        uint32 truenegative  = 0;
+        uint32 falsenegative = 0;
+
+        for (uint32 currentHit = 0; currentHit < hitsLen; ) {
+
+          // Find the number of hits for this ESTid
+          //
+          uint32 estOfInterest = hits[currentHit]._qsIdx;
+          uint32 numHits       = 0;
+          for (uint32 t=currentHit; (t < hitsLen) && (hits[t]._qsIdx == estOfInterest); t++)
+            numHits++;
+
+          // h is the spread between the best and worst score in the
+          // group; p is 1.0 for small spreads, V for large ones, and
+          // interpolated in between.
+          //
+          double h = hits[currentHit].scoreCov - hits[currentHit + numHits - 1].scoreCov;
+          double p = 0.0;
+
+          if (h <= L)    p = 1.0;
+          if (h >= H)    p = V;
+          if (p == 0.0)  p = 1.0 - (1.0 - V) * (h - L) / (H - L);
+
+          // check p; it should be between V and 1.0
+          if (p > 1.0) {
+            fprintf(stderr, "error in p; p=%f h=%f (%f %f %f)\n", p, h, L, H, V);
+            p = 1.0;
+          }
+
+          if (p < V) {
+            fprintf(stderr, "error in p; p=%f h=%f (%f %f %f)\n", p, h, L, H, V);
+            p = V;
+          }
+
+          // Output the top p% hits, by score.
+          //
+          double cutL = hits[currentHit].scoreCov - p * h;
+
+          if (cutL > M)
+            cutL = M;
+
+#ifdef SHOW_ONE
+          fprintf(stdout, "LHV = %f %f %f p=%f h=%f cutL=%f\n", L, H, V, p, h, cutL);
+#endif
+
+
+          for (uint32 i=currentHit; i < currentHit + numHits; i++) {
+            bool  kept = ((cutL <= hits[i].scoreCov) &&
+                          ((MC <= hits[i].scoreCov) || (ML <= hits[i]._covered)));
+            bool  good = ((hits[i]._yesno == 1) &&
+                          (hits[i]._identity >= minIdentity) &&
+                          (hits[i]._coverage >= minCoverage));
+
+            if (kept) {
+#ifdef SHOW_ONE
+              fprintf(stdout, "POS: (%f)", hits[i].scoreCov);
+              ahit_printASCII(hits+i, stdout);
+#endif
+              if (good)
+                truepositive++;
+              else
+                falsepositive++;
+            } else {
+#ifdef SHOW_ONE
+              fprintf(stdout, "NEG: (%f)", hits[i].scoreCov);
+              ahit_printASCII(hits+i, stdout);
+#endif
+              if (good)
+                falsenegative++;
+              else
+                truenegative++;
+            }
+          }
+
+#ifdef SHOW_ONE
+          fprintf(stdout, "----\n");
+#endif
+          currentHit += numHits;
+        }
+
+        // Print L, H, V, sensitivity, specificity
+        //
+        fprintf(stdout, "%f %f %f %6.4f %6.4f "uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT"\n",
+                L, H, V,
+                (double)truepositive / (truepositive + falsenegative),
+                (double)truenegative / (truenegative + falsepositive),
+                truepositive, falsepositive,
+                truenegative, falsenegative);
+        fflush(stdout);
+
+#ifdef SHOW_ONE
+        exit(0);
+#endif
+      }
+    }
+  }
+
+  delete [] hits;
+
+  return(0);
+}
+
diff --git a/seagen/hitConverter.C b/seagen/hitConverter.C
new file mode 100644
index 0000000..20ecd7b
--- /dev/null
+++ b/seagen/hitConverter.C
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "aHit.H"
+
+// Convert a stream of binary aHit records on I into text lines on O,
+// reporting progress on stderr every 65536 hits.
+//
+// Conversion stops at end of file or at the first read error.  A record
+// that was being read when either happened is not printed.
+//
+void
+bin2asc(FILE *I, FILE *O) {
+  uint32  i = 0;
+  aHit    a;
+
+  fprintf(stderr, "Converting BINARY to ASCII.\n");
+
+  // ahit_readBinary() is called as a statement here, so the stream is
+  // asked afterwards whether the read worked.  Testing ferror() as well
+  // as feof() keeps a failing stream from looping forever and printing
+  // the previous record over and over.
+  //
+  while (!feof(I) && !ferror(I)) {
+    ahit_readBinary(&a, I);
+
+    if (feof(I) || ferror(I))
+      break;
+
+    ahit_printASCII(&a, O);
+
+    if ((++i & 0xffff) == 0) {
+      fprintf(stderr, uint32FMT" hits.\r", i);
+      fflush(stderr);
+    }
+  }
+
+  fprintf(stderr, uint32FMT" hits.\r", i);
+  fprintf(stderr, "\n");
+}
+
+
+// Convert text hit lines on I into binary aHit records on O, reporting
+// progress on stderr every 65536 hits.
+//
+// Lines are read with a 1024 byte limit; conversion stops at end of file
+// or at the first read error.
+//
+void
+asc2bin(FILE *I, FILE *O) {
+  uint32  i = 0;
+  aHit    a;
+  char    b[1025];
+
+  fprintf(stderr, "Converting ASCII to BINARY.\n");
+
+  // Drive the loop from the result of fgets().  Testing feof() after the
+  // read instead would discard a final line that has no newline (the
+  // read that returns it also reaches end of file), and would never
+  // finish on a read error.
+  //
+  while (fgets(b, 1024, I) != NULL) {
+    ahit_parseString(&a, b);
+    ahit_writeBinary(&a, O);
+
+    if ((++i & 0xffff) == 0) {
+      fprintf(stderr, uint32FMT" hits.\r", i);
+      fflush(stderr);
+    }
+  }
+
+  fprintf(stderr, uint32FMT" hits.\r", i);
+  fprintf(stderr, "\n");
+}
+
+
+// Entry point.  Takes no arguments: reads hits on stdin and writes them
+// on stdout in the other representation.  Exits with status 1 if any
+// argument is given.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc != 1) {
+    fprintf(stderr, "%s: I only read stdin and write stdout.\n", argv[0]);
+    exit(1);
+  }
+
+  // If the first character in the stream is ascii, do ASCII -> BINARY.
+  // Else, do BINARY -> ASCII.
+  //
+  // The peeked value stays an int.  Narrowing it to char first makes a
+  // 0xff byte indistinguishable from EOF where char is signed (so the
+  // byte would not be pushed back), and turns EOF into a real 0xff byte
+  // where char is unsigned (so an empty input would gain a byte).
+  //
+  int x = fgetc(stdin);
+
+  if (x != EOF)
+    ungetc(x, stdin);
+
+  if (x == '-')
+    asc2bin(stdin, stdout);
+  else
+    bin2asc(stdin, stdout);
+
+  return(0);
+}
diff --git a/seagen/hitMatrix-sort.C b/seagen/hitMatrix-sort.C
new file mode 100644
index 0000000..714fffb
--- /dev/null
+++ b/seagen/hitMatrix-sort.C
@@ -0,0 +1,82 @@
+#include "hitMatrix.H"
+
+// Sort by dsPos
+
+// Sift the element at index p down through the heap stored in L[0..n),
+// where a parent must have a _dsPos at least as large as its children.
+//
+// The element is held aside while larger children are moved up, and is
+// written once into the slot where it belongs.  Whole diagonalLine
+// values are copied, which covers every member whether or not
+// _diagonalID is compiled in.
+//
+inline
+void
+adjustHeap_dsPos(diagonalLine *L, uint32 p, uint32 n) {
+  diagonalLine  sifted = L[p];
+  uint32        child  = (p << 1) + 1;  // left child of p
+
+  while (child < n) {
+
+    // Use the right child instead if it exists and is larger.
+    //
+    if ((child + 1 < n) && (L[child]._dsPos < L[child + 1]._dsPos))
+      child++;
+
+    // Done as soon as no child is larger than the sifted element.
+    //
+    if (sifted._dsPos >= L[child]._dsPos)
+      break;
+
+    // Otherwise pull the child up and continue from its old slot.
+    //
+    L[p]  = L[child];
+    p     = child;
+    child = (p << 1) + 1;
+  }
+
+  L[p] = sifted;
+}
+
+// Heap sort _hits[0.._hitsLen) into increasing _dsPos order.
+//
+void
+hitMatrix::sort_dsPos(void) {
+
+  if (_hitsLen <= 1)
+    return;
+
+  // Create the heap of lines, working back from the last parent.
+  //
+  for (uint32 parent = _hitsLen / 2; parent > 0; )
+    adjustHeap_dsPos(_hits, --parent, _hitsLen);
+
+  // Repeatedly exchange the maximum (at the root) with the last element
+  // still in the heap, then restore the heap over what remains.
+  //
+  for (uint32 last = _hitsLen - 1; last > 0; last--) {
+    diagonalLine  top = _hits[0];
+
+    _hits[0]    = _hits[last];
+    _hits[last] = top;
+
+    adjustHeap_dsPos(_hits, 0, last);
+  }
+}
diff --git a/seagen/hitMatrix.C b/seagen/hitMatrix.C
new file mode 100644
index 0000000..7ee727d
--- /dev/null
+++ b/seagen/hitMatrix.C
@@ -0,0 +1,658 @@
+#include "searchGENOME.H"
+#include "aHit.H"
+
+#define TRACE 0
+
+// Set up an empty matrix for one query: no hits stored yet (room for 128)
+// and an empty match list.
+//
+hitMatrix::hitMatrix(uint32 qsLen, uint32 qsMers, uint32 qsIdx) {
+
+  // The query this matrix belongs to.
+  _qsIdx   = qsIdx;
+  _qsLen   = qsLen;
+  _qsMers  = qsMers;
+
+  // Hit storage.
+  _hitsMax = 128;
+  _hitsLen = 0;
+  _hits    = new diagonalLine [_hitsMax];
+
+  // Match list.
+  _matches = 0L;
+}
+
+
+// Release the hit array.  The _matches list is not walked here; filter()
+// empties it as it emits matches.
+//
+hitMatrix::~hitMatrix() {
+ delete [] _hits;
+}
+
+// Widen a match on the database side and insert it into the _matches
+// list, which is kept in decreasing _dsHi order.
+//
+// qsLo, qsHi  - extent of the match in the query
+// dsLo, dsHi  - extent of the match in the database sequence; both are
+//               pushed outward before the match is stored
+// IL          - mer covering for the match; stored in the new trapMatch
+//
+void
+hitMatrix::addMatch(uint32       qsLo,
+                    uint32       qsHi,
+                    uint32       dsLo,
+                    uint32       dsHi,
+                    merCovering *IL) {
+  uint32 reach = 0;
+
+  // Extend the match
+  //
+  // Two methods: the hardcoded one has two plateaus, and is the one to
+  // use for ESTs and mRNA in ESTmapper.  The alternate one is
+  // parameterized, and has a single plateau.
+  //
+  if (config._extendAlternate) {
+
+    // Low side: weight times the unmatched query prefix, but at least
+    // the configured minimum; never move below position zero.
+    //
+    reach = config._extendWeight * qsLo;
+    if (reach < config._extendMinimum)
+      reach = config._extendMinimum;
+
+    dsLo = (dsLo < reach) ? 0 : dsLo - reach;
+
+    // High side: the same, using the unmatched query suffix.
+    //
+    reach = config._extendWeight * (_qsLen - qsHi);
+    if (reach < config._extendMinimum)
+      reach = config._extendMinimum;
+
+    dsHi += reach;
+
+  } else {
+
+    // If the start of the match is near the start of the EST, we do
+    // not need to search very far in the genome.
+    //
+    reach = (qsLo <  50) ? 2000 :
+            (qsLo < 100) ? 5000 : 50 * qsLo;
+
+    dsLo = (dsLo < reach) ? 0 : dsLo - reach;
+
+    // Likewise, if the match is near the end of the EST, extend.  We
+    // don't know the length of the genomic sequence, so we can't check
+    // for "overflow".
+    //
+    reach = _qsLen - qsHi;
+
+    dsHi += (reach <  50) ? 2000 :
+            (reach < 100) ? 5000 : 50 * reach;
+  }
+
+
+  // Create a new match.
+  //
+  trapMatch *fresh = new trapMatch(qsLo, qsHi, dsLo, dsHi, IL);
+
+
+  // And find a home for it in the list.  No merging of matches is done
+  // here.  It's too hard.
+  //
+  // 'link' is the pointer that will be made to point at the new match.
+  // It stays on the list head when the list is empty or the new match
+  // has the largest _dsHi; otherwise it moves along the _next pointers
+  // while the match it points at still has a larger _dsHi.
+  //
+  trapMatch **link = &_matches;
+
+  if ((*link != 0L) && !(fresh->_dsHi > (*link)->_dsHi)) {
+    link = &(*link)->_next;
+
+    while ((*link) && (fresh->_dsHi < (*link)->_dsHi))
+      link = &(*link)->_next;
+  }
+
+  fresh->_next = *link;
+  *link        = fresh;
+}
+
+
+// Utility for sorting the diagonal lines in the hitMatrix
+//
+// The two comparison functions return true if the first line
+// is less than the second line.
+
+#ifdef WITHOUT_DIAGONALID
+
+// True if line A sorts before line B: smaller diagonal first, smaller
+// _qsPos first within a diagonal.  The diagonal is computed on the fly
+// as qsLen - _qsPos - 1 + _dsPos.
+//
+inline
+int
+compareLines(diagonalLine *A, diagonalLine *B, uint32 qsLen) {
+  uint32  diagA = qsLen - A->_qsPos - 1 + A->_dsPos;
+  uint32  diagB = qsLen - B->_qsPos - 1 + B->_dsPos;
+
+  if (diagA != diagB)
+    return(diagA < diagB);
+
+  return(A->_qsPos < B->_qsPos);
+}
+
+// True if a line with diagonal l and query position q sorts before line
+// B: smaller diagonal first, smaller query position first within a
+// diagonal.
+//
+inline
+int
+compareLines(uint32 l, uint32 q, diagonalLine *B, uint32 qsLen) {
+  uint32  diagB = qsLen - B->_qsPos - 1 + B->_dsPos;
+
+  if (l != diagB)
+    return(l < diagB);
+
+  return(q < B->_qsPos);
+}
+
+// Sift the element at index p down through the heap stored in L[0..n),
+// ordered by compareLines() (diagonal, then _qsPos), largest at the
+// root.  The element is held aside while larger children move up and is
+// written once into its final slot.
+//
+inline
+void
+adjustHeap(diagonalLine *L, int32 p, int32 n, uint32 qsLen) {
+  diagonalLine  moving     = L[p];
+  uint32        movingDiag = qsLen - moving._qsPos - 1 + moving._dsPos;
+
+  // 'child' starts as the left child of p.
+  //
+  for (int32 child = (p << 1) + 1; child < n; child = (p << 1) + 1) {
+
+    // Use the right child instead if it exists and is larger.
+    //
+    if ((child + 1 < n) && compareLines(L + child, L + child + 1, qsLen))
+      child++;
+
+    // Done as soon as the held element is not smaller than the child.
+    //
+    if (!compareLines(movingDiag, moving._qsPos, L + child, qsLen))
+      break;
+
+    // Otherwise pull the child up and continue from its old slot.
+    //
+    L[p] = L[child];
+    p    = child;
+  }
+
+  L[p] = moving;
+}
+
+
+#else // WITH_DIAGONALID
+
+
+// True if line A sorts before line B: smaller _diagonalID first, smaller
+// _qsPos first within a diagonal.
+//
+inline
+int
+compareLines(diagonalLine *A, diagonalLine *B) {
+  if (A->_diagonalID != B->_diagonalID)
+    return(A->_diagonalID < B->_diagonalID);
+
+  return(A->_qsPos < B->_qsPos);
+}
+
+// True if a line with diagonal l and query position q sorts before line
+// B: smaller diagonal first, smaller query position first within a
+// diagonal.
+//
+inline
+int
+compareLines(uint32 l, uint32 q, diagonalLine *B) {
+  if (l != B->_diagonalID)
+    return(l < B->_diagonalID);
+
+  return(q < B->_qsPos);
+}
+
+// Sift the element at index p down through the heap stored in L[0..n),
+// ordered by compareLines() (_diagonalID, then _qsPos), largest at the
+// root.  The element is held aside while larger children move up and is
+// written once into its final slot.
+//
+inline
+void
+adjustHeap(diagonalLine *L, int32 p, int32 n) {
+  diagonalLine  moving = L[p];
+
+  // 'child' starts as the left child of p.
+  //
+  for (int32 child = (p << 1) + 1; child < n; child = (p << 1) + 1) {
+
+    // Use the right child instead if it exists and is larger.
+    //
+    if ((child + 1 < n) && compareLines(L + child, L + child + 1))
+      child++;
+
+    // Done as soon as the held element is not smaller than the child.
+    //
+    if (!compareLines(moving._diagonalID, moving._qsPos, L + child))
+      break;
+
+    // Otherwise pull the child up and continue from its old slot.
+    //
+    L[p] = L[child];
+    p    = child;
+  }
+
+  L[p] = moving;
+}
+
+
+#endif
+
+// Turn the stored mer hits into matches and hand them to the query as
+// output.
+//
+// Steps, as the code below performs them:
+//  1. Return at once if no hits are stored.
+//  2. Derive the two minimum covered lengths from the config coverage
+//     fractions and _qsLen, raised to the fixed config minimums.
+//  3. Sort all hits by _dsPos, then walk them one database sequence at a
+//     time (sequence boundaries come from config._dbSTREAM->startOf()).
+//  4. Within a sequence: make _dsPos relative to the sequence start,
+//     heap sort the hits by diagonal, and grow clusters hit by hit.  A
+//     cluster that is long enough is stored with addMatch().
+//  5. Merge stored matches that lie close together and emit each merged
+//     match through query->addOutput(), as a binary aHit or as a text
+//     line depending on config._binaryOutput.
+//
+// query     - receives the output records
+// isReverse - selects "-r" / _forward = false in the output
+//
+// Note that step 4 rewrites _hits[]._dsPos in place.
+//
+void
+hitMatrix::filter(encodedQuery *query, bool isReverse) {
+
+ if (_hitsLen == 0)
+ return;
+
+ // Decide on the minimum quality values; we pick the larger of
+ // the fixed lengths, and the sequence length * coverage.
+ //
+ uint32 minLengthSingle = (uint32)(config._minCoverageSingle * _qsLen);
+ uint32 minLengthMultiple = (uint32)(config._minCoverageMultiple * _qsLen);
+
+ if (minLengthSingle < config._minLengthSingle)
+ minLengthSingle = config._minLengthSingle;
+
+ if (minLengthMultiple < config._minLengthMultiple)
+ minLengthMultiple = config._minLengthMultiple;
+
+
+
+ // First, sort by the dsPos. This is done so that we can find all the hits for
+ // a specific scaffold.
+ //
+ sort_dsPos();
+
+
+ // Now, while there are hits left....
+ //
+ uint32 firstHit = 0;
+ uint32 lastHit = 0;
+ uint32 currentSeq = 0;
+
+ while (firstHit < _hitsLen) {
+
+ // Move the currentSeq until the firstHit is below it.
+ //
+ while ((currentSeq < config._dbSTREAM->numberOfSequences()) &&
+ (config._dbSTREAM->startOf(currentSeq) <= _hits[firstHit]._dsPos))
+ currentSeq++;
+
+ //
+ // currentSeq is now the sequence AFTER the one that we want hits in.
+ //
+
+ // Find the first hit that is in currentSeq. If this is the last sequence,
+ // then, of course, all remaining hits are in it.
+ //
+ if (currentSeq < config._dbSTREAM->numberOfSequences()) {
+ lastHit = firstHit + 1;
+ while ((lastHit < _hitsLen) &&
+ (_hits[lastHit]._dsPos < config._dbSTREAM->startOf(currentSeq)))
+ lastHit++;
+ } else {
+ lastHit = _hitsLen;
+ }
+
+ // Drop back one sequence; this is the sequence the hits are in.
+ //
+ currentSeq--;
+
+#if TRACE
+ fprintf(stdout, "Hits are in sequence %d\n", config._dbSTREAM->IIDOf(currentSeq));
+ fprintf(stdout, "filtering %u hits -- first = %u last = %u.\n", _hitsLen, firstHit, lastHit);
+
+#if 0
+ fprintf(stdout, "UNSORTED\n");
+ for (uint32 i=firstHit; i<lastHit; i++)
+ fprintf(stdout, "hit at qs=%4u ds=%6u diag=%6u\n",
+ _hits[i]._qsPos,
+ _hits[i]._dsPos,
+ _hits[i]._diagonalID);
+#endif
+#endif
+
+ // Adjust the hits to be relative to the start of this sequence
+ //
+ for (uint32 i=firstHit; i<lastHit; i++)
+ _hits[i]._dsPos -= config._dbSTREAM->startOf(currentSeq);
+
+ // Sort them, if needed.
+ //
+ if (lastHit - firstHit > 1) {
+
+ // We cheat; heapsort isn't too friendly to sorting the middle of
+ // an array, so we make a new array in the middle!
+ //
+ diagonalLine *hitsToSort = _hits + firstHit;
+
+ // Build the heap. I initially thought this could be done at the
+ // same time as the scan for the last hit, but it can't (easily)
+ //
+ for (int32 i=(lastHit - firstHit)/2 - 1; i>=0; i--)
+#ifdef WITHOUT_DIAGONALID
+ adjustHeap(hitsToSort, i, lastHit - firstHit, _qsLen);
+#else
+ adjustHeap(hitsToSort, i, lastHit - firstHit);
+#endif
+
+ // Sort the hits be diagonal. This is the second part of
+ // heap sort -- Interchange the new maximum with the element
+ // at the end of the tree
+ //
+ for (uint32 i=lastHit - firstHit - 1; i>0; i--) {
+ uint32 q = hitsToSort[i]._qsPos;
+ uint32 d = hitsToSort[i]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+ uint32 l = hitsToSort[i]._diagonalID;
+#endif
+
+ hitsToSort[i]._qsPos = hitsToSort[0]._qsPos;
+ hitsToSort[i]._dsPos = hitsToSort[0]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+ hitsToSort[i]._diagonalID = hitsToSort[0]._diagonalID;
+#endif
+
+ hitsToSort[0]._qsPos = q;
+ hitsToSort[0]._dsPos = d;
+#ifndef WITHOUT_DIAGONALID
+ hitsToSort[0]._diagonalID = l;
+#endif
+
+#ifdef WITHOUT_DIAGONALID
+ adjustHeap(hitsToSort, 0, i, _qsLen);
+#else
+ adjustHeap(hitsToSort, 0, i);
+#endif
+ }
+ }
+
+
+ // Check the sorting
+ //
+#if 0
+#if 0
+ fprintf(stderr, "sort by diagonal:\n");
+ for (uint32 i=firstHit; i<lastHit; i++)
+ fprintf(stderr, "%8u %8u %8u\n", _hits[i]._diagonalID, _hits[i]._qsPos, _hits[i]._dsPos);
+#endif
+ for (uint32 i=firstHit; i<lastHit-1; i++) {
+ if (_hits[i]._diagonalID > _hits[i+1]._diagonalID) {
+ fprintf(stderr, "sort by diagonal failed.\n");
+ exit(1);
+ }
+ }
+#endif
+
+
+
+#if TRACE
+#if 0
+ fprintf(stdout, "SORTED\n");
+ for (uint32 i=firstHit; i<lastHit; i++)
+ fprintf(stdout, "hit at qs=%4u ds=%6u diag=%6u\n",
+ _hits[i]._qsPos,
+ _hits[i]._dsPos,
+ _hits[i]._diagonalID);
+#endif
+
+ fprintf(stdout, "FILTERED\n");
+#endif
+
+ // Filter them
+ //
+ // The current cluster is described by the range of diagonals it has
+ // seen (frstDiagonal..lastDiagonal) and a bounding box in query and
+ // database positions.  It starts as just the first hit.
+ //
+#ifdef WITHOUT_DIAGONALID
+ uint32 frstDiagonal = _qsLen - _hits[firstHit]._qsPos - 1 + _hits[firstHit]._dsPos;
+ uint32 lastDiagonal = frstDiagonal;
+#else
+ uint32 frstDiagonal = _hits[firstHit]._diagonalID;
+ uint32 lastDiagonal = _hits[firstHit]._diagonalID;
+#endif
+ uint32 qsLow = _hits[firstHit]._qsPos;
+ uint32 qsHigh = _hits[firstHit]._qsPos;
+ uint32 dsLow = _hits[firstHit]._dsPos;
+ uint32 dsHigh = _hits[firstHit]._dsPos;
+
+ // Create a new merCovering, and space to count the number of mers in a match
+ //
+ merCovering *IL = new merCovering(config._merSize);
+
+ for (uint32 i=firstHit; i<lastHit; i++) {
+#ifdef WITHOUT_DIAGONALID
+ uint32 thisDiagonalID = _qsLen - _hits[i]._qsPos - 1 + _hits[i]._dsPos;
+#else
+ uint32 thisDiagonalID = _hits[i]._diagonalID;
+#endif
+
+
+
+#if TRACE
+ fprintf(stdout, "hit[qs=%6u ds=%7u d=%7u] box[qs=%6u-%6u ds=%7u-%7u d=%7u-%7u] ",
+ _hits[i]._qsPos,
+ _hits[i]._dsPos,
+ thisDiagonalID,
+ qsLow, qsHigh, dsLow, dsHigh, frstDiagonal, lastDiagonal);
+#endif
+
+ // Unconditionally extend if the diagonal difference is small.
+ //
+ if (lastDiagonal + config._maxDiagonal >= thisDiagonalID) {
+ lastDiagonal = thisDiagonalID;
+ if (qsLow > _hits[i]._qsPos) qsLow = _hits[i]._qsPos;
+ if (qsHigh < _hits[i]._qsPos) qsHigh = _hits[i]._qsPos;
+ if (dsLow > _hits[i]._dsPos) dsLow = _hits[i]._dsPos;
+ if (dsHigh < _hits[i]._dsPos) dsHigh = _hits[i]._dsPos;
+ IL->addMer(_hits[i]._qsPos);
+#if TRACE
+ fprintf(stdout, "extend qs=%9u-%9u ds=%9u-%9u diag=%9u-%9u (diagonal)\n",
+ qsLow, qsHigh, dsLow, dsHigh, frstDiagonal, lastDiagonal);
+#endif
+ continue;
+ }
+
+
+ // XXX: Prototype for extending only if the next hit is near
+ // the last hit.
+ //
+ if (((dsHigh <= _hits[i]._dsPos) && (_hits[i]._dsPos - dsHigh <= config._maxIntronLength)) ||
+ ((dsHigh >= _hits[i]._dsPos) && (dsHigh - _hits[i]._dsPos <= config._maxIntronLength))) {
+
+ // Extend into multiple-exon like things only if the input
+ // sequence is long.
+ //
+ if (_qsLen > config._smallSequenceCutoff) {
+
+ // Extend if the qsOverlap is small (or nonexistant)
+ //
+ if ((qsHigh + config._merSize) < (_hits[i]._qsPos + config._qsOverlap)) {
+ lastDiagonal = thisDiagonalID;
+ if (qsLow > _hits[i]._qsPos) qsLow = _hits[i]._qsPos;
+ if (qsHigh < _hits[i]._qsPos) qsHigh = _hits[i]._qsPos;
+ if (dsLow > _hits[i]._dsPos) dsLow = _hits[i]._dsPos;
+ if (dsHigh < _hits[i]._dsPos) dsHigh = _hits[i]._dsPos;
+ IL->addMer(_hits[i]._qsPos);
+#if TRACE
+ fprintf(stdout, "extend qs=%9u-%9u ds=%9u-%9u diag=%9u-%9u (qsOverlap)\n",
+ qsLow, qsHigh, dsLow, dsHigh, frstDiagonal, lastDiagonal);
+#endif
+ continue;
+ }
+
+ // Extend if the dsOverlap is small (or nonexistant)
+ //
+ if (_hits[i]._dsPos < (dsLow + config._dsOverlap)) {
+ lastDiagonal = thisDiagonalID;
+ if (qsLow > _hits[i]._qsPos) qsLow = _hits[i]._qsPos;
+ if (qsHigh < _hits[i]._qsPos) qsHigh = _hits[i]._qsPos;
+ if (dsLow > _hits[i]._dsPos) dsLow = _hits[i]._dsPos;
+ if (dsHigh < _hits[i]._dsPos) dsHigh = _hits[i]._dsPos;
+ IL->addMer(_hits[i]._qsPos);
+#if TRACE
+ fprintf(stdout, "extend qs=%9u-%9u ds=%9u-%9u diag=%9u-%9u (dsOverlap)\n",
+ qsLow, qsHigh, dsLow, dsHigh, frstDiagonal, lastDiagonal);
+#endif
+ continue;
+ }
+ }
+ } // XXX: End prototype
+
+#if TRACE
+ fprintf(stdout, "close current cluster.\nGOOD? qsCov=%u; >= %u or %u? diag: %u < 25?\n",
+ qsHigh - qsLow,
+ minLengthSingle,
+ minLengthMultiple,
+ lastDiagonal - frstDiagonal);
+#endif
+
+
+ // Save the current cluster and start a new one?
+ //
+ // The cluster is kept if it covers at least minLengthMultiple, or if
+ // its diagonal spread is under 25 and it covers at least
+ // minLengthSingle.  When kept, IL is handed to addMatch() (the
+ // trapMatch stores the pointer) and a fresh covering is allocated;
+ // otherwise the same covering is cleared and reused.
+ //
+ uint32 qCov = IL->sumOfLengths();
+ if ((qCov >= minLengthMultiple) ||
+ ((lastDiagonal - frstDiagonal < 25) && (qCov >= minLengthSingle))) {
+#if TRACE
+ fprintf(stdout, "add match!\n");
+#endif
+ addMatch(qsLow,
+ qsHigh + config._merSize,
+ dsLow,
+ dsHigh + config._merSize,
+ IL);
+ IL = new merCovering(config._merSize);
+ }
+
+ if (IL)
+ IL->clear();
+
+#if TRACE
+ fprintf(stdout, "reset!\n");
+#endif
+
+ frstDiagonal = thisDiagonalID;
+ lastDiagonal = thisDiagonalID;
+ qsLow = _hits[i]._qsPos;
+ qsHigh = _hits[i]._qsPos;
+ dsLow = _hits[i]._dsPos;
+ dsHigh = _hits[i]._dsPos;
+
+#if TRACE
+ fprintf(stdout, "hit[qs=%6u ds=%7u d=%7u] box[qs=%6u-%6u ds=%7u-%7u d=%7u-%7u] (initial hit)\n",
+ _hits[i]._qsPos,
+ _hits[i]._dsPos,
+ _qsLen - _hits[i]._qsPos - 1 + _hits[i]._dsPos,
+ qsLow, qsHigh, dsLow, dsHigh, frstDiagonal, lastDiagonal);
+#endif
+
+ IL->addMer(_hits[i]._qsPos);
+ }
+
+ // Save the final cluster?
+ //
+ // NOTE(review): the diagonal spread limit here is 21, while the same
+ // test inside the loop above uses 25 -- confirm which is intended.
+ //
+ uint32 qCov = IL->sumOfLengths();
+ if ((qCov >= minLengthMultiple) ||
+ ((lastDiagonal - frstDiagonal < 21) && (qCov >= minLengthSingle))) {
+ addMatch(qsLow,
+ qsHigh + config._merSize,
+ dsLow,
+ dsHigh + config._merSize,
+ IL);
+ IL = 0;
+ }
+
+ // Delete any remaining IL
+ //
+ delete IL;
+
+
+
+ // Merge and print the matches
+ //
+ trapMatch *n = 0L;
+ uint32 ML = 0;
+
+ while (_matches) {
+
+ // Save the current match, then delete it.
+ //
+ // Only the list node is deleted; its merCovering lives on in IL and
+ // is deleted after the merged match has been emitted.
+ //
+ dsLow = _matches->_dsLo;
+ dsHigh = _matches->_dsHi;
+ IL = _matches->_IL;
+ ML = IL->sumOfLengths();
+
+ n = _matches;
+ _matches = _matches->_next;
+ delete n;
+
+#if TRACE
+ fprintf(stdout, "Merge: %8u %8u\n", dsLow, dsHigh);
+#endif
+
+ // Assimilate as many of the remaining matches as possible.
+ //
+ // Think of this as first reversing the list, then merging as
+ // long as (dsHigh + 1000 > _matches->_dsLo). But since we
+ // don't reverse the list, we can map:
+ // dsHigh --> _matches->dsHi
+ // _matches->_dsLo --> dsLow
+ // where dsHigh and dsLow are the values for the extended match.
+ //
+ // NOTE(review): the comment above says 1000 but the test below adds
+ // 5000.
+ //
+ while (_matches && (dsLow < _matches->_dsHi + 5000)) {
+
+ // Combine the two merCoverings
+ //
+ IL->merge(_matches->_IL);
+ ML += _matches->_IL->sumOfLengths();
+
+ // The start of the new match might be after the start of the
+ // merged region. (Only rarely is it before)
+ //
+ if (dsLow > _matches->_dsLo)
+ dsLow = _matches->_dsLo;
+
+ // The end of current match is always greater than the end of the
+ // new match!
+ //
+ //dsHigh = _matches->_dsHi;
+
+#if TRACE
+ fprintf(stdout, "Merge: %8u %8u -> %8u %8u\n", _matches->_dsLo, _matches->_dsHi, dsLow, dsHigh);
+#endif
+
+ n = _matches;
+ _matches = _matches->_next;
+ delete n->_IL;
+ delete n;
+ }
+
+
+ // Emit the merged match.  _covered comes from the merged covering;
+ // _matched (ML) is the sum of the coverings before merging.
+ //
+ if (config._binaryOutput) {
+ aHit a;
+
+ a._forward = !isReverse;
+ a._merged = false;
+ a._qsIdx = _qsIdx;
+ a._dsIdx = config._dbSTREAM->IIDOf(currentSeq);
+ a._dsLo = dsLow;
+ a._dsHi = dsHigh;
+ a._covered = IL->sumOfLengths();
+ a._matched = ML;
+ a._numMers = _qsMers;
+
+ query->addOutput(&a, sizeof(aHit));
+ } else {
+ char line[128];
+
+ sprintf(line, "-%c -e "uint32FMT" -D "uint32FMT" "uint32FMT" "uint32FMT" -M "uint32FMT" "uint32FMT" "uint32FMT"\n",
+ isReverse ? 'r' : 'f', _qsIdx,
+ config._dbSTREAM->IIDOf(currentSeq),
+ dsLow, dsHigh, IL->sumOfLengths(), ML, _qsMers);
+
+ query->addOutput(line, 0);
+ }
+
+ delete IL;
+ }
+
+ // All done with these hits. Move to the next set.
+ //
+ firstHit = lastHit;
+ }
+}
+
diff --git a/seagen/hitMatrix.H b/seagen/hitMatrix.H
new file mode 100644
index 0000000..996b721
--- /dev/null
+++ b/seagen/hitMatrix.H
@@ -0,0 +1,156 @@
+#ifndef HITMATRIX_H
+#define HITMATRIX_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+
+#include "bio++.H"
+#include "positionDB.H"
+#include "encodedQuery.H"
+
+// Define this to cut the space required for storing hits by 1/3 --
+// from 12 bytes to 8 bytes -- at a slight computational expense --
+// negligible on real hardware, I hope.
+//
+// The original definition of diagonalID was
+// qsLen - qsPos - 1 + dsPos
+// but qsLen is fixed for everyone, so we could reduce it to
+// dsPos - qsPos
+// but that's not unsigned.
+//
+// Results: on a human mapping, using chromosomes as the stream and
+// the whole human as the table (so we need to actually store a large
+// number of hits), we see a savings of 2GB and a small drop in
+// runtime. Process size went from 20.7GB to 18.7GB, CPU time from
+// 20578 to 20193 seconds (833MHz EV6.8AL (21264B)).
+//
+#define WITHOUT_DIAGONALID
+
+
+// One raw hit as recorded by hitMatrix::addHits(): _qsPos receives
+// the qi argument and _dsPos one entry of the ps[] array.
+// _diagonalID (qsLen - qsPos - 1 + dsPos) is stored only when
+// WITHOUT_DIAGONALID is not defined.
+//
+struct diagonalLine {
+ uint32 _qsPos;
+ uint32 _dsPos;
+#ifndef WITHOUT_DIAGONALID
+ uint32 _diagonalID;
+#endif
+};
+
+// A node in a singly linked list of match regions.  It stores the
+// query (_qs*) and database (_ds*) extents given to the constructor
+// together with the merCovering pointer supplied by the caller.  The
+// pointer is stored as-is; this class has no destructor and never
+// frees it.
+//
+class trapMatch {
+public:
+  uint32        _qsLo, _qsHi;
+  uint32        _dsLo, _dsHi;
+  merCovering  *_IL;
+  trapMatch    *_next;
+
+  // Record the four coordinates and the covering.  A new node is
+  // always unlinked (_next is null).
+  //
+  trapMatch(uint32 qsLo, uint32 qsHi,
+            uint32 dsLo, uint32 dsHi,
+            merCovering *IL) :
+    _qsLo(qsLo), _qsHi(qsHi),
+    _dsLo(dsLo), _dsHi(dsHi),
+    _IL(IL),
+    _next(0L) {
+  }
+};
+
+// Collects the raw hits of one query sequence against the database
+// (addHits) and later turns them into reported match regions
+// (filter).
+//
+class hitMatrix {
+public:
+ hitMatrix(uint32 qsLen,
+ uint32 qsMers,
+ uint32 qsIdx);
+ ~hitMatrix();
+
+ // Append cn hits: query position qi paired with each of the
+ // database positions ps[0..cn-1]. Defined inline below.
+ //
+ void addHits(uint32 qi,
+ uint64 *ps,
+ uint64 cn);
+
+ void sort_diagonal(void);
+ void sort_dsPos(void);
+
+ // Merge the matches and report them through query->addOutput(),
+ // either as binary aHit records or as text lines, depending on
+ // config._binaryOutput.
+ //
+ void filter(encodedQuery *query, bool isReverse);
+private:
+ uint32 _qsLen; // Seq Len of Q
+ uint32 _qsMers; // Valid mers in Q
+ uint32 _qsIdx; // Index of Q in the FastA
+
+ // Instead of building the lines during add(), we store
+ // the information used to build lines, and then build them
+ // in chain(). This was done to reduce simultaneous memory
+ // usage, as the lineArrayMap and etc take up considerable space.
+ //
+ // _hitsLen is the number of entries in use, _hitsMax the allocated
+ // capacity of _hits; addHits() regrows the array when needed.
+ //
+ uint32 _hitsLen;
+ uint32 _hitsMax;
+ diagonalLine *_hits;
+
+
+ // Making sense of the raw output from the search is not a trivial
+ // task for perl. SMALL searches (dbEST vs 0.5MB sequence) used more
+ // than 4GB of memory in perl.
+ //
+ // So, we bite the bullet and do it here.
+ //
+ // _matches is a sorted linked list of the regions we have found.
+ // The list is kept in REVERSE order, as we usually add regions
+ // in the correct order (correct reverse order), occasionally
+ // we need to swap the last two.
+ //
+ // The list is deleted in filter()
+ //
+ trapMatch *_matches;
+
+ void addMatch(uint32 qsLo,
+ uint32 qsHi,
+ uint32 dsLo,
+ uint32 dsHi,
+ merCovering *IL);
+
+};
+
+
+// Append cn hits for query position qi.  Each ps[i] is a database
+// position and becomes one diagonalLine in _hits.  When the new hits
+// would not fit, the array is regrown to twice its capacity plus cn.
+//
+// Terminates the process (exit(1)) if the new capacity does not fit
+// in 32 bits or if the allocation fails.
+//
+inline
+void
+hitMatrix::addHits(uint32  qi,
+                   uint64 *ps,
+                   uint64  cn) {
+
+  if ((_hitsLen + cn) >= _hitsMax) {
+
+    // Compute the new capacity in 64 bits, so that a value too large
+    // for the 32-bit _hitsMax is reported instead of silently
+    // wrapping to a too-small array.
+    //
+    uint64 newMax = (uint64)_hitsMax + (uint64)_hitsMax + cn;
+
+    if (newMax > (uint64)0xffffffff) {
+      fprintf(stderr, "hitMatrix::addHits()-- too many hits: have " uint32FMT ", tried to add " uint64FMT " more\n", _hitsLen, cn);
+      exit(1);
+    }
+
+    _hitsMax = (uint32)newMax;
+
+    diagonalLine *h = 0L;
+    try {
+      h = new diagonalLine [_hitsMax];
+    } catch (const std::bad_alloc &) {
+      fprintf(stderr, "hitMatrix::addHits()-- caught std::bad_alloc in %s at line %d.\n", __FILE__, __LINE__);
+      fprintf(stderr, "hitMatrix::addHits()-- have " uint32FMT " hits, tried to add " uint64FMT " more\n", _hitsLen, cn);
+      exit(1);
+    }
+
+    // Struct assignment copies every field, including _diagonalID
+    // when it is compiled in.
+    //
+    for (uint32 z=0; z<_hitsLen; z++)
+      h[z] = _hits[z];
+
+    delete [] _hits;
+
+    _hits = h;
+  }
+
+  for (uint64 i=0; i<cn; i++) {
+    _hits[_hitsLen]._qsPos      = (uint32)(qi);
+    _hits[_hitsLen]._dsPos      = (uint32)(ps[i]);
+#ifndef WITHOUT_DIAGONALID
+    _hits[_hitsLen]._diagonalID = (uint32)(_qsLen - qi - 1 + ps[i]);
+#endif
+    _hitsLen++;
+  }
+}
+
+
+#endif // HITMATRIX_H
diff --git a/seagen/hitReader.C b/seagen/hitReader.C
new file mode 100644
index 0000000..c35d8c9
--- /dev/null
+++ b/seagen/hitReader.C
@@ -0,0 +1,273 @@
+#include "hitReader.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+
+// qsort() comparator for hit_s: a hit with larger coverage sorts
+// first.  Returns -1, 1, or 0 when neither is larger.
+//
+static
+int
+hitCompareCoverage(const void *a, const void *b) {
+  const double  covA = ((const hit_s *)a)->coverage;
+  const double  covB = ((const hit_s *)b)->coverage;
+
+  if (covA > covB)  return(-1);
+  if (covA < covB)  return(1);
+
+  return(0);
+}
+
+// qsort() comparator for hit_s: orders by _dsIdx, then _forward,
+// then _dsLo, then _dsHi, each ascending.
+//
+static
+int
+hitCompareGenPos(const void *a, const void *b) {
+  const aHit &x = ((const hit_s *)a)->a;
+  const aHit &y = ((const hit_s *)b)->a;
+
+  if (x._dsIdx != y._dsIdx)
+    return((x._dsIdx < y._dsIdx) ? -1 : 1);
+
+  if (x._forward != y._forward)
+    return((x._forward < y._forward) ? -1 : 1);
+
+  if (x._dsLo != y._dsLo)
+    return((x._dsLo < y._dsLo) ? -1 : 1);
+
+  if (x._dsHi != y._dsHi)
+    return((x._dsHi < y._dsHi) ? -1 : 1);
+
+  return(0);
+}
+
+
+
+
+
+
+
+// Create a reader with room for m input files and an initial hit
+// list of 1024 * 1024 entries (loadHits() grows the list on demand).
+//
+// The initializers are written in member declaration order; _files
+// and _list are sized from _filesMax and _listMax, which must
+// therefore be initialized first.
+//
+hitReader::hitReader(int m) :
+  _filesMax(m),
+  _filesLen(0),
+  _files(new hitFile_s [_filesMax]),
+  _listLen(0),
+  _listMax(1024 * 1024),
+  _list(new hit_s [_listMax]),
+  _bestScore(0.0),
+  _worstScore(1.0),
+  _iid(uint32ZERO) {
+}
+
+
+// Release the readBuffer of every input file that was added, then
+// the file table and the hit list themselves.
+//
+hitReader::~hitReader() {
+  uint32 i = 0;
+
+  while (i < _filesLen) {
+    delete _files[i].buff;
+    i++;
+  }
+
+  delete [] _files;
+  delete [] _list;
+}
+
+// Register another hit file and preload its first hit.  The format
+// is detected from the first byte: a '-' means ASCII, anything else
+// binary.
+//
+// Terminates the process if more files are added than the reader
+// was constructed for; previously this wrote past the end of _files.
+//
+void
+hitReader::addInputFile(char *filename) {
+
+  if (_filesLen >= _filesMax) {
+    fprintf(stderr, "hitReader::addInputFile()-- ERROR: cannot add '%s', space for only " uint32FMT " files was allocated.\n", filename, _filesMax);
+    exit(1);
+  }
+
+  errno = 0;
+
+  hitFile_s *HF = _files + _filesLen;
+
+  HF->stillMore = true;
+  HF->buff      = new readBuffer(filename);
+
+  // Binary or ASCII input?
+  //
+  HF->isBINARY = (HF->buff->peek() != '-');
+
+  // Load the first hit
+  loadHit(HF);
+
+  _filesLen++;
+}
+
+
+// Read the next hit of one input file into HF->a, and clear
+// HF->stillMore once the file's buffer reports end of file.  Only
+// binary input is supported; ASCII input terminates the process.
+//
+void
+hitReader::loadHit(hitFile_s *HF) {
+
+  if (HF->isBINARY == false) {
+    fprintf(stderr, "ERROR: hitReader::loadHit() ascii not supported right now.\n");
+    exit(1);
+  }
+
+  ahit_readBinary(&HF->a, HF->buff);
+
+  if (HF->buff->eof())
+    HF->stillMore = false;
+}
+
+
+// Collect into _list every pending hit that belongs to the lowest
+// query index (_qsIdx) still waiting in any input file, compute the
+// per-hit coverage and multiplicity, track the best and worst
+// coverage, and finally merge overlapping hits.
+//
+// Returns false when every input file is exhausted, true otherwise.
+//
+bool
+hitReader::loadHits(void) {
+
+  _listLen    = 0;
+  _iid        = uint32ZERO;
+  _bestScore  = 0.0;
+  _worstScore = 1.0;
+
+  // See if there are more hits to process.
+  //
+  bool keepGoing = false;
+  for (uint32 i=0; i<_filesLen; i++)
+    keepGoing |= _files[i].stillMore;
+
+  if (keepGoing == false)
+    return(false);
+
+  // Find the lowest ESTid.  Start from the largest 32-bit value; the
+  // former starting point of 1 << 30 meant that hits with a larger
+  // index were never selected, so no progress was ever made on them.
+  //
+  _iid = ~uint32ZERO;
+  for (uint32 i=0; i<_filesLen; i++)
+    if ((_files[i].stillMore) && (_iid > _files[i].a._qsIdx))
+      _iid = _files[i].a._qsIdx;
+
+
+  // For each file, load the next hit if it's the est
+  // we're looking at
+  //
+  for (uint32 i=0; i<_filesLen; i++) {
+    while ((_files[i].stillMore) && (_files[i].a._qsIdx == _iid)) {
+
+      // Double the list when it is full.
+      //
+      if (_listLen >= _listMax) {
+        _listMax *= 2;
+        hit_s *new_list = new hit_s [_listMax];
+        memcpy(new_list, _list, _listLen * sizeof(hit_s));
+        delete [] _list;
+        _list = new_list;
+      }
+
+      hit_s *H = _list + _listLen;
+
+      memcpy(&H->a, &_files[i].a, sizeof(aHit));
+
+      H->coverage     = (double)_files[i].a._covered / (double)_files[i].a._numMers;
+      H->multiplicity = (double)_files[i].a._matched / (double)_files[i].a._covered;
+
+      // aHit->_covered is in bases, but aHit->_numMers is the
+      // number of mers.  Possible for coverage to be > 1.0.
+      //
+      if (H->coverage > 1.0)
+        H->coverage = 1.0;
+
+      if (H->coverage > _bestScore)
+        _bestScore = H->coverage;
+
+      if (H->coverage < _worstScore)
+        _worstScore = H->coverage;
+
+#ifdef WITH_ANSWERS
+      // Look for the answer string.  If not found, set to zero.
+      //
+      H->mappedIdentity = 0;
+      H->mappedCoverage = 0;
+
+      for (int p=0; _files[i].b[p]; p++) {
+        if ((_files[i].b[p] == 'Y') || (_files[i].b[p] == 'N')) {
+          char *c = _files[i].b+p+1;
+          H->mappedIdentity = (uint32)strtoul(c, &c, 10);
+          H->mappedCoverage = (uint32)strtoul(c, &c, 10);
+        }
+      }
+#endif
+
+      _listLen++;
+
+      loadHit(_files+i);
+    }
+  }
+
+  mergeOverlappingHits();
+
+  return(true);
+}
+
+
+
+// Sort the current hit list so that the hit with the largest
+// coverage comes first (see hitCompareCoverage).
+//
+void
+hitReader::sortByCoverage(void) {
+  qsort(_list, _listLen, sizeof(hit_s), hitCompareCoverage);
+}
+
+
+
+
+
+// Scan the list of hits (for a single EST, remember) and merge
+// any that are overlapping.
+//
+// The list is sorted by genomic position and compacted in place;
+// _listLen is reduced to the number of hits that remain.  Two hits
+// are merged when they are on the same database sequence, have the
+// same orientation, and the later one starts at or before the end
+// of the earlier one.
+//
+void
+hitReader::mergeOverlappingHits(void) {
+
+  // Nothing to merge.  Without this guard an empty list would come
+  // out with _listLen == 1, exposing an uninitialized hit.
+  //
+  if (_listLen == 0)
+    return;
+
+  // Sort by the genomic position
+  //
+  qsort(_list, _listLen, sizeof(hit_s), hitCompareGenPos);
+
+  // Scan through the list, merging.
+  //
+  uint32  cur = 0;  // Currently active entry
+  uint32  exa = 1;  // Entry we examine for merging
+
+  for (; exa < _listLen; exa++) {
+
+    bool overlap = ((_list[cur].a._dsIdx   == _list[exa].a._dsIdx) &&
+                    (_list[cur].a._forward == _list[exa].a._forward) &&
+                    (_list[cur].a._dsHi    >= _list[exa].a._dsLo));
+
+    if (overlap == false) {
+      // Nope, copy exa to the next spot (unless they're the same)
+      // and move there.
+      //
+      cur++;
+      if (cur != exa)
+        memcpy(_list+cur, _list+exa, sizeof(hit_s));
+      continue;
+    }
+
+    // They overlap.  If exa ends at or before the end of cur (this
+    // includes identical hits) it is contained in cur, and there is
+    // nothing to do.
+    //
+    if (_list[cur].a._dsHi >= _list[exa].a._dsHi)
+      continue;
+
+    // exa extends cur!
+    //
+    // If cur is contained in exa, just get rid of cur.
+    // Otherwise, we need to fudge up new scores -- but we
+    // instead just mark them as merged, and don't filter them.
+    //
+    if (_list[cur].a._dsLo == _list[exa].a._dsLo) {
+      memcpy(_list+cur, _list+exa, sizeof(hit_s));
+    } else {
+#ifdef DEBUG_HITREADER
+ fprintf(stderr,
+ "MERGE: ("uint32FMT","uint32FMT") -e "uint32FMT" "
+ uint32FMT":"uint32FMT"-"uint32FMT"%c("uint32FMT"-"uint32FMT"-"uint32FMT") "
+ uint32FMT":"uint32FMT"-"uint32FMT"%c("uint32FMT"-"uint32FMT"-"uint32FMT")\n",
+ cur, exa,
+ _list[cur].a._qsIdx,
+ _list[cur].a._dsIdx,
+ _list[cur].a._dsLo, _list[cur].a._dsHi, _list[cur].a._forward ? 'f' : 'r',
+ _list[cur].a._covered, _list[cur].a._matched, _list[cur].a._numMers,
+ _list[exa].a._dsIdx,
+ _list[exa].a._dsLo, _list[exa].a._dsHi, _list[exa].a._forward ? 'f' : 'r',
+ _list[exa].a._covered, _list[exa].a._matched, _list[exa].a._numMers);
+#endif
+      _list[cur].a._merged  = true;
+      _list[cur].a._covered = 0;
+      _list[cur].a._matched = 0;
+      _list[cur].a._dsHi    = _list[exa].a._dsHi;
+    }
+
+    // By now, we've updated cur to include all that exa did.  exa is junk.
+  }
+
+  _listLen = cur + 1;
+}
diff --git a/seagen/hitReader.H b/seagen/hitReader.H
new file mode 100644
index 0000000..0219700
--- /dev/null
+++ b/seagen/hitReader.H
@@ -0,0 +1,82 @@
+#ifndef HITREADER_H
+#define HITREADER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bio.h"
+#include "aHit.H"
+
+// Define this if your hits have answers and you're curious about how
+// well the filter is performing.
+//
+// This is used in filterEST.C also.
+//
+//#define WITH_ANSWERS
+
+
+// XXX: Lots of this stuff can be moved into hitReader as temporary variables
+//
+// Per-input-file state kept by hitReader.
+//
+// stillMore -- cleared by loadHit() once buff reports end of file
+// buff -- created in addInputFile() from the file name
+// b -- text buffer; only read under WITH_ANSWERS in loadHits()
+// a -- the hit most recently loaded from this file
+// isBINARY -- true unless the first byte of the file is '-'
+//
+typedef struct {
+ bool stillMore;
+ readBuffer *buff;
+ char b[1024];
+ aHit a;
+ bool isBINARY;
+} hitFile_s;
+
+
+// One hit in the list built by hitReader::loadHits().
+//
+// a -- copy of the hit read from the input file
+// coverage -- a._covered / a._numMers, limited to at most 1.0
+// multiplicity -- a._matched / a._covered
+// estid -- not assigned by hitReader.C
+// mappedIdentity, mappedCoverage -- only with WITH_ANSWERS; parsed
+// from the text after a 'Y' or 'N' in the file's buffer, else zero
+//
+typedef struct {
+ aHit a;
+ double coverage;
+ double multiplicity;
+ uint32 estid;
+#ifdef WITH_ANSWERS
+ uint32 mappedIdentity;
+ uint32 mappedCoverage;
+#endif
+} hit_s;
+
+
+
+// Reads hits from several input files and presents them one query
+// sequence at a time: each call to loadHits() gathers all hits for
+// the lowest pending query index, which can then be accessed with
+// numHits() and operator[].
+//
+class hitReader {
+public:
+ // m is the number of input files space is allocated for.
+ hitReader(int m);
+ ~hitReader();
+
+ void addInputFile(char *filename);
+
+ // loadHit() reads the next hit of one file; loadHits() fills the
+ // list and returns false when all files are exhausted.
+ void loadHit(hitFile_s *HF);
+ bool loadHits(void);
+
+ // Largest and smallest coverage seen by the last loadHits(), the
+ // query index it loaded, and the number of hits in the list.
+ double bestScore(void) { return(_bestScore); };
+ double worstScore(void) { return(_worstScore); };
+ uint32 iid(void) { return(_iid); };
+ uint32 numHits(void) { return(_listLen); };
+
+ void sortByCoverage(void);
+ void mergeOverlappingHits(void);
+
+ // Bounds-checked access to hit x; an index at or beyond numHits()
+ // prints an error and terminates the process.
+ hit_s &operator[](uint32 x) {
+ if (x >= _listLen) {
+ fprintf(stderr, "hitReader::operator[]()-- ERROR: asked for hit "uint32FMT" out of "uint32FMT".\n", x, _listLen);
+ exit(1);
+ }
+
+ return(_list[x]);
+ };
+private:
+ // Input files: allocated slots, slots in use, and the slots.
+ uint32 _filesMax;
+ uint32 _filesLen;
+ hitFile_s *_files;
+
+ // Hit list: entries in use, allocated entries, and the entries.
+ uint32 _listLen;
+ uint32 _listMax;
+ hit_s *_list;
+
+ double _bestScore;
+ double _worstScore;
+ uint32 _iid;
+};
+
+#endif // HITREADER_H
diff --git a/seagen/misc/dumpCrapSeqs.C b/seagen/misc/dumpCrapSeqs.C
new file mode 100644
index 0000000..624a5bd
--- /dev/null
+++ b/seagen/misc/dumpCrapSeqs.C
@@ -0,0 +1,42 @@
+#include "posix.H"
+#include "searchGENOME.H"
+
+// Count the sequences in a FastA file, and how many of them yield
+// no mers when encoded as a query with a mer size of 20.  Progress
+// goes to stderr, the totals to stdout.
+//
+int
+main(int argc, char **argv) {
+
+  // The sequence file name is required; argv[1] was previously used
+  // without any check.
+  //
+  if (argc < 2) {
+    fprintf(stderr, "usage: %s sequence-file\n", argv[0]);
+    return(1);
+  }
+
+  uint32         zero = 0;
+  uint32         totl = 0;
+
+  FastABuffer    B;
+  FastA         *F = new FastA(argv[1]);
+
+  for (F->first(B); !F->eof(); F->next(B)) {
+
+    // The counters are cast to match the %lu conversions.
+    //
+    if ((totl & 0xfff) == 0xfff) {
+      fprintf(stderr, "%9lu / %9lu\r", (unsigned long)totl, (unsigned long)zero);
+      fflush(stderr);
+    }
+
+    encodedQuery *Q = new encodedQuery(B.sequence(),
+                                       B.sequenceLength(),
+                                       20,
+                                       false);
+
+    totl++;
+
+    if (Q->numberOfMers() == 0)
+      zero++;
+
+    delete Q;
+  }
+
+  delete F;
+
+  fprintf(stdout, "\n");
+  fprintf(stdout, "Total: %9lu\n", (unsigned long)totl);
+  fprintf(stdout, "Zero:  %9lu\n", (unsigned long)zero);
+
+  return(0);
+}
diff --git a/seagen/misc/f.C b/seagen/misc/f.C
new file mode 100644
index 0000000..c1fafc3
--- /dev/null
+++ b/seagen/misc/f.C
@@ -0,0 +1,137 @@
+#include "posix.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+
+// Filters a hit file based on the length of the genomic region
+
+// NOTE(review): this type is not used by main() below, which works
+// on aHit from aHit.H -- presumably a leftover; confirm before
+// removing.
+//
+typedef struct {
+ uint32 dir;
+ uint32 estID;
+ uint32 scfID;
+ uint32 scfLo;
+ uint32 scfHi;
+} hit_s;
+
+
+
+// Split the hits of the given hit files into twelve output files,
+// filteredHits.0 ... filteredHits.9, filteredHits.a and
+// filteredHits.b, according to the length of the genomic region
+// (_dsHi - _dsLo) of each hit.  A histogram of the twelve bins is
+// printed to stderr at the end.
+//
+// Exits with status 1 if an input or output file cannot be opened.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc < 2) {
+    fprintf(stderr, "usage: %s ....\n", argv[0]);
+    exit(1);
+  }
+
+  // Exclusive upper bound on the length for bins 0 through 10; the
+  // last bin takes every longer hit.
+  //
+  const int   numBins      = 12;
+  const int   binLimit[11] = {    25000,    50000,   100000,   200000,
+                                 400000,   800000,  1600000,  3200000,
+                                6400000, 12800000, 25600000 };
+  const char  binSuffix[]  = "0123456789ab";
+
+  // Things for reading hits
+  //
+  char   b[1024];
+  aHit   a;
+  int    histogram[12] = {0};
+  FILE  *outf[12];
+
+  for (int i=0; i<numBins; i++) {
+    char  name[32];
+
+    snprintf(name, sizeof(name), "filteredHits.%c", binSuffix[i]);
+
+    errno   = 0;
+    outf[i] = fopen(name, "w");
+    if (outf[i] == 0L) {
+      fprintf(stderr, "ERROR opening '%s' for writing\n%s\n", name, strerror(errno));
+      exit(1);
+    }
+  }
+
+  for (int arg=1; arg < argc; arg++) {
+
+    // Open the file, fatally failing if we cannot do it.
+    //
+    errno = 0;
+    FILE *file = fopen(argv[arg], "r");
+    if (file == 0L) {
+      fprintf(stderr, "ESTmapper/filterEST-- ERROR opening '%s'\n%s\n", argv[arg], strerror(errno));
+      exit(1);
+    }
+
+    // Binary or ASCII input?  The byte is kept in an int, so that a
+    // leading 0xff is not mistaken for EOF and lost.
+    //
+    int x = fgetc(file);
+    if (x != EOF)
+      ungetc(x, file);
+
+    bool isBINARY = (x != '-');
+
+    if (isBINARY)
+      fprintf(stderr, "reading BINARY hits from '%s'\n", argv[arg]);
+    else
+      fprintf(stderr, "reading ASCII hits from '%s'\n", argv[arg]);
+
+    // Read hits until the file is exhausted.  End of file is only
+    // known after a read has failed, so test after reading; the
+    // failed read must not be processed as a hit.
+    //
+    for (;;) {
+      if (isBINARY) {
+        ahit_readBinary(&a, file);
+        if (feof(file))
+          break;
+      } else {
+        if (fgets(b, 1024, file) == 0L)
+          break;
+        ahit_parseString(&a, b);
+      }
+
+      // Find the bin, write the hit there, and count it.
+      //
+      int len = a._dsHi - a._dsLo;
+      int bin = 0;
+
+      while ((bin < numBins - 1) && (len >= binLimit[bin]))
+        bin++;
+
+      // NOTE(review): the other seagen sources call this aHit field
+      // _forward -- confirm _direction against aHit.H.
+      //
+      fprintf(outf[bin], "-%c -e %u -D %u %u %u\n", a._direction ? 'f' : 'r', a._qsIdx, a._dsIdx, a._dsLo, a._dsHi);
+      histogram[bin]++;
+    }
+
+    fclose(file);
+  }
+
+  for (int i=0; i<numBins; i++)
+    fclose(outf[i]);
+
+  for (int i=0; i<12; i++)
+    fprintf(stderr, "%2d] %d\n", i, histogram[i]);
+
+  return(0);
+}
diff --git a/seagen/misc/h.C b/seagen/misc/h.C
new file mode 100644
index 0000000..cd3b1ed
--- /dev/null
+++ b/seagen/misc/h.C
@@ -0,0 +1,105 @@
+#include "posix.H"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+
+// Generates a histogram of a hit file
+
+// NOTE(review): this type is not used by main() below, which works
+// on aHit from aHit.H -- presumably a leftover; confirm before
+// removing.
+//
+typedef struct {
+ uint32 dir;
+ uint32 estID;
+ uint32 scfID;
+ uint32 scfLo;
+ uint32 scfHi;
+} hit_s;
+
+
+// Print to stderr a ten-bin histogram of the length of the genomic
+// region (_dsHi - _dsLo) over all hits in the given hit files.
+//
+// Exits with status 1 if an input file cannot be opened.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc < 2) {
+    fprintf(stderr, "usage: %s ....\n", argv[0]);
+    exit(1);
+  }
+
+  // Exclusive upper bound on the length for bins 0 through 8; the
+  // last bin takes every longer hit.
+  //
+  const int  numBins     = 10;
+  const int  binLimit[9] = {  25000,   50000,  100000,
+                             200000,  400000,  800000,
+                            1600000, 3200000, 6400000 };
+
+  // Things for reading hits
+  //
+  char  b[1024];
+  aHit  a;
+  int   histogram[10] = {0};
+
+  for (int arg=1; arg < argc; arg++) {
+
+    // Open the file, fatally failing if we cannot do it.
+    //
+    errno = 0;
+    FILE *file = fopen(argv[arg], "r");
+    if (file == 0L) {
+      fprintf(stderr, "ESTmapper/filterEST-- ERROR opening '%s'\n%s\n", argv[arg], strerror(errno));
+      exit(1);
+    }
+
+    // Binary or ASCII input?  The byte is kept in an int, so that a
+    // leading 0xff is not mistaken for EOF and lost.
+    //
+    int x = fgetc(file);
+    if (x != EOF)
+      ungetc(x, file);
+
+    bool isBINARY = (x != '-');
+
+    if (isBINARY)
+      fprintf(stderr, "reading BINARY hits from '%s'\n", argv[arg]);
+    else
+      fprintf(stderr, "reading ASCII hits from '%s'\n", argv[arg]);
+
+    // Read hits until the file is exhausted.  End of file is only
+    // known after a read has failed, so test after reading; the
+    // failed read must not be counted as a hit.
+    //
+    for (;;) {
+      if (isBINARY) {
+        ahit_readBinary(&a, file);
+        if (feof(file))
+          break;
+      } else {
+        if (fgets(b, 1024, file) == 0L)
+          break;
+        ahit_parseString(&a, b);
+      }
+
+      // Fill the histogram
+      //
+      int len = a._dsHi - a._dsLo;
+      int bin = 0;
+
+      while ((bin < numBins - 1) && (len >= binLimit[bin]))
+        bin++;
+
+      histogram[bin]++;
+    }
+
+    fclose(file);
+  }
+
+  for (int i=0; i<10; i++)
+    fprintf(stderr, "%2d] %d\n", i, histogram[i]);
+
+  return(0);
+}
diff --git a/seagen/posix.H b/seagen/posix.H
new file mode 100644
index 0000000..e69de29
diff --git a/seagen/searchGENOME.C b/seagen/searchGENOME.C
new file mode 100644
index 0000000..6bb950e
--- /dev/null
+++ b/seagen/searchGENOME.C
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+#include "searchGENOME.H"
+
+// The run configuration, shared by the whole program. main() fills
+// it in from the command line with config.read().
+configuration config;
+
+
+// The three callbacks main() hands to the sweatShop; they are not
+// defined in this file.
+void *loaderThread(void *U);
+void searchThread(void *U, void *T, void *Q);
+void writerThread(void *U, void *Q);
+
+
+
+// Entry point of the search. In order: read the configuration,
+// open the query and database sequences, load or build the
+// positionDB (optionally saving it), build the optional mask and
+// only databases, then run the loader/search/writer callbacks in a
+// sweatShop. The times after setup, after building, and after
+// searching are recorded in the configuration.
+//
+int
+main(int argc, char **argv) {
+
+ // Read the configuration from the command line
+ //
+ config.read(argc, argv);
+
+ // Open and init the query sequence
+ //
+ if (config._beVerbose)
+ fprintf(stderr, "Opening the cDNA sequences.\n");
+
+ config._qsFASTA = new seqCache(config._qsFileName);
+ config._dbSTREAM = new seqStream(config._dbFileName);
+
+ // Complete the configuration
+ //
+ config._initTime = getTime();
+
+
+ //
+ // Build the positions
+ //
+
+ // Read in the positionDB if it's already built, or build a new one.
+ //
+ if ((config._tableFileName) && (fileExists(config._tableFileName))) {
+ if (config._tableBuildOnly) {
+ // NOTE(review): this exits with status 1 although the message
+ // reports success, while the build-only path below exits with
+ // status 0 -- confirm which status callers expect.
+ fprintf(stderr, "All done. Table '%s' already built.\n", config._tableFileName);
+ exit(1);
+ } else {
+ fprintf(stderr, "Loading positionDB state from '%s'\n", config._tableFileName);
+ config._positions = new positionDB(config._tableFileName, config._merSize, config._merSkip, 0);
+ }
+ } else {
+ // No saved table: build the positionDB from a stream of the mers
+ // in the database sequence. The merStream is only needed during
+ // construction.
+ merStream *MS = new merStream(new kMerBuilder(config._merSize),
+ config._dbSTREAM,
+ true, false);
+ config._positions = new positionDB(MS, config._merSize, config._merSkip, 0L, 0L, 0L, 0, 0, 0, 0, config._beVerbose);
+ delete MS;
+
+ if (config._tableFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Dumping positions table to '%s'\n", config._tableFileName);
+
+ config._positions->saveState(config._tableFileName);
+
+ if (config._tableBuildOnly)
+ exit(0);
+ }
+ }
+
+ // Build the masking database.
+ //
+ // Previous versions built the existDB taking the posDB as a
+ // parameter. The existDB would then exclude mers not in the
+ // posDB. A neat and nice feature, but with only 45,000 to 70,000
+ // mers in the masks, hardly worth the effort.
+ //
+ if (config._maskFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building maskDB from '%s'\n", config._maskFileName);
+ config._maskDB = new existDB(config._maskFileName, config._merSize, existDBnoFlags, 0, ~uint32ZERO);
+ }
+
+ if (config._onlyFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building onlyDB from '%s'\n", config._onlyFileName);
+ config._onlyDB = new existDB(config._onlyFileName, config._merSize, existDBnoFlags, 0, ~uint32ZERO);
+ }
+
+ config._buildTime = getTime();
+
+#if 0
+ // Maybe we don't need this anymore!
+#ifdef __alpha
+ // Start the deadlock detection threads
+ //
+ fprintf(stderr, "Deadlock detection enabled!\n");
+ pthread_create(&threadID, &threadAttr, deadlockDetector, 0L);
+ pthread_create(&threadID, &threadAttr, deadlockChecker, 0L);
+#endif
+#endif
+
+ // Set up the sweatShop with the three callbacks, one searcherState
+ // per search thread, and the configured queue sizes; then run it.
+ //
+ sweatShop *ss = new sweatShop(loaderThread,
+ searchThread,
+ writerThread);
+
+ ss->setNumberOfWorkers(config._numSearchThreads);
+
+ for (uint32 i=0; i<config._numSearchThreads; i++)
+ ss->setThreadData(i, new searcherState);
+
+ ss->setLoaderQueueSize(config._loaderQueue);
+ ss->setWriterQueueSize(config._writerQueue);
+
+ ss->run(0L, config._beVerbose);
+
+ config._searchTime = getTime();
+
+ // The configuration does most cleanup; it is a file-scope object.
+ // The sweatShop and the per-thread searcherState objects are not
+ // deleted here.
+
+ return(0);
+}
+
diff --git a/seagen/searchGENOME.H b/seagen/searchGENOME.H
new file mode 100644
index 0000000..ac0fc2d
--- /dev/null
+++ b/seagen/searchGENOME.H
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <math.h>
+
+// At one time, this was needed for pthread.h or semaphore.h
+//typedef unsigned short ushort;
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "util++.H"
+#include "bio++.H"
+#include "positionDB.H"
+#include "existDB.H"
+
+#include "hitMatrix.H"
+#include "searcherState.H"
+
+#include "configuration.H"
+
diff --git a/seagen/searcherState.H b/seagen/searcherState.H
new file mode 100644
index 0000000..f8ba11a
--- /dev/null
+++ b/seagen/searcherState.H
@@ -0,0 +1,31 @@
+
+// Per-thread scratch state for the search: a buffer of 64-bit
+// positions (posnMax entries allocated, posnLen in use) and four
+// accumulated timers.  The buffer is owned by this object.
+//
+class searcherState {
+public:
+  uint64   posnMax;
+  uint64   posnLen;
+  uint64  *posn;
+
+#ifdef __APPLE__
+  uint32   pad;
+#endif
+
+  double   encodeTime;
+  double   maskTime;
+  double   searchTime;
+  double   filterTime;
+
+  // Start with room for 16384 positions, none in use, and all timers
+  // at zero.  Initializers are in declaration order; posn is sized
+  // from posnMax.
+  //
+  searcherState() :
+    posnMax(16384),
+    posnLen(0),
+    posn(new uint64 [ posnMax ]),
+    encodeTime(0.0),
+    maskTime(0.0),
+    searchTime(0.0),
+    filterTime(0.0) {
+  }
+
+  ~searcherState() {
+    delete [] posn;
+  }
+};
diff --git a/seagen/sortHits.C b/seagen/sortHits.C
new file mode 100644
index 0000000..5993d70
--- /dev/null
+++ b/seagen/sortHits.C
@@ -0,0 +1,297 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include "aHit.H"
+#include "bio++.H"
+
+
+
+
+// Command line options. Only tmpPath needs to be global, and it can
+// be easily localized.
+//
+// beVerbose -- set by -v
+// memoryLimit -- bytes available for the in-core hit array; -m
+// gives the value in megabytes
+// tmpPath -- set by -t; handed to makeTempFile() by aHitTemporary
+//
+bool beVerbose = false;
+uint64 memoryLimit = 128 * 1024 * 1024;
+char *tmpPath = 0L;
+
+
+
+// Reads hits one at a time from a hit file, in either the binary or
+// the ASCII format.
+//
+class aHitReader {
+public:
+
+  // Open the file for reading, testing if it's binary or ascii input:
+  // a first byte of '-' means ASCII.  Terminates the process if the
+  // file cannot be opened.
+  //
+  aHitReader(char *filename) {
+
+    // The line buffer is only allocated for ASCII input, but the
+    // destructor always deletes it, so it must start out null.
+    //
+    buffer = 0L;
+
+    errno = 0;
+    theFile = fopen(filename, "r");
+    if (theFile == 0L) {
+      fprintf(stderr, "sortHits-- ERROR opening '%s': %s\n", filename, strerror(errno));
+      exit(1);
+    }
+
+    // Keep the byte in an int, so that a leading 0xff is not
+    // mistaken for EOF and lost.
+    //
+    int x = fgetc(theFile);
+    if (x != EOF)
+      ungetc(x, theFile);
+
+    isBinary = (x != '-');
+
+    if (!isBinary)
+      buffer = new char [1024];
+  };
+
+  ~aHitReader() {
+    fclose(theFile);
+    delete [] buffer;
+  };
+
+  // Read the next hit into 'hit'.  Returns false, leaving 'hit'
+  // without a valid record, when the input is exhausted.
+  //
+  bool readHit(aHit &hit) {
+    if (isBinary) {
+      ahit_readBinary(&hit, theFile);
+      return(feof(theFile) == false);
+    }
+
+    // A line was read if and only if fgets() succeeds; this also
+    // keeps a last line that has no trailing newline.
+    //
+    if (fgets(buffer, 1024, theFile) == 0L)
+      return(false);
+
+    ahit_parseString(&hit, buffer);
+
+    return(true);
+  };
+private:
+  FILE  *theFile;
+  char  *buffer;
+  bool   isBinary;
+};
+
+
+
+// Write a bunch of hits to a temporary file (unlink the file after
+// it's opened) then allow those hits to be read back in. Doesn't
+// need the aHitReader, as we use just the binary format.
+//
+// Holds a run of hits in a temporary file (obtained from
+// makeTempFile(tmpPath)) and hands them back one at a time:
+// theHit() is the current hit, nextHit() advances.  After the last
+// hit has been consumed, every field of the current hit is set to
+// its maximal value.
+//
+class aHitTemporary {
+public:
+
+  // Write hits[0..hitsLen-1] to a new temporary file, rewind it, and
+  // load the first hit.  Terminates the process on a write error.
+  //
+  aHitTemporary(aHit *hits, uint32 hitsLen) {
+    theFile = makeTempFile(tmpPath);
+
+    // XXX: Known bug on Tru64: fwrite() of data blocks > 2GB is broken
+    //
+    // So write in chunks of about 1MB.  Each chunk starts where the
+    // previous one ended, and a short write is an error.
+
+    const uint32  chunkMax  = 1024 * 1024 / sizeof(aHit);
+    uint32        outputPos = 0;
+
+    while (outputPos < hitsLen) {
+      uint32 outputLen = hitsLen - outputPos;
+
+      if (outputLen > chunkMax)
+        outputLen = chunkMax;
+
+      errno = 0;
+      size_t written = fwrite(hits + outputPos, sizeof(aHit), outputLen, theFile);
+
+      if (written != outputLen) {
+        fprintf(stderr, "ERROR: sortHits()-- Failed to write temporary file: %s\n", strerror(errno));
+        exit(1);
+      }
+
+      outputPos += outputLen;
+    }
+
+    rewind(theFile);
+
+    hit._forward   = false;
+    hit._merged    = false;
+    hit._qsIdx     = uint32ZERO;
+    hit._dsIdx     = uint32ZERO;
+    hit._dsLo      = uint32ZERO;
+    hit._dsHi      = uint32ZERO;
+    hit._covered   = uint32ZERO;
+    hit._matched   = uint32ZERO;
+    hit._numMers   = uint32ZERO;
+
+    nextHit();
+  };
+
+  ~aHitTemporary() {
+    fclose(theFile);
+  };
+
+  aHit  *theHit(void) {
+    return(&hit);
+  };
+
+  // Load the next hit from the file.  Does nothing once the end of
+  // the file has been seen.  Terminates the process on a read error.
+  //
+  void   nextHit(void) {
+    if (hit._qsIdx == ~uint32ZERO)
+      return;
+
+    errno = 0;
+    size_t got = fread(&hit, sizeof(aHit), 1, theFile);
+
+    if ((got != 1) && (ferror(theFile))) {
+      fprintf(stderr, "ERROR: sortHits()-- Failed to read a hit: %s\n", strerror(errno));
+      exit(1);
+    }
+
+    // If no complete hit was read we are at eof; this hit is
+    // invalid, and so are all future ones.  Set hit to be junk.
+    //
+    if (got != 1) {
+      hit._forward   = false;
+      hit._merged    = false;
+      hit._qsIdx     = ~uint32ZERO;
+      hit._dsIdx     = ~uint32ZERO;
+      hit._dsLo      = ~uint32ZERO;
+      hit._dsHi      = ~uint32ZERO;
+      hit._covered   = ~uint32ZERO;
+      hit._matched   = ~uint32ZERO;
+      hit._numMers   = ~uint32ZERO;
+    }
+  };
+private:
+  FILE  *theFile;
+  aHit   hit;
+};
+
+
+
+
+
+
+// qsort() comparator for aHit: orders by _dsIdx, then _qsIdx, then
+// _dsLo, each ascending.
+//
+int
+hitcmp(const void *a, const void *b) {
+  const aHit *x = (const aHit *)a;
+  const aHit *y = (const aHit *)b;
+
+  if (x->_dsIdx != y->_dsIdx)
+    return((x->_dsIdx < y->_dsIdx) ? -1 : 1);
+
+  if (x->_qsIdx != y->_qsIdx)
+    return((x->_qsIdx < y->_qsIdx) ? -1 : 1);
+
+  if (x->_dsLo != y->_dsLo)
+    return((x->_dsLo < y->_dsLo) ? -1 : 1);
+
+  return(0);
+}
+
+
+
+
+// Spill hits[0..hitsLen-1] to a new temporary file and append it to
+// the list tmpF, doubling the list when it is full.
+//
+static
+void
+addTemporary(aHitTemporary **&tmpF, uint32 &tmpFlen, uint32 &tmpFmax,
+             aHit *hits, uint32 hitsLen) {
+
+  if (tmpFlen >= tmpFmax) {
+    tmpFmax *= 2;
+    aHitTemporary **tmp = new aHitTemporary * [tmpFmax];
+
+    // The list holds pointers, so copy pointers; copying whole
+    // aHitTemporary objects ran past the end of both arrays.
+    //
+    memcpy(tmp, tmpF, sizeof(aHitTemporary *) * tmpFlen);
+    delete [] tmpF;
+    tmpF = tmp;
+  }
+
+  tmpF[tmpFlen] = new aHitTemporary(hits, hitsLen);
+  tmpFlen++;
+}
+
+
+// Sort the hits of all input files (see hitcmp for the order) and
+// write them, in ASCII, to stdout.  Hits are held in memory up to
+// the memory limit; whenever that is reached, the sorted batch is
+// spilled to a temporary file, and the batches are merged at the end.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc < 4) {
+    fprintf(stderr, "usage: %s [-v] [-m memorylimit] [-t temppath] hitfile1 hitfile2 ... > sorted-hits\n", argv[0]);
+    fprintf(stderr, "       memory limit is MB\n");
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-v", 2) == 0) {
+      beVerbose = true;
+    } else if (strncmp(argv[arg], "-m", 2) == 0) {
+      if (++arg >= argc) {
+        fprintf(stderr, "%s: option -m needs a value\n", argv[0]);
+        exit(1);
+      }
+      memoryLimit   = atoi(argv[arg]);
+      memoryLimit <<= 20;
+    } else if (strncmp(argv[arg], "-t", 2) == 0) {
+      if (++arg >= argc) {
+        fprintf(stderr, "%s: option -t needs a value\n", argv[0]);
+        exit(1);
+      }
+      tmpPath = argv[arg];
+    } else {
+      // Must be at the first file name.  Break.
+      break;
+    }
+    arg++;
+  }
+
+  // Allocate a bunch of spaces to store hits.  There must be room
+  // for at least one, as each hit is read directly into the array.
+  //
+  uint64   hitsMax = memoryLimit / sizeof(aHit);
+  uint32   hitsPos = 0;
+
+  if (hitsMax == 0) {
+    fprintf(stderr, "%s: memory limit is too small to hold a single hit\n", argv[0]);
+    exit(1);
+  }
+
+  aHit    *hits    = new aHit [hitsMax];
+
+  uint32           tmpFlen = 0;
+  uint32           tmpFmax = 1024;
+  aHitTemporary  **tmpF    = new aHitTemporary * [tmpFmax];
+
+  while (arg < argc) {
+    aHitReader *R = new aHitReader(argv[arg]);
+    arg++;
+
+    // Read hits until we exhaust our space, then sort and dump to disk.
+    //
+    while (R->readHit(hits[hitsPos])) {
+      hitsPos++;
+
+      if (hitsPos == hitsMax) {
+        qsort(hits, hitsPos, sizeof(aHit), hitcmp);
+
+        addTemporary(tmpF, tmpFlen, tmpFmax, hits, hitsPos);
+
+        hitsPos = 0;
+      }
+    }
+
+    delete R;
+  }
+
+  // All done reading.  If we have stuff to sort, sort it.
+  //
+  if (hitsPos > 0)
+    qsort(hits, hitsPos, sizeof(aHit), hitcmp);
+
+  // No temporary files?  Just write the hits and exit.  We're done.
+  //
+  if (tmpFlen == 0) {
+    for (uint32 i=0; i<hitsPos; i++)
+      ahit_printASCII(hits+i, stdout);
+    exit(0);
+  }
+
+
+  // We have at least one temporary file already on disk, so to make things
+  // easier, we write out the current set of hits and do an all disk merge.
+
+  addTemporary(tmpF, tmpFlen, tmpFmax, hits, hitsPos);
+
+
+  // While there is still input, merge to the output
+  //
+  bool moreInput = true;
+
+  while (moreInput) {
+
+    // Pick the smallest hit -- if file [i] is finished, then hit[i]
+    // is bogus and all the values are set to maximal values.
+    //
+    uint32 smallestHit = 0;
+
+    for (uint32 nh = smallestHit+1; nh < tmpFlen; nh++) {
+      if (hitcmp(tmpF[smallestHit]->theHit(), tmpF[nh]->theHit()) > 0)
+        smallestHit = nh;
+    }
+
+    // If the smallest hit is invalid, we're done.  Otherwise, write
+    // the hit, and read a new one.
+    //
+    if (tmpF[smallestHit]->theHit()->_qsIdx == ~uint32ZERO) {
+      moreInput = false;
+    } else {
+      ahit_printASCII(tmpF[smallestHit]->theHit(), stdout);
+      tmpF[smallestHit]->nextHit();
+    }
+  }
+
+  // Should clean up, I know.
+
+  return(0);
+}
diff --git a/seagen/test/encodedQueryTest.C b/seagen/test/encodedQueryTest.C
new file mode 100644
index 0000000..11d223f
--- /dev/null
+++ b/seagen/test/encodedQueryTest.C
@@ -0,0 +1,48 @@
+#include "bio++.H"
+#include "encodedQuery.H"
+
+// Build 100000 random sequences of 9999 bases (about one base in a
+// hundred replaced by 'n') and run encodedQuery::test() on each,
+// using a mer size of 22.
+//
+// NOTE(review): seq and hdr are not deleted here -- presumably
+// seqInCore takes ownership; confirm.
+//
+static
+void
+testRandomSequences(void) {
+  mt_s *mt = mtInit(time(0L));
+
+  fprintf(stderr, "Building random sequences for testing.\n");
+
+  for (uint32 i=0; i<100000; i++) {
+    char *seq = new char [10000];
+    char *hdr = new char [128];
+
+    for (uint32 j=0; j<10000; j++) {
+      seq[j] = decompressSymbol[mtRandom32(mt) % 4];
+      if (mtRandomRealOpen(mt) < 0.01)
+        seq[j] = 'n';
+    }
+    seq[9999] = 0;
+
+    sprintf(hdr, ">"uint32FMT, i);
+
+    seqInCore     *S = new seqInCore(i, hdr, strlen(hdr), seq, 9999);
+    encodedQuery  *Q = new encodedQuery(S, 22);
+
+    Q->test(S);
+
+    delete Q;
+    delete S;
+  }
+}
+
+
+// Run encodedQuery::test() on every sequence of the named file,
+// using a mer size of 22.
+//
+static
+void
+testSequenceFile(char *filename) {
+  seqCache *F = new seqCache(filename);
+
+  while (F->eof() == false) {
+    seqInCore     *S = F->getSequenceInCore();
+    encodedQuery  *Q = new encodedQuery(S, 22);
+
+    Q->test(S);
+
+    delete Q;
+    delete S;
+  }
+
+  delete F;
+}
+
+
+// With no arguments, test on random sequences; otherwise test on
+// the sequences of the file named by the first argument.
+//
+int
+main(int argc, char **argv) {
+
+  if (argc != 1)
+    testSequenceFile(argv[1]);
+  else
+    testRandomSequences();
+
+  exit(0);
+}
diff --git a/seagen/test/intervalList-test.C b/seagen/test/intervalList-test.C
new file mode 100644
index 0000000..8ad2885
--- /dev/null
+++ b/seagen/test/intervalList-test.C
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <math.h>
+#include <sys/time.h>
+
+#define TEST_INTERVAL_LIST
+#define TEST_SIZE 2000
+#define TEST_ITERS 1000
+
+#include "libbri.H"
+#include "intervalList.H"
+
+void
+fixedTest(void) {
+ intervalList G(10);
+ // Adds a fixed series of starts, dumping the list after each add. Presumably intervals are 10 long (the +10 in each message) -- confirm.
+ G.addInterval(110); fprintf(stderr, "Adding %3d -> %3d:\t", 110, 110+10); G.dump();
+ G.addInterval(130); fprintf(stderr, "Adding %3d -> %3d:\t", 130, 130+10); G.dump();
+ G.addInterval(105); fprintf(stderr, "Adding %3d -> %3d:\t", 105, 105+10); G.dump();
+ G.addInterval(115); fprintf(stderr, "Adding %3d -> %3d:\t", 115, 115+10); G.dump();
+ G.addInterval(124); fprintf(stderr, "Adding %3d -> %3d:\t", 124, 124+10); G.dump();
+ G.addInterval( 50); fprintf(stderr, "Adding %3d -> %3d:\t", 50, 50+10); G.dump();
+ G.addInterval(200); fprintf(stderr, "Adding %3d -> %3d:\t", 200, 200+10); G.dump();
+ G.addInterval(150); fprintf(stderr, "Adding %3d -> %3d:\t", 150, 150+10); G.dump();
+ G.addInterval(205); fprintf(stderr, "Adding %3d -> %3d:\t", 205, 205+10); G.dump();
+ G.addInterval(195); fprintf(stderr, "Adding %3d -> %3d:\t", 195, 195+10); G.dump();
+ G.addInterval( 61); fprintf(stderr, "Adding %3d -> %3d:\t", 61, 61+10); G.dump();
+ G.addInterval( 72); fprintf(stderr, "Adding %3d -> %3d:\t", 72, 72+10); G.dump();
+ G.addInterval( 65); fprintf(stderr, "Adding %3d -> %3d:\t", 65, 65+10); G.dump();
+ G.addInterval( 83); fprintf(stderr, "Adding %3d -> %3d:\t", 83, 83+10); G.dump();
+ G.addInterval( 94); fprintf(stderr, "Adding %3d -> %3d:\t", 94, 94+10); G.dump();
+ G.addInterval( 75); fprintf(stderr, "Adding %3d -> %3d:\t", 75, 75+10); G.dump();
+ G.addInterval( 84); fprintf(stderr, "Adding %3d -> %3d:\t", 84, 84+10); G.dump();
+ G.addInterval(104); fprintf(stderr, "Adding %3d -> %3d:\t", 104, 104+10); G.dump();
+ G.addInterval(114); fprintf(stderr, "Adding %3d -> %3d:\t", 114, 114+10); G.dump();
+ G.addInterval(124); fprintf(stderr, "Adding %3d -> %3d:\t", 124, 124+10); G.dump();
+ G.addInterval(134); fprintf(stderr, "Adding %3d -> %3d:\t", 134, 134+10); G.dump();
+ G.addInterval(144); fprintf(stderr, "Adding %3d -> %3d:\t", 144, 144+10); G.dump();
+ G.addInterval( 51); fprintf(stderr, "Adding %3d -> %3d:\t", 51, 51+10); G.dump();
+ G.addInterval(161); fprintf(stderr, "Adding %3d -> %3d:\t", 161, 161+10); G.dump();
+ G.addInterval(172); fprintf(stderr, "Adding %3d -> %3d:\t", 172, 172+10); G.dump();
+ G.addInterval(183); fprintf(stderr, "Adding %3d -> %3d:\t", 183, 183+10); G.dump();
+ G.addInterval(156); fprintf(stderr, "Adding %3d -> %3d:\t", 156, 156+10); G.dump();
+ G.addInterval(166); fprintf(stderr, "Adding %3d -> %3d:\t", 166, 166+10); G.dump();
+ G.addInterval(176); fprintf(stderr, "Adding %3d -> %3d:\t", 176, 176+10); G.dump();
+ G.addInterval(186); fprintf(stderr, "Adding %3d -> %3d:\t", 186, 186+10); G.dump();
+ G.addInterval( 0); fprintf(stderr, "Adding %3d -> %3d:\t", 0, 0+10); G.dump();
+ G.addInterval( 0); fprintf(stderr, "Adding %3d -> %3d:\t", 0, 0+10); G.dump();
+ G.addInterval( 1); fprintf(stderr, "Adding %3d -> %3d:\t", 1, 1+10); G.dump();
+ G.addInterval( 2); fprintf(stderr, "Adding %3d -> %3d:\t", 2, 2+10); G.dump();
+ G.addInterval(300); fprintf(stderr, "Adding %3d -> %3d:\t", 300, 300+10); G.dump();
+ G.addInterval(320); fprintf(stderr, "Adding %3d -> %3d:\t", 320, 320+10); G.dump();
+ G.addInterval(280); fprintf(stderr, "Adding %3d -> %3d:\t", 280, 280+10); G.dump();
+ G.addInterval( 20); fprintf(stderr, "Adding %3d -> %3d:\t", 20, 20+10); G.dump();
+}
+
+
+int
+main(int argc, char **argv) {
+
+  fixedTest();
+
+  srand48(237831);
+
+  // Randomized test; loops forever (the original used 'goto loop'), so it must be stopped by hand.
+  for (;;) {
+
+#if 0
+    intervalList *G = new intervalList(10);
+    for (uint32 i=0; i<TEST_ITERS; i++) {
+      G->addInterval(floor(drand48() * (TEST_SIZE - 10)));
+      G->test();
+    }
+    G->dump();
+    delete G;
+#endif
+
+    intervalList *A = new intervalList(10);
+    intervalList *B = new intervalList(10);
+    intervalList *C = new intervalList(10);
+    // Every random start is added to C, and to exactly one of A or B.
+    for (uint32 i=0; i<TEST_ITERS; i++) {
+      uint32 j = floor(drand48() * (TEST_SIZE - 10));
+
+      C->addInterval(j);
+      if (drand48() < 0.5)
+        A->addInterval(j);
+      else
+        B->addInterval(j);
+    }
+
+    fprintf(stderr, "A & B ----------------------------------------\n");
+    A->dump();
+    B->dump();
+
+    A->merge(B);
+
+    fprintf(stderr, "A & C ----------------------------------------\n");
+    A->dump();
+    C->dump();
+    // Presumably checks that A, after merging B, matches C -- confirm against intervalList::compare().
+    A->compare(C);
+
+    delete A;
+    delete B;
+    delete C;
+
+  }
+}
diff --git a/seagen/thr-deadlock.C b/seagen/thr-deadlock.C
new file mode 100644
index 0000000..b194f2f
--- /dev/null
+++ b/seagen/thr-deadlock.C
@@ -0,0 +1,470 @@
+#include "searchGENOME.H"
+
+// OSF/1 on Compaq Alpha has, in the past, gotten stuck in a deadlock
+// situation allocating memory. There's lots of debugging stuff at
+// the end of this file.
+
+#ifdef __alpha
+
+// Define this to kill the process with a vengeance instead of
+// gracefully exiting. exit() tries to free memory, and thus gets
+// caught in the deadlock -- but is useful for debugging.
+//
+#define KILL_INSTEAD_OF_EXIT
+
+#ifdef KILL_INSTEAD_OF_EXIT
+#include <signal.h>
+#endif
+
+uint32 deadlockTested = 0;  // set to 1 by deadlockDetector() just before its test allocation; reset to 0 by deadlockChecker()
+uint32 deadlockPassed = 0;  // set to 1 by deadlockDetector() after its new/delete returned; reset to 0 by deadlockChecker()
+
+void*
+deadlockDetector(void *) {
+
+  fprintf(stderr, "Hello! I'm a deadlockDetector!\n");
+
+  for (;;) {
+
+    // Hold off until deadlockChecker has cleared both flags.
+    //
+    while (!((deadlockTested != 1) && (deadlockPassed != 1)))
+      sleep(4);
+
+    // Probe the allocator; if new/delete never return, deadlockPassed stays 0.
+    deadlockTested = 1;
+    char *probe = new char [16];
+    delete [] probe;
+    deadlockPassed = 1;
+
+  }
+
+  return(0L);  // Unreachable; expect (and ignore) a compiler warning.
+}
+
+void*
+deadlockChecker(void *) {
+
+  fprintf(stderr, "Hello! I'm a deadlockChecker!\n");
+
+  for (;;) {
+
+    // Sleep until deadlockDetector signals that a test has started.
+    //
+    while (!deadlockTested)
+      sleep(5);
+
+    // Allow the test five more seconds to finish.
+    //
+    sleep(5);
+
+    if (!deadlockPassed) {
+      fprintf(stderr, "\n\n\nESTmapper/search-- Deadlock detected! Aborting the process!\n\n");
+      fflush(stderr);
+#ifdef KILL_INSTEAD_OF_EXIT
+      kill(getpid(), SIGKILL);
+#endif
+      exit(1);
+    }
+
+    //fprintf(stderr, "Deadlock OK\n");
+
+    // The test passed; clear both flags so the detector can run again.
+    //
+    deadlockPassed = 0;
+    deadlockTested = 0;
+
+  }
+
+  return(0L);  // Unreachable; expect (and ignore) a compiler warning.
+}
+
+#endif // __alpha
+
+
+
+
+
+
+
+
+
+
+#ifdef DONT_EVER_ENABLE_THIS
+
+//
+// Here are some notes on what was tried, and the stack trace from a lock.
+// This test failed to find the cause.
+//
+
+#define SIZE (16 * 1024 * 1024)
+
+void*
+mallocStressor(void *) {
+  struct timespec sleepAmt = { 0, 10000 };
+  unsigned long v = 0;
+
+  fprintf(stderr, "Hello! I'm a mallocStressor!\n");
+  // Forever: allocate SIZE bytes, write and read back every byte, then free the buffer.
+ mallocAgain:
+  //nanosleep(&sleepAmt, 0L);
+  char *x = new char [SIZE];
+  for (unsigned int i=SIZE; i--; )
+    x[i] = i >> 5;
+  for (unsigned int i=SIZE; i--; )
+    x[i] |= x[SIZE-1-i];  // mirrored index; the old SIZE-i read x[SIZE], one past the end, when i == 0
+  for (unsigned int i=SIZE; i--; )
+    v += x[i];
+  delete [] x;
+
+  goto mallocAgain;
+
+  return((void*)v); // Ignore the warning!
+}
+
+int
+main(int argc, char **argv) {
+  pthread_attr_t threadAttr;
+  pthread_t threadID;
+  // Every thread below is created detached, with system scope and SCHED_OTHER.
+  pthread_attr_init(&threadAttr);
+  pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
+  pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED);
+  pthread_attr_setschedpolicy(&threadAttr, SCHED_OTHER);
+
+  pthread_create(&threadID, &threadAttr, deadlockDetector, 0L);
+  pthread_create(&threadID, &threadAttr, deadlockChecker, 0L);
+  // Sixteen stressors hammer the allocator while the detector/checker pair watches for a hang.
+  for (unsigned int i=0; i<16; i++)
+    pthread_create(&threadID, &threadAttr, mallocStressor, (void *)i);
+
+  sleep(100);
+}
+
+//
+//
+// Stack trace #1
+//
+//
+
+(ladebug) show thread
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+* 1 default thread blocked kern usleep SCHED_OTHER 19
+ -1 manager thread blk SCS SCHED_RR 19
+ -2 null thread for VP 2 running VP 2 null thread -1
+> 2 <anonymous> blocked kern usleep SCHED_OTHER 19
+ -3 null thread for VP 3 running VP 3 null thread -1
+ 3 <anonymous> blocked mut 9 SCHED_OTHER 19
+ -4 null thread for VP 4 running VP 4 null thread -1
+ 4 <anonymous> blocked mut 9 SCHED_OTHER 19
+ -5 null thread for VP 5 running VP 5 null thread -1
+ 5 <anonymous> blocked mut 9 SCHED_OTHER 19
+ 6 <anonymous> blocked mut 9 SCHED_OTHER 19
+(ladebug) thread 6
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 6 <anonymous> blocked mut 9 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805caf3c in __hstTransferRegisters(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#1 0x3ff805af74c in __osTransferContext(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#2 0x3ff805a3c50 in __dspTransferContext(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#3 0x3ff805a12f4 in __dspDispatch(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#4 0x3ff805ab90c in UnknownProcedure32FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#5 0x3ff805ab2f4 in UnknownProcedure31FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#6 0x3ff805abe3c in UnknownProcedure34FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x20001e2fc40) in /usr/shlib/libpthread.so
+#7 0x3ff801bf6a0 in UnknownProcedure2FromFile22(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in /usr/shlib/lib
+c.so
+#8 0x3ff800cdad4 in malloc(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in /usr/shlib/libc.so
+#9 0x3ff81f300e8 in operator new(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in /usr/lib/cmplrs/cxx/libcxx.so
+#10 0x12000a53c in filter(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in searchGENOME
+#11 0x120008a7c in doSearch(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in searchGENOME
+#12 0x120008e2c in searchThread(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in searchGENOME
+#13 0x3ff805bd2c8 in __thdBase(0x3ffc01b8400, 0x3ff805ae6d0, 0xfffffffffffffffc, 0x40100000, 0x222384c, 0x20001e2fc40) in /usr/shlib/libpthread.so
+(ladebug) thread 5
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 5 <anonymous> blocked mut 9 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805caf3c in __hstTransferRegisters(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/libpthre
+ad.so
+#1 0x3ff805af74c in __osTransferContext(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/libpthread.
+so
+#2 0x3ff805a3c50 in __dspTransferContext(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/libpthread
+.so
+#3 0x3ff805a12f4 in __dspDispatch(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/libpthread.so
+#4 0x3ff805ab90c in UnknownProcedure32FromFile8(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/lib
+pthread.so
+#5 0x3ff805ab2f4 in UnknownProcedure31FromFile8(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/lib
+pthread.so
+#6 0x3ff805abe3c in UnknownProcedure34FromFile8(0x3ffc01b8400, 0x3ff805a1ce4, 0x3ff805a3608, 0x20000a0f600, 0x140028000, 0x20000a0f600) in /usr/shlib/lib
+pthread.so
+#7 0x3ff801be4f0 in UnknownProcedure12FromFile22(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in /usr/shlib/libc.so
+#8 0x3ff800cf2b0 in free(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in /usr/shlib/libc.so
+#9 0x3ff81f15a7c in operator delete(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in /usr/lib/cmplrs/cxx/libcxx.so
+#10 0x12000b090 in filter(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in searchGENOME
+#11 0x120008a7c in doSearch(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in searchGENOME
+#12 0x120008dec in searchThread(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in searchGENOME
+#13 0x3ff805bd2c8 in __thdBase(0x3ffc01b8400, 0x3ff805a1ce4, 0x140a275c0, 0x100000, 0x3700498c55, 0x20000a0f600) in /usr/shlib/libpthread.so
+(ladebug) thread 4
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 4 <anonymous> blocked mut 9 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805caf3c in __hstTransferRegisters(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#1 0x3ff805af74c in __osTransferContext(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#2 0x3ff805a3c50 in __dspTransferContext(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#3 0x3ff805a12f4 in __dspDispatch(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#4 0x3ff805ab90c in UnknownProcedure32FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#5 0x3ff805ab2f4 in UnknownProcedure31FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#6 0x3ff805abe3c in UnknownProcedure34FromFile8(0x3ffc01b8400, 0x3ff805ae6d0, 0x3ff805a33a4, 0x0, 0x0, 0x2000141fc40) in /usr/shlib/libpthread.so
+#7 0x3ff801bf6a0 in UnknownProcedure2FromFile22(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in /usr/shlib/libc.so
+#8 0x3ff800cdad4 in malloc(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in /usr/shlib/libc.so
+#9 0x3ff81f300e8 in operator new(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in /usr/lib/cmplrs/cxx/libcxx.so
+#10 0x12000a53c in filter(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in searchGENOME
+#11 0x120008a7c in doSearch(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in searchGENOME
+#12 0x120008dec in searchThread(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in searchGENOME
+#13 0x3ff805bd2c8 in __thdBase(0x3ffc01b8400, 0x3ff805ae6d0, 0x1, 0x40100000, 0x31c, 0x2000141fc40) in /usr/shlib/libpthread.so
+(ladebug) thread 3
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 3 <anonymous> blocked mut 9 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805caf3c in __hstTransferRegisters(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#1 0x3ff805af74c in __osTransferContext(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#2 0x3ff805a3c50 in __dspTransferContext(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#3 0x3ff805a12f4 in __dspDispatch(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#4 0x3ff805ab90c in UnknownProcedure32FromFile8(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#5 0x3ff805ab2f4 in UnknownProcedure31FromFile8(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#6 0x3ff805abe3c in UnknownProcedure34FromFile8(0x3ffc01b8400, 0x13, 0x3ff805a33a4, 0x0, 0x0, 0x2000283fc40) in /usr/shlib/libpthread.so
+#7 0x3ff801bf6a0 in UnknownProcedure2FromFile22(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libc.so
+#8 0x3ff800cdad4 in malloc(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libc.so
+#9 0x3ff81f32050 in UnknownProcedure3FromFile46(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/lib/cmplrs/cxx/libc
+xx.so
+#10 0x3ff81f34190 in __cxx_v60_dispatch__X4need3new8libcxxso(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/lib/cmp
+lrs/cxx/libcxx.so
+#11 0x3ff807f29d4 in UnknownProcedure11FromFile0(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libexc.so
+#12 0x3ff807f2cd8 in exc_dispatch_exception(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libexc.so
+#13 0x3ff807f39e0 in exc_raise_signal_exception(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libexc.so
+#14 0x3ff805b9470 in UnknownProcedure8FromFile16(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libpthread.so
+#15 0x3ff800d0b9c in __sigtramp(0x3ffc01b8400, 0x13, 0x20001927600, 0x40100000, 0x3ff81f34150, 0x2000283fc40) in /usr/shlib/libc.so
+#16 0x3ff801be2c0 in UnknownProcedure12FromFile22(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in /usr/shlib/libc.so
+#17 0x3ff800cf2b0 in free(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in /usr/shlib/libc.so
+#18 0x3ff81f15a7c in operator delete(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in /usr/lib/cmplrs/cxx/libcxx.so
+#19 0x12000b090 in filter(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in searchGENOME
+#20 0x120008a7c in doSearch(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in searchGENOME
+#21 0x120008dec in searchThread(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in searchGENOME
+#22 0x3ff805bd2c8 in __thdBase(0x3ffc0086f90, 0x0, 0x100000, 0x0, 0x3f003d2689, 0xa8003d2689) in /usr/shlib/libpthread.so
+(ladebug) thread 2
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 2 <anonymous> blocked kern usleep SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff800e5c38 in __usleep_thread(0x20000f15a40, 0x0, 0x0, 0x0, 0x0, 0x356) in /usr/shlib/libc.so
+#1 0x3ff801b3314 in __usleep(0x20000f15a40, 0x0, 0x0, 0x0, 0x0, 0x356) in /usr/shlib/libc.so
+#2 0x1200091ac in loaderThread(0x20000f15a40, 0x0, 0x0, 0x0, 0x0, 0x356) in searchGENOME
+#3 0x3ff805bd2c8 in __thdBase(0x20000f15a40, 0x0, 0x0, 0x0, 0x0, 0x356) in /usr/shlib/libpthread.so
+(ladebug) thread 1
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+>* 1 default thread blocked kern usleep SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff800e5c38 in __usleep_thread(0x11fffbe88, 0x0, 0x5254, 0x0, 0x0, 0x140002408) in /usr/shlib/libc.so
+#1 0x3ff801b3314 in __usleep(0x11fffbe88, 0x0, 0x5254, 0x0, 0x0, 0x140002408) in /usr/shlib/libc.so
+#2 0x12000661c in main(0x11fffbe88, 0x0, 0x5254, 0x0, 0x0, 0x140002408) in searchGENOME
+#3 0x1200055c8 in __start(0x11fffbe88, 0x0, 0x5254, 0x0, 0x0, 0x140002408) in searchGENOME
+
+//
+//
+// Stack trace #2
+//
+//
+
+(ladebug) show thread
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+>* 4 <anonymous> blocked kern usleep SCHED_OTHER 19
+ 1 default thread blocked mut 15 SCHED_OTHER 19
+ -1 manager thread blk SCS SCHED_RR 19
+ -2 null thread for VP 2 running VP 2 null thread -1
+ 2 <anonymous> blocked mut 15 SCHED_OTHER 19
+ -3 null thread for VP 3 running VP 3 null thread -1
+ 3 <anonymous> blocked mut 15 SCHED_OTHER 19
+ -4 null thread for VP 4 running VP 4 null thread -1
+ 5 <anonymous> blocked mut 15 SCHED_OTHER 19
+ -5 null thread for VP 5 running VP 5 null thread -1
+ 6 <anonymous> blocked mut 15 SCHED_OTHER 19
+ 7 <anonymous> blocked mut 15 SCHED_OTHER 19
+ 8 <anonymous> blocked mut 15 SCHED_OTHER 19
+
+
+
+(ladebug) show mutex
+Mutex Name State Owner Pri Type Waiters (+Count)
+------ ------------------------- ----- ------ --- -------- --------------------
+ 1 Once Normal
+ 2 debugger client registry Normal
+ 3 VM stats Normal
+ 4 key creation Normal
+ 5 malloc heap Normal
+ 6 malloc hash Normal
+ 7 malloc cache[0] Normal
+ 8 malloc cache[1] Normal
+ 9 malloc cache[2] Normal
+ 10 malloc cache[3] Normal
+ 11 malloc cache[4] Normal
+ 12 malloc cache[5] Normal
+ 13 malloc cache[6] Normal
+ 14 malloc cache[7] Normal
+ 15 malloc cache[8] Lock Normal 6, 7, 8, 1, 5, 2, 3
+ 16 malloc cache[9] Normal
+ 17 malloc cache[10] Normal
+ 18 malloc cache[11] Normal
+ 19 malloc cache[12] Normal
+ 20 malloc cache[13] Normal
+ 21 malloc cache[14] Normal
+ 22 malloc cache[15] Normal
+ 23 malloc cache[16] Normal
+ 24 malloc cache[17] Normal
+ 25 malloc cache[18] Normal
+ 26 malloc cache[19] Normal
+ 27 malloc cache[20] Normal
+ 28 malloc cache[21] Normal
+ 29 malloc cache[22] Normal
+ 30 malloc cache[23] Normal
+ 31 malloc cache[24] Normal
+ 32 malloc cache[25] Normal
+ 33 malloc cache[26] Normal
+ 34 malloc cache[27] Normal
+ 35 malloc cache[28] Normal
+ 36 brk Normal
+ 37 exc cr Recurs
+ 38 exc read rwl Normal
+ 39 VM 0 lookaside Normal
+ 40 VM 1 lookaside Normal
+ 41 VM 2 lookaside Normal
+ 42 VM 3 lookaside Normal
+ 43 VM 4 lookaside Normal
+ 44 VM 5 lookaside Normal
+ 45 VM 6 lookaside Normal
+ 46 VM 0 cache Normal
+ 47 VM 1 cache Normal
+ 48 VM 2 cache Normal
+ 49 Global lock Recurs
+ 50 ldr Recurs
+ 51 <anonymous> Recurs
+ 52 stderr Recurs
+ 53 stdout Recurs
+ 54 <anonymous> Recurs
+ 55 <anonymous> Recurs
+ 56 inputTailMutex(0x14000105 Normal
+ 57 queryMatchMutex(0x1400010 Normal
+
+
+
+
+
(ladebug) thread 1
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 1 default thread blocked mut 15 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805ba8ac in __hstTransferRegisters(0x20002d47600, 0x0, 0x0, 0x100000000, 0x20002d47c40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#1 0x3ff805acf74 in __osTransferContext(0x20002d47600, 0x0, 0x0, 0x100000000, 0x20002d47c40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#2 0x3ff805a004c in __dspDispatch(0x20002d47600, 0x0, 0x0, 0x100000000, 0x20002d47c40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#3 0x3ff805a94e4 in UnknownProcedure146FromFile0(0x20002d47600, 0x0, 0x0, 0x100000000, 0x20002d47c40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#4 0x3ff805a9bf8 in UnknownProcedure148FromFile0(0x20002d47600, 0x0, 0x0, 0x100000000, 0x20002d47c40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#5 0x3ff801bed30 in UnknownProcedure12FromFile22(0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in /usr/shlib/libc.so
+#6 0x3ff800cf2c0 in free(0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in /usr/shlib/libc.so
+#7 0x3ff81f15a7c in operator delete(0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#8 0x3ff81f2f53c in operator delete[](0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#9 0x1200073fc in main(0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in searchGENOME
+#10 0x120006088 in __start(0x20002d47600, 0x0, 0x0, 0x0, 0x0, 0x3ff805ac400) in searchGENOME
+
+
(ladebug) thread 2
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 2 <anonymous> blocked mut 15 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805ba8ac in __hstTransferRegisters(0x2000141f600, 0x0, 0x0, 0x100000000, 0x3ff805a0194, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#1 0x3ff805acf74 in __osTransferContext(0x2000141f600, 0x0, 0x0, 0x100000000, 0x3ff805a0194, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#2 0x3ff805a004c in __dspDispatch(0x2000141f600, 0x0, 0x0, 0x100000000, 0x3ff805a0194, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#3 0x3ff805a94e4 in UnknownProcedure146FromFile0(0x2000141f600, 0x0, 0x0, 0x100000000, 0x3ff805a0194, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#4 0x3ff805a9bf8 in UnknownProcedure148FromFile0(0x2000141f600, 0x0, 0x0, 0x100000000, 0x3ff805a0194, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#5 0x3ff801bfee0 in UnknownProcedure2FromFile22(0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in /usr/shlib/libc.so
+#6 0x3ff800cdae4 in malloc(0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in /usr/shlib/libc.so
+#7 0x3ff81f300e8 in operator new(0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#8 0x3ff81f2f5dc in operator new[](0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#9 0x12000a9bc in deadlockDetector(0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in searchGENOME
+#10 0x3ff805c67e0 in __thdBase(0x2000141f600, 0x0, 0x4, 0x0, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+
+
(ladebug) thread 3
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 3 <anonymous> blocked mut 15 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805ba8ac in __hstTransferRegisters(0x20001e2f600, 0x0, 0x0, 0x100000000, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#1 0x3ff805acf74 in __osTransferContext(0x20001e2f600, 0x0, 0x0, 0x100000000, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#2 0x3ff805a004c in __dspDispatch(0x20001e2f600, 0x0, 0x0, 0x100000000, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#3 0x3ff805a94e4 in UnknownProcedure146FromFile0(0x20001e2f600, 0x0, 0x0, 0x100000000, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#4 0x3ff805a9bf8 in UnknownProcedure148FromFile0(0x20001e2f600, 0x0, 0x0, 0x100000000, 0x0, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#5 0x3ff801bfee0 in UnknownProcedure2FromFile22(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libc.so
+#6 0x3ff800cdae4 in malloc(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libc.so
+#7 0x3ff805be20c in UnknownProcedure0FromFile99(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#8 0x3ff805be508 in UnknownProcedure1FromFile99(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#9 0x3ff805be5d0 in UnknownProcedure3FromFile99(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#10 0x3ff807f369c in UnknownProcedure15FromFile0(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libexc.so
+#11 0x3ff807f3a08 in exc_raise_signal_exception(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libexc.so
+#12 0x3ff805b5a9c in UnknownProcedure283FromFile0(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#13 0x3ff800d0bbc in __sigtramp(0x20001e2f600, 0x0, 0x0, 0x0, 0x3ff80119094, 0x3ff805ac400) in /usr/shlib/libc.so
+#14 0x3ff800e2158 in __kill(0x27aaf6, 0x6, 0x1000000, 0x0, 0x0, 0x1) in /usr/shlib/libc.so
+#15 0x12000aad0 in deadlockChecker(0x27aaf6, 0x6, 0x1000000, 0x0, 0x0, 0x1) in searchGENOME
+#16 0x3ff805c67e0 in __thdBase(0x27aaf6, 0x6, 0x1000000, 0x0, 0x0, 0x1) in /usr/shlib/libpthread.so
+
+
(ladebug) thread 4
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+>* 4 <anonymous> blocked kern usleep SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff800e5e68 in __usleep_thread(0x20002335a30, 0x20002335a28, 0x0, 0x0, 0x12000a690, 0x12000a6c0) in /usr/shlib/libc.so
+#1 0x3ff80ba527c in nanosleep(0x20002335a30, 0x20002335a28, 0x0, 0x0, 0x12000a690, 0x12000a6c0) in /usr/shlib/librt.so
+#2 0x12000a6ec in loaderThread(0x20002335a30, 0x20002335a28, 0x0, 0x0, 0x12000a690, 0x12000a6c0) in searchGENOME
+#3 0x3ff805c67e0 in __thdBase(0x20002335a30, 0x20002335a28, 0x0, 0x0, 0x12000a690, 0x12000a6c0) in /usr/shlib/libpthread.so
+
+
(ladebug) thread 5
+ Thread Name State Substate Policy Pri
+ ------ ------------------------- --------------- ----------- ------------ ---
+> 5 <anonymous> blocked mut 15 SCHED_OTHER 19
+
+(ladebug) where
+>0 0x3ff805ba8ac in __hstTransferRegisters(0x20000a0f600, 0x0, 0x0, 0x100000000, 0x20000a0fc40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#1 0x3ff805acf74 in __osTransferContext(0x20000a0f600, 0x0, 0x0, 0x100000000, 0x20000a0fc40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#2 0x3ff805a004c in __dspDispatch(0x20000a0f600, 0x0, 0x0, 0x100000000, 0x20000a0fc40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#3 0x3ff805a94e4 in UnknownProcedure146FromFile0(0x20000a0f600, 0x0, 0x0, 0x100000000, 0x20000a0fc40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#4 0x3ff805a9bf8 in UnknownProcedure148FromFile0(0x20000a0f600, 0x0, 0x0, 0x100000000, 0x20000a0fc40, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#5 0x3ff801bfee0 in UnknownProcedure2FromFile22(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libc.so
+#6 0x3ff800cdae4 in malloc(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libc.so
+#7 0x3ff81f32050 in UnknownProcedure3FromFile46(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#8 0x3ff81f34190 in __cxx_v60_dispatch__X4need3new8libcxxso(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/lib/cmplrs/cxx/libcxx.so
+#9 0x3ff807f29d4 in UnknownProcedure11FromFile0(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libexc.so
+#10 0x3ff807f2cd8 in exc_dispatch_exception(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libexc.so
+#11 0x3ff807f39e0 in exc_raise_signal_exception(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libexc.so
+#12 0x3ff805b5a9c in UnknownProcedure283FromFile0(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libpthread.so
+#13 0x3ff800d0bbc in __sigtramp(0x20000a0f600, 0x0, 0x2000283f600, 0x2000283d288, 0x3ff81f34150, 0x3ff805ac400) in /usr/shlib/libc.so
+#14 0x3ff801beb00 in UnknownProcedure12FromFile22(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in /usr/shlib/libc.so
+#15 0x3ff800cf2c0 in free(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in /usr/shlib/libc.so
+#16 0x3ff81f15a7c in operator delete(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in /usr/lib/cmplrs/cxx/libcxx.so
+#17 0x120009680 in ~encodedQuery(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in searchGENOME
+#18 0x120009f9c in doSearch(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in searchGENOME
+#19 0x12000a39c in searchThread(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in searchGENOME
+#20 0x3ff805c67e0 in __thdBase(0x3ffc0087290, 0x0, 0x100000, 0x0, 0x3ff801bead0, 0x3ff801beba4) in /usr/shlib/libpthread.so
+
+#endif // DONT_EVER_ENABLE_THIS
diff --git a/seagen/thr-loader.C b/seagen/thr-loader.C
new file mode 100644
index 0000000..cc8f50a
--- /dev/null
+++ b/seagen/thr-loader.C
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+
+#include "searchGENOME.H"
+
+void*
+loaderThread(void *) {
+  encodedQuery *Q = 0L;
+  seqInCore *B = 0L;
+  // Loads the next query from config._qsFASTA; running out of memory while loading is fatal.
+  try {
+    B = config._qsFASTA->getSequenceInCore();
+  } catch (const std::bad_alloc &) {
+    fprintf(stderr, "loaderThread()-- Failed to load next query sequence\ncaught bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+    exit(1);
+  }
+  // Returns a new encodedQuery for the sequence, or 0 if none was returned (presumably end of input -- confirm).
+  if (B) {
+    Q = new encodedQuery(B, config._merSize);
+    delete B;
+  }
+
+  return(Q);
+}
diff --git a/seagen/thr-output.C b/seagen/thr-output.C
new file mode 100644
index 0000000..6f7088d
--- /dev/null
+++ b/seagen/thr-output.C
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+
+#include "searchGENOME.H"
+
+#if 0  // Disabled status reporter; nothing down to the matching #endif is compiled.
+void
+statusThread(void *) {
+ double finish = 0.0;  // estimated seconds remaining; stays 0.0 until config._outputPos is positive
+ // NOTE(review): this estimate reads config._ members while the code below uses bare names (outputPos, zeroTime, ...) -- reconcile before re-enabling.
+ if (config._outputPos > 0)
+ finish = (config._numberOfQueries - config._outputPos) / (config._outputPos / (getTime() - config._zeroTime));
+ // One-line progress report on stderr, terminated by a carriage return rather than a newline.
+ fprintf(stderr, "O:"uint32FMTW(7)" S:"uint32FMTW(7)" I:"uint32FMTW(7)" T:"uint32FMTW(7)" (%5.1f%%; %8.3f/sec) Finish in %5.2f seconds.\r",
+ outputPos,
+ inputTail,
+ inputHead,
+ numberOfQueries,
+ 100.0 * outputPos / numberOfQueries,
+ outputPos / (getTime() - zeroTime),
+ finish);
+ fflush(stderr);
+ // Choose outputMask from the current output rate; the small constant keeps the divisor non-zero.
+ double perSec = outputPos / (getTime() - zeroTime + 0.0000001);
+
+ if (perSec < 32.0)
+ outputMask = 0xf;
+ else if (perSec < 256.0)
+ outputMask = 0x7f;
+ else if (perSec < 1024.0)
+ outputMask = 0x1ff;
+ else
+ outputMask = 0x3ff;
+}
+#endif
+
+
+
+// Write all 'len' bytes at 'buf' to descriptor 'fd'.  write() may transfer
+// fewer bytes than requested or be interrupted by a signal, so keep going
+// until everything has been written.  Returns false, with errno set, when
+// the data could not be written.
+//
+static
+bool
+writeFully(int fd, const void *buf, size_t len) {
+  const char *p = (const char *)buf;
+
+  while (len > 0) {
+    ssize_t  n = write(fd, p, len);
+
+    if ((n < 0) && (errno == EINTR))
+      continue;
+
+    if (n <= 0) {
+      if (n == 0)
+        errno = EIO;  //  no progress and no error code; fail instead of spinning
+      return(false);
+    }
+
+    p   += n;
+    len -= (size_t)n;
+  }
+
+  return(true);
+}
+
+
+// Writer callback: emit the results held by one finished query, then
+// delete the query.
+//
+//   U - unused
+//   Q - the encodedQuery to write; this function deletes it
+//
+// The hits go to config._outputFile.  When config._matchCountsFile is set,
+// the number of results is also written there as one decimal line.  Any
+// write failure is fatal.  Always returns 0L.
+//
+void*
+writerThread(void *U, void *Q) {
+  encodedQuery *query = (encodedQuery *)Q;
+
+  //  Write the hits.  The result of write() is checked directly; testing
+  //  errno afterwards misses short writes, which do not set errno.
+  //
+  if (query->theOutputLength() > 0) {
+    if (writeFully(config._outputFile, query->theOutput(), query->theOutputLength()) == false) {
+      fprintf(stderr, "Couldn't write to the output file '%s'.\n%s\n",
+              config._outputFileName, strerror(errno));
+      exit(1);
+    }
+  }
+
+  //  Write the query match counts, too!
+  //
+  if (config._matchCountsFile) {
+    char str[256];
+
+    sprintf(str, uint32FMT "\n", query->numberOfResults());
+
+    if (writeFully(config._matchCountsFile, str, strlen(str)) == false) {
+      fprintf(stderr, "Couldn't write to the match counts file '%s'.\n%s\n",
+              config._queryMatchFileName, strerror(errno));
+      exit(1);
+    }
+  }
+
+  delete query;
+
+  return(0L);
+}
diff --git a/seagen/thr-search.C b/seagen/thr-search.C
new file mode 100644
index 0000000..3ac29c6
--- /dev/null
+++ b/seagen/thr-search.C
@@ -0,0 +1,76 @@
+#include "searchGENOME.H"
+#include "encodedQuery.H"
+
+// If you really, really, really want to know the exact number
+// of bases left in the query, use the interval list. Otherwise,
+// it's faster to guess.
+//
+//#define USEEXACTSIZE
+
+// Search one strand of a query against config._positions, then filter the
+// collected hits.
+//
+//   state     - supplies the position buffer (posn, posnMax, posnLen) and
+//               accumulates searchTime and filterTime
+//   query     - the encoded query being searched
+//   isReverse - handed to getSkip(), getMer() and filter()
+//
+void
+doSearch(searcherState *state,
+         encodedQuery  *query,
+         bool           isReverse) {
+
+  //  Phase one: gather hits.  The clock starts before the matrix is built.
+  //
+  double      began = getTime();
+  uint64      count = 0;    //  handed to getExact(); not otherwise used here
+  hitMatrix  *hits  = new hitMatrix(query->bpTotal(), query->bpCovered(false), query->IID());
+
+  for (uint32 qi=0; qi<query->numberOfMers(); qi++) {
+    if (query->getSkip(qi, isReverse))
+      continue;
+
+    if (!config._positions->getExact(query->getMer(qi, isReverse),
+                                     state->posn,
+                                     state->posnMax,
+                                     state->posnLen,
+                                     count))
+      continue;
+
+    hits->addHits(qi, state->posn, state->posnLen);
+  }
+
+  state->searchTime += getTime() - began;
+
+  //  Phase two: filter, which stores the results into the query's output.
+  //  The matrix is released inside the timed region.
+  //
+  began = getTime();
+
+  hits->filter(query, isReverse);
+  delete hits;
+
+  state->filterTime += getTime() - began;
+}
+
+
+
+// Worker callback for one query: mark the mers to skip using the optional
+// mask and only databases, then run whichever of the forward and reverse
+// searches the configuration enables.
+//
+//   U - unused
+//   T - the searcherState of this worker
+//   Q - the encodedQuery to process
+//
+void
+searchThread(void *U, void *T, void *Q) {
+  searcherState  *state = (searcherState *)T;
+  encodedQuery   *query = (encodedQuery  *)Q;
+
+  double  maskBegan = getTime();
+
+  //  Skip every mer that IS present in the mask database.
+  //
+  if (config._maskDB) {
+    for (uint32 qi=0; qi<query->numberOfMers(); qi++) {
+      if (query->getSkip(qi, false))
+        continue;
+      if (config._maskDB->exists(query->getMer(qi, false)))
+        query->setSkip(qi, false);
+    }
+  }
+
+  //  Skip every mer that is NOT present in the only database.
+  //
+  if (config._onlyDB) {
+    for (uint32 qi=0; qi<query->numberOfMers(); qi++) {
+      if (query->getSkip(qi, false))
+        continue;
+      if (!config._onlyDB->exists(query->getMer(qi, false)))
+        query->setSkip(qi, false);
+    }
+  }
+
+  state->maskTime += getTime() - maskBegan;
+
+  //  Forward first, then reverse.
+  //
+  if (config._doForward)
+    doSearch(state, query, false);
+
+  if (config._doReverse)
+    doSearch(state, query, true);
+}
diff --git a/seatac/Make.include b/seatac/Make.include
new file mode 100644
index 0000000..757a0da
--- /dev/null
+++ b/seatac/Make.include
@@ -0,0 +1,41 @@
+# -*- makefile -*-
+# Sibling library directories; used below for include paths and for the seatac link line.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+# Sources and headers that make up seatac.
+src := $/seatac.C \
+ $/configuration.C \
+ $/encodedQuery.C \
+ $/hitMatrix.C \
+ $/thr-search.C \
+ $/thr-loader.C \
+ $/thr-deadlock.C \
+ $/hitMatrix-sort.C \
+ $/hitMatrix.H \
+ $/posix.H \
+ $/seatac.H \
+ $/filterObj.H \
+ $/statObj.H
+# Only the .C entries of src are listed as C++ sources; two executables are declared.
+$/.CXX_SRCS := $(filter %.C,${src})
+$/.CXX_EXES := $/seatac $/heavychains
+# The two filters are declared as shared objects.
+$/.CXX_SHLIBS := $/filter-nop.so $/filter-heavychains.so
+# Prerequisites of the filter object files.
+$/filter-nop.o: $/filterObj.H $/statObj.H $/filter-nop.C
+$/filter-heavychains.o: $/filterObj.H $/statObj.H $/filter-heavychains.C $/heavychains.C $/heavychains.H
+# Object files that go into each shared object.
+$/filter-nop.so: $/filter-nop.o
+$/filter-heavychains.so: $/filter-heavychains.o $/heavychains.o
+# NOTE(review): only *.o is listed for cleaning here; confirm the .so files are removed elsewhere.
+$/.CLEAN :=$/*.o
+# Every dependency file and object in this directory gets the library include paths.
+$/%.d $/%.o: CXXFLAGS+=-I${LIBKMER/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/}
+# seatac: the objects of all .C sources plus the five static libraries.
+$/seatac: ${$/.CXX_SRCS:.C=.o} \
+ ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+# heavychains: a standalone driver built from two objects.
+$/heavychains: $/heavychains-driver.o $/heavychains.o
diff --git a/seatac/configuration.C b/seatac/configuration.C
new file mode 100644
index 0000000..aa45de1
--- /dev/null
+++ b/seatac/configuration.C
@@ -0,0 +1,285 @@
+#include "seatac.H"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "sharedObj.H"
+
+// Set every option to its built-in default.  read() later overrides
+// whatever the command line specifies.
+//
+configuration::configuration(void) {
+
+  //  Reporting
+  _beVerbose           = false;
+
+  //  Mers and threads
+  _merSize             = 20;
+  _merSkip             = 0;
+  _numSearchThreads    = 4;
+
+  //  Both strands are searched unless -forward or -reverse is given
+  _doForward           = true;
+  _doReverse           = true;
+
+  //  Hit extension parameters
+  _maxDiagonal         = 25;
+  _maxGap              = 0;
+  _qsOverlap           = 15;
+  _dsOverlap           = 15;
+  _minLength           = 20;
+
+  //  File names; none are known yet
+  _dbFileName          = 0L;
+  _qsFileName          = 0L;
+  _maskFileName        = 0L;
+  _onlyFileName        = 0L;
+  _outputFileName      = 0L;
+  _statsFileName       = 0L;
+
+  //  Precomputed tables
+  _tableFileName       = 0L;
+  _tableBuildOnly      = false;
+
+  //  Filter plugin
+  _filtername          = 0L;
+  _filteropts          = 0L;
+  _filterObj           = 0L;
+
+  //  Timers
+  _startTime           = 0.0;
+  _initTime            = 0.0;
+  _buildTime           = 0.0;
+  _searchTime          = 0.0;
+  _totalTime           = 0.0;
+
+  //  Loader: queue size 2, sleeps one second
+  _loaderHighWaterMark = 2;
+  _loaderSleep.tv_sec  = 1;
+  _loaderSleep.tv_nsec = 0;
+  _loaderWarnings      = false;
+
+  //  Searcher: sleeps 10,000,000 ns
+  _searchSleep.tv_sec  = 0;
+  _searchSleep.tv_nsec = 10000000;
+
+  //  Writer: queue size 256, sleeps one second
+  _writerHighWaterMark = 256;
+  _writerSleep.tv_sec  = 1;
+  _writerSleep.tv_nsec = 0;
+  _writerWarnings      = false;
+}
+
+configuration::~configuration() {  // Empty: nothing is released. NOTE(review): the sharedObj that read() allocates for -filtername is not deleted here.
+}
+
+// Help text printed by configuration::usage(); the single %s receives the
+// program name.  Option names here must match the ones tested in
+// configuration::read().
+//
+static char const *usageString =
+"usage: %s [options]\n"
+"\n"
+"Algorithm Options:\n"
+" -mersize k Use k-mers\n"
+" -merskip j Skip j mers between each mer inserted into table\n"
+" -forward Search only the normal query sequences\n"
+" -reverse Search only the reverse-complemented query sequences\n"
+" -maxdiagonal d\n"
+" -maxgap g\n"
+" -qoverlap q\n"
+" -doverlap d\n"
+" -minlength l\n"
+"\n"
+"Process Options\n"
+" -numthreads n Use n search threads\n"
+" -loaderhighwatermark h Size of the loader queue\n"
+" -loadersleep t Time the loader will sleep when its output queue is full\n"
+" -loaderwarnings Enable warning messages for the loader\n"
+" -searchsleep t Time the searcher will sleep when it has no input\n"
+" -writerhighwatermark h Size of the output queue\n"
+" -writersleep t Time the writer will sleep when it has nothing to write\n"
+" -writerwarnings Enable warning messages for the writer\n"
+"\n"
+" -usetables datfile If 'datfile' exists AND is a complete and valid file,\n"
+" load the tables from the file and do the compute.\n"
+" Otherwise, fail.\n"
+"\n"
+" -buildtables datfile If 'datfile' doesn't exist, build the tables, write\n"
+" them to 'datfile' and exit. Otherwise, quit.\n"
+"\n"
+"Filtering Options\n"
+" -filtername x.so Use the shared object x.so as a filter method.\n"
+" -filteropts opts The string 'opts' is passed to the filter on creation.\n"
+"\n"
+"Input Options:\n"
+" -mask f Ignore all mers listed in file f\n"
+" -only f Use only the mers listed in file f\n"
+" -stream s.fasta Query sequences (the stream)\n"
+" -table t.fasta Database sequences (the table)\n"
+" -use #,#,#,# (not supported anymore)\n"
+" -use file (not supported anymore)\n"
+"\n"
+"Output Options\n"
+" -verbose Entertain the user\n"
+" -output f Write output to file f\n"
+" -stats f Write resource statistics to f\n";
+
+
+
+// Print the option summary on stderr.  'name' is substituted for the %s
+// in usageString.
+//
+void
+configuration::usage(char *name) {
+  fprintf(stderr, usageString, name);
+}
+
+
+
+// Return the value that follows option argv[arg], advancing arg onto it.
+// When the option is the last word on the command line there is no value;
+// report that and exit instead of handing argv[argc] (a null pointer) to
+// atoi() or atof().
+//
+static
+char *
+optionValue(int &arg, int argc, char **argv) {
+  if (arg + 1 >= argc) {
+    fprintf(stderr, "ERROR: Option '%s' requires a value.\n", argv[arg]);
+    exit(1);
+  }
+
+  return(argv[++arg]);
+}
+
+
+// Parse the command line into this configuration, then check it.
+//
+//   argc, argv - as received by main(); argv[0] is the program name
+//
+// Every unknown option is reported before exiting, so the user sees all
+// of them at once.  The process exits with status 1 on an unknown option,
+// a missing option value, too many threads, -mask together with -only, or
+// a merskip that is not smaller than mersize.  When a filter is named, a
+// filterObj and a statObj are built and destroyed once as a test.
+//
+void
+configuration::read(int argc, char **argv) {
+  int  unknownOptions = 0;
+
+  for (int arg=1; arg < argc; arg++) {
+    char *opt = argv[arg];
+
+    if        (strcmp(opt, "-mersize") == 0) {
+      _merSize = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-merskip") == 0) {
+      _merSkip = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-numthreads") == 0) {
+      _numSearchThreads = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-mask") == 0) {
+      _maskFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-only") == 0) {
+      _onlyFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-usetables") == 0) {
+      _tableFileName  = optionValue(arg, argc, argv);
+      _tableBuildOnly = false;
+    } else if (strcmp(opt, "-buildtables") == 0) {
+      _tableFileName  = optionValue(arg, argc, argv);
+      _tableBuildOnly = true;
+    } else if (strcmp(opt, "-stream") == 0) {
+      _qsFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-table") == 0) {
+      _dbFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-use") == 0) {
+      fprintf(stderr, "%s: -use not supported anymore.\n", argv[0]);
+      exit(1);
+    } else if (strcmp(opt, "-forward") == 0) {
+      _doForward = true;
+      _doReverse = false;
+    } else if (strcmp(opt, "-reverse") == 0) {
+      _doReverse = true;
+      _doForward = false;
+    } else if (strcmp(opt, "-verbose") == 0) {
+      _beVerbose = true;
+    } else if (strcmp(opt, "-output") == 0) {
+      _outputFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-stats") == 0) {
+      _statsFileName = optionValue(arg, argc, argv);
+    } else if (strcmp(opt, "-maxdiagonal") == 0) {
+      _maxDiagonal = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-maxgap") == 0) {
+      _maxGap = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-qoverlap") == 0) {
+      _qsOverlap = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-doverlap") == 0) {
+      _dsOverlap = atoi(optionValue(arg, argc, argv));
+    } else if (strcmp(opt, "-minlength") == 0) {
+      _minLength = atoi(optionValue(arg, argc, argv));
+
+    //  The process options are matched on their first eight characters
+    //  only, so any word with the same prefix selects them.
+    //
+    } else if (strncmp(opt, "-loaderhighwatermark", 8) == 0) {
+      _loaderHighWaterMark = atoi(optionValue(arg, argc, argv));
+    } else if (strncmp(opt, "-loadersleep", 8) == 0) {
+      setTime(&_loaderSleep, atof(optionValue(arg, argc, argv)));
+    } else if (strncmp(opt, "-loaderwarnings", 8) == 0) {
+      _loaderWarnings = true;
+    } else if (strncmp(opt, "-searchsleep", 8) == 0) {
+      setTime(&_searchSleep, atof(optionValue(arg, argc, argv)));
+    } else if (strncmp(opt, "-writerhighwatermark", 8) == 0) {
+      _writerHighWaterMark = atoi(optionValue(arg, argc, argv));
+    } else if (strncmp(opt, "-writersleep", 8) == 0) {
+      setTime(&_writerSleep, atof(optionValue(arg, argc, argv)));
+    } else if (strncmp(opt, "-writerwarnings", 8) == 0) {
+      _writerWarnings = true;
+
+    } else if (strcmp(opt, "-filtername") == 0) {
+      char *soName = optionValue(arg, argc, argv);
+
+      _filtername = soName;
+      _filterObj  = new sharedObj(soName);
+    } else if (strcmp(opt, "-filteropts") == 0) {
+      _filteropts = optionValue(arg, argc, argv);
+    } else {
+      fprintf(stderr, "ERROR: Unknown option '%s'\n", opt);
+      unknownOptions++;
+    }
+  }
+
+  if (unknownOptions)
+    exit(1);
+
+  //
+  //  Make sure some constraints are met
+  //
+
+  if (_numSearchThreads > MAX_THREADS) {
+    fprintf(stderr, "ERROR: Threads are limited to %d.\n", MAX_THREADS);
+    exit(1);
+  }
+
+  if (_maskFileName && _onlyFileName) {
+    fprintf(stderr, "ERROR: At most one of -mask and -only may be used.\n");
+    exit(1);
+  }
+
+  //
+  //  Check that the mers are at least adjacent.  A merskip equal to
+  //  mersize is rejected too, and the message says so.
+  //
+  if (_merSkip >= _merSize) {
+    fprintf(stderr, "ERROR: Mers are not adjacent; make sure merskip < mersize.\n");
+    exit(1);
+  }
+
+  //
+  //  Test that we can build filter and stat objects
+  //
+  if (_filtername) {
+    filterObj *testf = new filterObj(_filterObj, _filteropts);
+    delete testf;
+
+    statObj   *tests = new statObj(_filterObj, _filteropts);
+    delete tests;
+  }
+}
+
+
+// Write the ATAC format line followed by one "/seatacName=value" line per
+// configuration setting to 'out'.
+//
+// The reported MinLength is _minLength + _merSize.  Unset strings are
+// written as "None Specified.".
+//
+// The format macros are separated from the neighbouring string literals
+// by a space: written without one, C++11 reads "..."uint32FMT as a
+// literal with a user-defined suffix instead of expanding the macro.
+//
+void
+configuration::writeATACheader(FILE *out) {
+  fprintf(out, "! format atac 1.0\n");
+  fprintf(out, "/seatacBeVerbose=%s\n", _beVerbose ? "enabled" : "disabled");
+  fprintf(out, "/seatacNumSearchThreads=" uint32FMT "\n", _numSearchThreads);
+  fprintf(out, "/seatacLoaderHighWaterMark=" uint32FMT "\n", _loaderHighWaterMark);
+  fprintf(out, "/seatacLoaderSleep=%f\n", (double)_loaderSleep.tv_sec + (double)_loaderSleep.tv_nsec * 1e-9);
+  fprintf(out, "/seatacLoaderWarnings=%s\n", _loaderWarnings ? "true" : "false");
+  fprintf(out, "/seatacSearchSleep=%f\n", (double)_searchSleep.tv_sec + (double)_searchSleep.tv_nsec * 1e-9);
+  fprintf(out, "/seatacWriterHighWaterMark=" uint32FMT "\n", _writerHighWaterMark);
+  fprintf(out, "/seatacWriterSleep=%f\n", (double)_writerSleep.tv_sec + (double)_writerSleep.tv_nsec * 1e-9);
+  fprintf(out, "/seatacWriterWarnings=%s\n", _writerWarnings ? "true" : "false");
+  fprintf(out, "/seatacMaxDiagonal=" uint32FMT "\n", _maxDiagonal);
+  fprintf(out, "/seatacMaxGap=" uint32FMT "\n", _maxGap);
+  fprintf(out, "/seatacQsOverlap=" uint32FMT "\n", _qsOverlap);
+  fprintf(out, "/seatacDsOverlap=" uint32FMT "\n", _dsOverlap);
+  fprintf(out, "/seatacMinLength=" uint32FMT "\n", _minLength + _merSize);
+  fprintf(out, "/seatacMerSize=" uint32FMT "\n", _merSize);
+  fprintf(out, "/seatacMerSkip=" uint32FMT "\n", _merSkip);
+  fprintf(out, "/seatacDoReverse=%s\n", (_doReverse) ? "true" : "false");
+  fprintf(out, "/seatacDoForward=%s\n", (_doForward) ? "true" : "false");
+  fprintf(out, "/seatacFilterName=%s\n", (_filtername) ? _filtername : "None Specified.");
+  fprintf(out, "/seatacFilterOpts=%s\n", (_filteropts) ? _filteropts : "None Specified.");
+  fprintf(out, "/seatacDbFile=%s\n", (_dbFileName) ? _dbFileName : "None Specified.");
+  fprintf(out, "/seatacQsFile=%s\n", (_qsFileName) ? _qsFileName : "None Specified.");
+  fprintf(out, "/seatacMaskFile=%s\n", (_maskFileName) ? _maskFileName : "None Specified.");
+  fprintf(out, "/seatacOnlyFile=%s\n", (_onlyFileName) ? _onlyFileName : "None Specified.");
+  fprintf(out, "/seatacOutputFile=%s\n", (_outputFileName) ? _outputFileName : "None Specified.");
+  fprintf(out, "/seatacStatsFile=%s\n", (_statsFileName) ? _statsFileName : "None Specified.");
+  fprintf(out, "/seatacTableFile=%s\n", (_tableFileName) ? _tableFileName : "None Specified.");
+}
diff --git a/seatac/encodedQuery.C b/seatac/encodedQuery.C
new file mode 100644
index 0000000..9b095f1
--- /dev/null
+++ b/seatac/encodedQuery.C
@@ -0,0 +1,83 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "seatac.H"
+#include "bio++.H"
+
+// Prepare to stream k-mers out of a sequence.
+//
+//   seq    - the bases; only the pointer is stored, nothing is copied
+//   seqLen - number of bases in seq
+//   k      - mer size
+//   rc     - true to have getMer() walk the sequence from its end and
+//            complement each base
+//
+encodedQuery::encodedQuery(char const *seq, uint32 seqLen, uint32 k, bool rc) {
+
+  //  What we were given.
+  _seq            = seq;
+  _seqLen         = seqLen;
+  _rc             = rc;
+  _merSize        = k;
+
+  //  Iteration state: start at the first base with an empty mer, which
+  //  needs _merSize good bases before it is valid.
+  _seqPos         = 0;
+  _substring      = uint64ZERO;
+  _timeUntilValid = _merSize;
+
+  //  Two bits per base.
+  _mermask        = uint64MASK(2 * _merSize);
+}
+
+
+// Advance through the sequence until the next valid mer is complete.
+//
+//   mer - receives the encoded mer; zeroed on entry
+//   pos - receives _seqPos - _merSize for that mer; zeroed on entry
+//
+// Returns true when a mer was produced and false once the sequence is
+// exhausted.  In reverse mode (_rc) bases are taken from the end of the
+// sequence and complemented; the consumed-base counter _seqPos still runs
+// upward from zero.
+//
+bool
+encodedQuery::getMer(uint64 &mer, uint32 &pos) {
+  mer = uint64ZERO;
+  pos = uint32ZERO;
+
+  bool  haveMer = false;
+
+  while ((haveMer == false) && (_seqPos < _seqLen)) {
+    char  base = (_rc) ? _seq[_seqLen - 1 - _seqPos] : _seq[_seqPos];
+
+    //  Make room for the next base; this happens for bad letters too.
+    _substring = (_substring << 2) & _mermask;
+
+    if (letterToBits[base] == 0xff) {
+      //  Not a base we can encode: a full mer of good bases is needed again.
+      _timeUntilValid = _merSize;
+    } else {
+      _substring |= (_rc) ? letterToBits[ complementSymbol[ base ]] : letterToBits[base];
+      _timeUntilValid--;
+    }
+
+    _seqPos++;
+
+    //  Outputs are refreshed once at least _merSize bases were consumed,
+    //  but the mer only counts when no bad letter lies within it.
+    if (_seqPos >= _merSize) {
+      mer     = _substring;
+      pos     = _seqPos - _merSize;
+      haveMer = _timeUntilValid <= 0;
+    }
+  }
+
+  return(haveMer);
+}
+
+
+
+
+encodedQuery::~encodedQuery() {  // Empty: the constructor only stores the seq pointer, and it is not freed here.
+}
diff --git a/seatac/filter-heavychains.C b/seatac/filter-heavychains.C
new file mode 100644
index 0000000..a39c7d5
--- /dev/null
+++ b/seatac/filter-heavychains.C
@@ -0,0 +1,315 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Copyright (c) 2005 The J. Craig Venter Institute
+// Author: Clark Mobarry
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util++.H"
+#include "heavychains.H"
+
+
+// Entry points exported by this shared object.  C linkage keeps the names
+// unmangled so they can be looked up as plain strings.
+//
+extern "C" {
+
+  //  Filter interface: 'handle' is whatever construct() returned.
+  void   *construct(char *options);
+  void    destruct(void *handle);
+  void    addHit(void *handle, char orientation,
+                 uint32 id1, uint32 pos1, uint32 len1,
+                 uint32 id2, uint32 pos2, uint32 len2,
+                 uint32 filled);
+  void    filter(void *handle);
+  uint64  output(void *handle, FILE *file, uint64 matchid);
+
+  //  Statistics interface: 'handle' is whatever constructStats() returned.
+  void   *constructStats(char *options);
+  void    destructStats(void *handle);
+  void    addStats(void *handle, void *sp);
+  void    showStats(void *handle, FILE *file);
+}
+
+
+
+// HeavyChains is implemented in the StrandPair class. It takes all
+// hits for a single pair of sequences and....does something. Seatac
+// gives the filterObj interface (aka, the interface in this file)
+// all hits for a single sequence to the whole genome (or part of).
+// So, the StrandPairManager acts as the, uhhh, manager for a bunch
+// of StrandPairs, ensuring that each StrandPair is in fact a pair.
+//
+// It is interface compatible with a StrandPair.
+//
+// Owns a singly linked list of StrandPair objects, one per id1 and kept
+// in increasing id1 order, and routes each incoming hit to the pair that
+// matches its id1.  The links live inside StrandPair (next / addNext).
+//
+// Hits are expected as described in addHit(): all forward hits first,
+// then all reverse hits, with id1 non-decreasing within each pass.
+//
+class StrandPairManager {
+private:
+  int     beVerbose;
+  char    assemblyId1[32];
+  char    assemblyId2[32];
+  int     maxJump;      // Default maximum intra-run jump allowed in a good run.
+  double  minScore;     // Default minimum of bp filled in a good run.
+
+  bool    isForward;    // true until the first 'r' hit arrives
+
+  StrandPair  *P;       // the pair the last hit went to
+  StrandPair  *Proot;   // head of the list
+
+public:
+  //  The assembly ids are copied, at most 31 characters each.
+  //
+  StrandPairManager(bool    verbose,
+                    char   *assemblyid1,
+                    char   *assemblyid2,
+                    int     maxjump,
+                    double  minscore) {
+    beVerbose = verbose;
+
+    //  strncpy() leaves the destination unterminated when the source has
+    //  31 or more characters, so terminate explicitly.
+    //
+    strncpy(assemblyId1, assemblyid1, 31);
+    assemblyId1[31] = 0;
+    strncpy(assemblyId2, assemblyid2, 31);
+    assemblyId2[31] = 0;
+
+    maxJump   = maxjump;
+    minScore  = minscore;
+
+    isForward = true;
+
+    Proot     = 0L;
+    P         = 0L;
+  }
+
+  //  Delete every pair in the list.
+  //
+  ~StrandPairManager(void) {
+    P = Proot;
+    while (Proot) {
+      Proot = Proot->next();
+      delete P;
+      P = Proot;
+    }
+  }
+
+  //  Add one hit to the pair for id1, creating that pair if needed.
+  //  Exits the process if id1 decreases within a pass.
+  //
+  void addHit(char   direction,
+              uint32 id1,
+              uint32 xlo,
+              uint32 xln,
+              uint32 id2,
+              uint32 ylo,
+              uint32 yln,
+              uint32 filled) {
+
+    //  We're given hits for exactly one id2 and all id1, forward hits
+    //  followed by reverse hits.  Which means that id1 makes two
+    //  passes through, both passes are increasing (enforced by the
+    //  seqStream used in seatac).
+    //
+    //  A linked list of strand pairs is kept (the links are built
+    //  into StrandPair for convenience), each strand pair knows its
+    //  pair of ids.
+    //
+
+    //  No root?  Make one and add the hit.
+    //
+    if (Proot == 0L) {
+      P = Proot = new StrandPair(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+      P->addHit(direction, id1, xlo, xln, id2, ylo, yln, filled);
+      return;
+    }
+
+    //  Reset to the start if we just switched from forward to
+    //  reverse.  This is also the only time that the sequence id can
+    //  decrease, and we might have to make a new root.
+    //
+    if (isForward && (direction == 'r')) {
+      isForward = false;
+
+      if (id1 < Proot->sequenceIID1()) {
+        StrandPair *N = new StrandPair(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+        N->addHit(direction, id1, xlo, xln, id2, ylo, yln, filled);
+        N->addNext(Proot);
+        P = Proot = N;
+        return;
+      }
+
+      P = Proot;
+    }
+
+    //  Verify that id1 didn't decrease.  The reported line is that of
+    //  the test itself, hence the "- 2"; keep the three lines together.
+    //
+    if (id1 < P->sequenceIID1()) {
+      fprintf(stderr, "Why did the sequence id just decrease? This should not have happened.\n");
+      fprintf(stderr, "Crash. %s at line %d\n", __FILE__, __LINE__ - 2);
+      exit(1);
+    }
+
+    //  Move to the node just before, or exactly at, the one we want
+    //  to add to.  Remember, id1 never decreases.
+    //
+    while ((P->next()) && (P->next()->sequenceIID1() <= id1))
+      P = P->next();
+
+    //  If we're not at the correct node, insert one after the
+    //  current, and make it the correct one.
+    //
+    if (P->sequenceIID1() != id1) {
+      StrandPair *NP = new StrandPair(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+      NP->addNext(P->next());
+      P->addNext(NP);
+      P = NP;
+    }
+
+    //  And now we can just add the hit.
+    //
+    P->addHit(direction, id1, xlo, xln, id2, ylo, yln, filled);
+  }
+
+  //  Run process() on every pair, in list order.
+  //
+  void process(void) {
+    for (StrandPair *SP=Proot; SP; SP=SP->next())
+      SP->process();
+  }
+
+  //  Print every pair to outF.  Each pair receives the match id returned
+  //  by the previous one; the last returned id is the result.
+  //
+  uint64 print(FILE *outF, uint64 matchid) {
+    for (StrandPair *SP=Proot; SP; SP=SP->next())
+      matchid = SP->print(outF, matchid);
+    return(matchid);
+  }
+
+  //  Add every pair to the supplied statistics object.
+  //
+  void addStats(TheStats *ST) {
+    for (StrandPair *SP=Proot; SP; SP=SP->next())
+      ST->add(SP);
+  }
+};
+
+
+
+
+
+
+
+
+// Filter entry point: parse the option string and return a new
+// StrandPairManager as an opaque handle.
+//
+// Recognised words in 'options':
+//   -v        increase verbosity (may repeat)
+//   -s score  minimum score   (default 100.0)
+//   -j jump   maximum jump    (default 100000)
+//   -1 id     first assembly id   (default "UNK")
+//   -2 id     second assembly id  (default "UNK")
+// Anything else is ignored, as is a valued option given as the last word
+// (there is no value to read).
+//
+void*
+construct(char *options) {
+  int     beVerbose      = 0;
+  char    assemblyIdD[4] = { 'U', 'N', 'K', 0 };
+  char   *assemblyId1    = assemblyIdD;
+  char   *assemblyId2    = assemblyIdD;
+  double  minScore       = 100.0;   // Default minimum of bp filled in a good run.
+  int     maxJump        = 100000;  // Default maximum intra-run jump allowed in a good run.
+
+  //  Parse the options to find the parameters
+  //
+  splitToWords  W(options);
+
+  for (uint32 arg=0; arg < W.numWords(); arg++) {
+    //  Only consume a value when another word actually follows.
+    bool  hasValue = (arg + 1 < W.numWords());
+
+    if        (strcmp(W.getWord(arg), "-v") == 0) {
+      beVerbose++;
+    } else if (hasValue && (strcmp(W.getWord(arg), "-s") == 0)) {
+      minScore = atof(W.getWord(++arg));
+    } else if (hasValue && (strcmp(W.getWord(arg), "-j") == 0)) {
+      maxJump = atoi(W.getWord(++arg));
+    } else if (hasValue && (strcmp(W.getWord(arg), "-1") == 0)) {
+      assemblyId1 = W.getWord(++arg);
+    } else if (hasValue && (strcmp(W.getWord(arg), "-2") == 0)) {
+      assemblyId2 = W.getWord(++arg);
+    }
+  }
+
+  return((void *)(new StrandPairManager(beVerbose, assemblyId1, assemblyId2, maxJump, minScore)));
+}
+
+// Filter entry point: delete the StrandPairManager behind 'handle'.
+//
+void
+destruct(void *handle) {
+  StrandPairManager *manager = (StrandPairManager *)handle;
+
+  delete manager;
+}
+
+// Filter entry point: forward one hit, unchanged, to the
+// StrandPairManager behind 'handle'.
+//
+void
+addHit(void *handle, char orientation,
+       uint32 id1, uint32 pos1, uint32 len1,
+       uint32 id2, uint32 pos2, uint32 len2,
+       uint32 filled) {
+  StrandPairManager *manager = (StrandPairManager *)handle;
+
+  manager->addHit(orientation, id1, pos1, len1, id2, pos2, len2, filled);
+}
+
+// Filter entry point: run process() on the StrandPairManager behind
+// 'handle'.
+//
+void
+filter(void *handle) {
+  StrandPairManager *manager = (StrandPairManager *)handle;
+
+  manager->process();
+}
+
+
+// Filter entry point: print the StrandPairManager behind 'handle' to
+// 'file' and return the match id that print() hands back.
+//
+uint64
+output(void *handle, FILE *file, uint64 matchid) {
+  StrandPairManager *manager = (StrandPairManager *)handle;
+
+  return(manager->print(file, matchid));
+}
+
+
+
+
+
+
+// Statistics entry point: parse the option string and return a new
+// TheStats as an opaque handle.
+//
+// The options are the same as for construct(): -v, -s score, -j jump,
+// -1 id and -2 id, with defaults 100.0, 100000 and "UNK".  Unknown words
+// are ignored, as is a valued option given as the last word (there is no
+// value to read).
+//
+void*
+constructStats(char *options) {
+  int     beVerbose      = 0;
+  char    assemblyIdD[4] = { 'U', 'N', 'K', 0 };
+  char   *assemblyId1    = assemblyIdD;
+  char   *assemblyId2    = assemblyIdD;
+  double  minScore       = 100.0;   // Default minimum of bp filled in a good run.
+  int     maxJump        = 100000;  // Default maximum intra-run jump allowed in a good run.
+
+  //  Parse the options to find the parameters
+  //
+  splitToWords  W(options);
+
+  for (uint32 arg=0; arg < W.numWords(); arg++) {
+    //  Only consume a value when another word actually follows.
+    bool  hasValue = (arg + 1 < W.numWords());
+
+    if        (strcmp(W.getWord(arg), "-v") == 0) {
+      beVerbose++;
+    } else if (hasValue && (strcmp(W.getWord(arg), "-s") == 0)) {
+      minScore = atof(W.getWord(++arg));
+    } else if (hasValue && (strcmp(W.getWord(arg), "-j") == 0)) {
+      maxJump = atoi(W.getWord(++arg));
+    } else if (hasValue && (strcmp(W.getWord(arg), "-1") == 0)) {
+      assemblyId1 = W.getWord(++arg);
+    } else if (hasValue && (strcmp(W.getWord(arg), "-2") == 0)) {
+      assemblyId2 = W.getWord(++arg);
+    }
+  }
+
+  return((void *)(new TheStats(beVerbose, assemblyId1, assemblyId2, maxJump, minScore)));
+}
+
+// Statistics entry point: delete the TheStats behind 'handle'.
+//
+void
+destructStats(void *handle) {
+  TheStats *stats = (TheStats *)handle;
+
+  delete stats;
+}
+
+// Statistics entry point: fold a filter's results into the TheStats
+// behind 'handle'.
+//
+// 'sp' is a StrandPairManager, not a single StrandPair; the manager adds
+// each of its pairs to the statistics object itself.
+//
+void
+addStats(void *handle, void *sp) {
+  TheStats           *stats   = (TheStats *)handle;
+  StrandPairManager  *manager = (StrandPairManager *)sp;
+
+  manager->addStats(stats);
+}
+
+// Statistics entry point: have the TheStats behind 'handle' report to
+// 'file'.
+//
+void
+showStats(void *handle, FILE *file) {
+  TheStats *stats = (TheStats *)handle;
+
+  stats->show(file);
+}
diff --git a/seatac/filter-nop.C b/seatac/filter-nop.C
new file mode 100644
index 0000000..acd6da5
--- /dev/null
+++ b/seatac/filter-nop.C
@@ -0,0 +1,186 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// A very simple seatac filter. It reports the single longest match for each pair.
+//
+// Also shows how to use a C++ object as a filter. C is pretty much the same thing.
+
+#include "bio.h"
+#include "util++.H"
+
+// Entry points exported by this shared object.  C linkage keeps the names
+// unmangled so they can be looked up as plain strings.
+//
+extern "C" {
+
+  //  Filter interface: 'handle' is whatever construct() returned.
+  void   *construct(char *options);
+  void    destruct(void *handle);
+  void    addHit(void *handle, char orientation,
+                 uint32 id1, uint32 pos1, uint32 len1,
+                 uint32 id2, uint32 pos2, uint32 len2,
+                 uint32 filled);
+  void    filter(void *handle);
+  uint64  output(void *handle, FILE *file, uint64 matchid);
+
+  //  Statistics interface: 'handle' is whatever constructStats() returned.
+  void   *constructStats(char *options);
+  void    destructStats(void *handle);
+  void    addStats(void *handle, void *filterhandle);
+  void    showStats(void *handle, FILE *file);
+}
+
+
+
+// Example filter: remembers only the hit with the largest 'filled' value
+// and prints that one record on output.  Every call is traced on stderr.
+//
+class filterLongest {
+public:
+  //  n1 and n2 are the sequence names used in the output record; each is
+  //  copied, at most 31 characters.
+  //
+  filterLongest(char *n1, char *n2) {
+    fprintf(stderr, "Creating a filterLongest\n");
+
+    //  strncpy() leaves the destination unterminated when the source has
+    //  31 or more characters, so terminate explicitly.
+    //
+    strncpy(name1, n1, 31);
+    name1[31] = 0;
+    strncpy(name2, n2, 31);
+    name2[31] = 0;
+
+    //  Start with no best hit and an empty record.  Without this,
+    //  addHit() compares against an indeterminate value and output()
+    //  prints an uninitialised buffer when no hit was ever stored.
+    //
+    maxfilled    = 0;
+    outstring[0] = 0;
+  }
+
+  ~filterLongest() {
+    fprintf(stderr, "Destroyed a filterLongest\n");
+  }
+
+  //  Keep this hit if its 'filled' is strictly larger than the best seen
+  //  so far; the formatted record replaces the previous one.
+  //
+  void addHit(char   orientation,
+              uint32 id1,
+              uint32 pos1,
+              uint32 len1,
+              uint32 id2,
+              uint32 pos2,
+              uint32 len2,
+              uint32 filled) {
+
+    if (maxfilled >= filled)
+      return;
+
+    fprintf(stderr, "filterNOP-- addHit\n");
+
+    maxfilled = filled;
+
+    sprintf(outstring,
+            "M x . . %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %s " uint32FMT "\n",
+            name1, id1, pos1, len1, name2, id2, pos2, len2, (orientation == 'f') ? "1" : "-1", filled);
+  }
+
+  //  Nothing to filter; only traces the call.
+  //
+  void filter(void) {
+    fprintf(stderr, "filterNOP-- filter\n");
+  }
+
+  //  Write the stored record (possibly empty) to 'file'.  The match id is
+  //  returned unchanged.
+  //
+  uint64 output(FILE *file, uint64 matchid) {
+    fprintf(stderr, "filterNOP-- output (ignoring matchid)\n");
+    fprintf(file, "%s", outstring);
+    return(matchid);
+  }
+
+private:
+  char    outstring[512];
+  char    name1[32], name2[32];
+  uint32  maxfilled;
+};
+
+
+
+// Example statistics object: counts how many filters were added to it.
+//
+class statLongest {
+public:
+  statLongest() : num(0) {
+  }
+
+  ~statLongest() {
+  }
+
+  //  Count one more filter.  The filter itself is not examined.
+  //
+  void add(filterLongest *F) {
+    ++num;
+  }
+
+  //  Report the count as a "/statObjNum=" line.
+  //
+  void show(FILE *file) {
+    fprintf(file, "/statObjNum=%d\n", num);
+  }
+
+private:
+  int  num;
+};
+
+
+
+
+
+
+// Filter entry point: parse the option string and return a new
+// filterLongest as an opaque handle.
+//
+// Recognised words in 'opts':
+//   -1 name   name of the first sequence   (default "UNK")
+//   -2 name   name of the second sequence  (default "UNK")
+// Anything else is ignored, as is -1 or -2 given as the last word (there
+// is no name to read).
+//
+void*
+construct(char *opts) {
+  //  The default lives in a writable array: a string literal cannot be
+  //  bound to the char* that filterLongest takes.  filterLongest copies
+  //  the names, so a local array is enough.
+  //
+  char   unknown[4] = { 'U', 'N', 'K', 0 };
+  char  *seq1       = unknown;
+  char  *seq2       = unknown;
+
+  //  Parse the options to find the parameters
+  //
+  splitToWords  W(opts);
+
+  for (uint32 arg=0; arg < W.numWords(); arg++) {
+    //  Only consume a value when another word actually follows.
+    bool  hasValue = (arg + 1 < W.numWords());
+
+    if        (hasValue && (strcmp(W.getWord(arg), "-1") == 0)) {
+      seq1 = W.getWord(++arg);
+    } else if (hasValue && (strcmp(W.getWord(arg), "-2") == 0)) {
+      seq2 = W.getWord(++arg);
+    }
+  }
+
+  return(new filterLongest(seq1, seq2));
+}
+
+// Filter entry point: delete the filterLongest behind 'handle'.
+//
+void
+destruct(void *handle) {
+  filterLongest *longest = (filterLongest *)handle;
+
+  delete longest;
+}
+
+// Filter entry point: forward one hit, unchanged, to the filterLongest
+// behind 'handle'.
+//
+void
+addHit(void *handle, char orientation,
+       uint32 id1, uint32 pos1, uint32 len1,
+       uint32 id2, uint32 pos2, uint32 len2,
+       uint32 filled) {
+  filterLongest *longest = (filterLongest *)handle;
+
+  longest->addHit(orientation, id1, pos1, len1, id2, pos2, len2, filled);
+}
+
+// Filter entry point: call filter() on the filterLongest behind 'handle'.
+//
+void
+filter(void *handle) {
+  filterLongest *longest = (filterLongest *)handle;
+
+  longest->filter();
+}
+
+// Filter entry point: have the filterLongest behind 'handle' write to
+// 'file', returning whatever match id it hands back.
+//
+uint64
+output(void *handle, FILE *file, uint64 matchid) {
+  filterLongest *longest = (filterLongest *)handle;
+
+  return(longest->output(file, matchid));
+}
+
+
+
+
+// Statistics entry point: return a new statLongest as an opaque handle.
+// The option string is not used.
+//
+void*
+constructStats(char *options) {
+  statLongest *stats = new statLongest;
+
+  return(stats);
+}
+
+// Statistics entry point: delete the statLongest behind 'handle'.
+//
+void
+destructStats(void *handle) {
+  statLongest *stats = (statLongest *)handle;
+
+  delete stats;
+}
+
+// Statistics entry point: add the filterLongest behind 'filterhandle' to
+// the statLongest behind 'handle'.
+//
+void
+addStats(void *handle, void *filterhandle) {
+  statLongest    *stats   = (statLongest   *)handle;
+  filterLongest  *longest = (filterLongest *)filterhandle;
+
+  stats->add(longest);
+}
+
+// Statistics entry point: have the statLongest behind 'handle' report to
+// 'file'.
+//
+void
+showStats(void *handle, FILE *file) {
+  statLongest *stats = (statLongest *)handle;
+
+  stats->show(file);
+}
diff --git a/seatac/filterObj.H b/seatac/filterObj.H
new file mode 100644
index 0000000..f573afd
--- /dev/null
+++ b/seatac/filterObj.H
@@ -0,0 +1,228 @@
+#ifndef FILTEROBJ_H
+#define FILTEROBJ_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sharedObj.H"
+#include "util++.H"
+
+//
+// Object that will filter and output hits. If no sharedObj is
+// supplied, the default behavior is output all hits.
+//
+
+// The default filter here inserts lots of '#'s into the output string,
+// then replaces those with the real match id on output. An alternative
+// (and probably better idea) is to build a vector of structs of things
+// to output.
+
+// Collects hits, optionally filters them, and writes them out.
+//
+// With a sharedObj, every call is forwarded to the functions looked up in
+// that object.  Without one, hits are formatted into an internal text
+// buffer and written as they are.
+//
+class filterObj {
+public:
+  filterObj(sharedObj *so, char *soOpts);
+  ~filterObj();
+
+  void    addHit(char   direction,
+                 uint32 id1, uint32 pos1, uint32 len1,
+                 uint32 id2, uint32 pos2, uint32 len2,
+                 uint32 filled);
+  void    filter(void);
+  uint64  output(FILE *, uint64);
+
+private:
+  char       *soOpts;        //  private copy of the option string, or 0L
+  sharedObj  *so;            //  as given to the constructor; may be 0L
+
+  void       *handle;        //  what the shared object's construct returned
+
+  //  Functions looked up in the shared object; 0L when absent.
+  void *    (*soconstruct)(char *);
+  void      (*sodestruct)(void *);
+  void      (*soaddHit)(void *, char, uint32, uint32, uint32, uint32, uint32, uint32, uint32);
+  void      (*sofilter)(void *);
+  uint64    (*sooutput)(void *, FILE *, uint64);
+
+  //  Text buffer used when there is no shared object.
+  uint32      theOutputPos;  //  bytes used
+  uint32      theOutputMax;  //  bytes allocated
+  char       *theOutput;
+
+  //  Sequence names for the default output, from -1 and -2 in the options.
+  char        name1[32];
+  char        name2[32];
+
+  friend class statObj;
+};
+
+
+
+// Build a filter.
+//
+//   so_ - shared object providing construct, destruct, addHit, filter and
+//         output; 0L selects the built-in behaviour of buffering every hit
+//   op_ - option string, copied; may be 0L.  The words "-1 name" and
+//         "-2 name" set the sequence names (default "UNK", at most 31
+//         characters).  The copy is also handed to the shared object's
+//         construct function.
+//
+// Each symbol missing from the shared object is reported on stderr.  A
+// missing construct is fatal, since there is nothing to call.
+//
+inline
+filterObj::filterObj(sharedObj *so_, char *op_) {
+  soOpts       = 0L;
+  so           = so_;
+
+  handle       = 0L;
+
+  soconstruct  = 0L;
+  sodestruct   = 0L;
+  soaddHit     = 0L;
+  sofilter     = 0L;
+  sooutput     = 0L;
+
+  theOutputPos = 0;
+  theOutputMax = 0;
+  theOutput    = 0L;
+
+  strcpy(name1, "UNK");
+  strcpy(name2, "UNK");
+
+  if (op_) {
+    soOpts = new char [strlen(op_) + 1];
+    strcpy(soOpts, op_);
+
+    splitToWords  W(soOpts);
+
+    for (uint32 arg = 0; arg < W.numWords(); arg++) {
+      //  Only consume a name when another word actually follows.
+      bool  hasValue = (arg + 1 < W.numWords());
+
+      //  strncpy() leaves the destination unterminated when the source
+      //  has 31 or more characters, so terminate explicitly.
+      //
+      if        (hasValue && (strcmp(W.getWord(arg), "-1") == 0)) {
+        strncpy(name1, W.getWord(++arg), 31);
+        name1[31] = 0;
+      } else if (hasValue && (strcmp(W.getWord(arg), "-2") == 0)) {
+        strncpy(name2, W.getWord(++arg), 31);
+        name2[31] = 0;
+      }
+    }
+  }
+
+  if (so) {
+    soconstruct = (void* (*)(char *))so->get("construct");
+    sodestruct  = (void  (*)(void *))so->get("destruct");
+    soaddHit    = (void  (*)(void *, char, uint32, uint32, uint32, uint32, uint32, uint32, uint32))so->get("addHit");
+    sofilter    = (void  (*)(void *))so->get("filter");
+    sooutput    = (uint64 (*)(void *, FILE *, uint64))so->get("output");
+
+    if (!soconstruct)  fprintf(stderr, "construct not found!\n");
+    if (!sodestruct)   fprintf(stderr, "destruct not found!\n");
+    if (!soaddHit)     fprintf(stderr, "addHit not found!\n");
+    if (!sofilter)     fprintf(stderr, "filter not found!\n");
+    if (!sooutput)     fprintf(stderr, "output not found!\n");
+
+    //  Calling through a null function pointer would crash; stop here,
+    //  after all the missing symbols have been listed.
+    //
+    if (!soconstruct)
+      exit(1);
+
+    handle = (*soconstruct)(soOpts);
+  }
+
+  if (!so) {
+    theOutputPos = 0;
+    theOutputMax = 1048576;
+    theOutput    = new char [theOutputMax];
+    theOutput[0] = 0;
+  }
+}
+
+// Release the shared object's state, if it supplied a destruct function,
+// then the private option string and the output buffer.  The sharedObj
+// itself is not deleted here.
+//
+inline
+filterObj::~filterObj() {
+  if (sodestruct != 0L) {
+    (*sodestruct)(handle);
+  }
+
+  delete [] soOpts;
+  delete [] theOutput;
+}
+
+
+// Record one hit.
+//
+// With a shared object the hit is forwarded to its addHit.  Otherwise an
+// "M x" record is appended to the text buffer, with a run of '#' where
+// output() later writes the match id.  The buffer doubles when it runs
+// low; failure to allocate is fatal.
+//
+inline
+void
+filterObj::addHit(char   orientation,
+                  uint32 id1,
+                  uint32 pos1,
+                  uint32 len1,
+                  uint32 id2,
+                  uint32 pos2,
+                  uint32 len2,
+                  uint32 filled) {
+
+  if (soaddHit) {
+    (*soaddHit)(handle, orientation, id1, pos1, len1, id2, pos2, len2, filled);
+    return;
+  }
+
+  //  Room to keep free for one record.  With two 31-character names and
+  //  seven 10-digit numbers a record reaches about 166 bytes, so the 128
+  //  bytes reserved previously could be overrun.
+  //
+  const uint32  recordMax = 256;
+
+  if (theOutputPos + recordMax >= theOutputMax) {
+    theOutputMax <<= 1;
+
+    char *o = 0L;
+    try {
+      o = new char [theOutputMax];
+    } catch (const std::bad_alloc &) {
+      fprintf(stderr, "filterObj::addHit()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+      fprintf(stderr, "filterObj::addHit()-- tried to extend output string from " uint32FMT " to " uint32FMT " bytes.\n", theOutputPos, theOutputMax);
+      exit(1);
+    }
+
+    //  The terminating NUL is not copied; the sprintf() below writes a
+    //  new one.
+    memcpy(o, theOutput, theOutputPos);
+    delete [] theOutput;
+    theOutput = o;
+  }
+
+  sprintf(theOutput + theOutputPos,
+          "M x ############ . %s:" uint32FMT " " uint32FMT " " uint32FMT " 1 %s:" uint32FMT " " uint32FMT " " uint32FMT " %s " uint32FMT "\n",
+          name1, id1, pos1, len1, name2, id2, pos2, len2, (orientation == 'f') ? "1" : "-1", filled);
+
+  //  Advance to the new end of the text.
+  while (theOutput[theOutputPos])
+    theOutputPos++;
+}
+
+
+
+// Run the shared object's filter on the collected hits.  Does nothing
+// when no filter function is available.
+//
+inline
+void
+filterObj::filter(void) {
+  if (sofilter == 0L)
+    return;
+
+  (*sofilter)(handle);
+}
+
+// Write the collected hits to F.
+//
+//   F       - destination stream
+//   matchid - the last match id already used
+//
+// Returns the last match id used after this call.
+//
+// With a shared object the work is done by its output function.
+// Otherwise every buffered record gets the next match id written over
+// its '#' placeholder and the whole buffer is written at once.  The
+// buffer is not reset here.
+//
+inline
+uint64
+filterObj::output(FILE *F, uint64 matchid) {
+
+  if (sooutput)
+    return((*sooutput)(handle, F, matchid));
+
+  //  No buffer exists when a shared object was supplied but has no
+  //  output function; there is nothing to write then.
+  //
+  if (theOutput == 0L)
+    return(matchid);
+
+  char matchIDstring[32] = {0};
+
+  //  Insert the match id's for all these matches.  We have to
+  //  do this here (not during searches) because we're threaded.
+  //
+  char *pos = theOutput;
+  while (*pos) {
+
+    //  Construct a string holding the text version of the match id.
+    //
+    matchid++;
+    sprintf(matchIDstring, uint64FMT, matchid);
+
+    //  At the start of an output record.  Skip the row type and
+    //  sub type, 'M x ', which should put us at the start of
+    //  the match id.
+    //
+    pos += 4;
+
+    //  Copy the number into the space, removing any extra #
+    //  marks, warning if we run out of space.
+    //
+    char *matchIDiterator = matchIDstring;
+    while ((*pos == '#') && (*matchIDiterator != 0))
+      *pos++ = *matchIDiterator++;
+
+    while (*pos == '#')
+      *pos++ = ' ';
+
+    if (*matchIDiterator != 0)
+      fprintf(stderr, "WARNING: there isn't enough space in the match to insert the match id " uint64FMT " '%s'!\n",
+              matchid, matchIDstring);
+
+    //  Skip to the next record
+    //
+    while (*pos++ != '\n')
+      ;
+  }
+
+  fwrite(theOutput, sizeof(char), theOutputPos, F);
+
+  return(matchid);
+}
+
+
+#endif // FILTEROBJ_H
diff --git a/seatac/heavychains-driver.C b/seatac/heavychains-driver.C
new file mode 100644
index 0000000..196da1e
--- /dev/null
+++ b/seatac/heavychains-driver.C
@@ -0,0 +1,183 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Copyright (c) 2005 The J. Craig Venter Institute
+// Author: Clark Mobarry
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "heavychains.H"
+
+#define BUFFERSIZE 1024
+
+int main(int argc, char *argv[]) {
+  int     beVerbose   = 0;
+  char   *assemblyId1 = 0L;
+  char   *assemblyId2 = 0L;
+  double  minScore    = 100.0;   // Default minimum of bp filled in a good run.
+  int     maxJump     = 100000;  // Default maximum intra-run jump allowed in a good run.
+  char   *inFileName  = 0L;
+  char   *outFileName = 0L;
+  int     err         = 0;
+
+  // Every flag except -v consumes the next argument as its value; the
+  // loop increment then steps past whatever was consumed.
+  for (int arg=1; arg < argc; arg++) {
+    bool  hasValue = (arg + 1 < argc);
+    if        (strcmp(argv[arg], "-v") == 0) {
+      beVerbose++;
+    } else if (hasValue && (strcmp(argv[arg], "-1") == 0)) {
+      assemblyId1 = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-2") == 0)) {
+      assemblyId2 = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-s") == 0)) {
+      minScore = atof(argv[++arg]);
+    } else if (hasValue && (strcmp(argv[arg], "-j") == 0)) {
+      maxJump = atoi(argv[++arg]);
+    } else if (hasValue && (strcmp(argv[arg], "-i") == 0)) {
+      inFileName = argv[++arg];
+    } else if (hasValue && (strcmp(argv[arg], "-o") == 0)) {
+      outFileName = argv[++arg];
+    } else {
+      fprintf(stderr, "%s : unknown flag '%s', or flag is missing its value\n", argv[0], argv[arg]);
+      err++;
+    }
+  }
+
+  // StrandPair and TheStats copy the ids and fopen() needs the names,
+  // so none of these may be left unset.
+  if ((err > 0) || !assemblyId1 || !assemblyId2 || !inFileName || !outFileName) {
+    fprintf(stderr, "usage: %s [-v] -1 id1 -2 id2 [-s minScore] [-j maxJump] -i matches.in -o matches.out\n", argv[0]);
+    exit(1);
+  }
+
+  FILE *inpF = fopen(inFileName, "r");
+  if (inpF == 0L) {
+    perror(inFileName);
+    exit(1);
+  }
+
+  FILE *outF = fopen(outFileName, "w");
+  if (outF == 0L) {
+    perror(outFileName);
+    exit(1);
+  }
+
+  fprintf(outF,"! format atac 1.0\n");
+
+  int   old_stra1 = -1;
+  int   old_stra2 = -1;  // True strand ordinals are non-negative.
+  char  linebuffer[BUFFERSIZE] = {0};
+  long  matchid = 0;
+
+  StrandPair *sp = new StrandPair(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+  TheStats   *ts = new TheStats(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+
+  bool endOfInput = false;
+
+  while (!endOfInput) {
+    int   new_stra1  = -1;
+    int   new_stra2  = -1;
+    int   xln        = 0;
+    int   yln        = 0;
+    int   tmp_xlo    = 0;
+    int   tmp_ylo    = 0;
+    int   tmp_filled = 0;      // This is never changed!
+    char  tmp_ori    = 0;
+    bool  gotMatch   = false;  // Set only when this pass parsed an 'M' record.
+
+    endOfInput = (fgets(linebuffer, BUFFERSIZE, inpF) == 0L);
+
+    if (endOfInput) {
+      // linebuffer still holds the previous line; don't parse it again.
+    } else if (linebuffer[0] == 'M') {
+      char  classCode;
+      char  subtype;
+      char  selfId[100];
+      char  parentId[100];
+      char  new_ass1[100];
+      char  new_ass2[100];
+      int   xfl = 0;
+      int   yfl = 0;
+
+      // The %99s widths keep a long token inside the 100 byte buffers.
+      if (12 != sscanf(linebuffer,
+                       "%c %c %99s %99s %99s %d %d %d %99s %d %d %d\n",
+                       &classCode, &subtype, selfId, parentId,
+                       new_ass1, &tmp_xlo, &xln, &xfl,
+                       new_ass2, &tmp_ylo, &yln, &yfl)) {
+        fprintf(stderr, "WARNING: short read on '%s'\n", linebuffer);
+        continue;  // Fields are incomplete; don't build a hit from them.
+      }
+
+      if ((xfl != 1 && xfl != -1) ||
+          (yfl != 1 && yfl != -1)) {
+        fprintf(stderr, "ERROR: orientation wrong.\n%s\n", linebuffer);
+        exit(1);
+      }
+
+      tmp_ori = (xfl == yfl ? 'f' : 'r');
+
+      // Parse the IID out of the ID; it follows the last ':'.
+      char *colon1 = strrchr(new_ass1, ':');
+      char *colon2 = strrchr(new_ass2, ':');
+      if (colon1)  new_stra1 = atoi(colon1 + 1);
+      if (colon2)  new_stra2 = atoi(colon2 + 1);
+
+      gotMatch = true;
+    } else if ((linebuffer[0] == '#') || (linebuffer[0] == '!') || (linebuffer[0] == '/')) {
+      fprintf(stderr,"%s",linebuffer);
+    } else {
+      fprintf(stderr, "UNRECOGNIZED: %s", linebuffer);
+    }
+
+    // A change of strand pair, or the end of the input, finishes the
+    // current StrandPair: score it, write it, start a fresh one.
+    if ((new_stra1 != old_stra1) ||
+        (new_stra2 != old_stra2) || endOfInput) {
+      sp->process();
+      matchid = sp->print(outF, matchid);
+
+      ts->add(sp);
+
+      delete sp;
+      sp = new StrandPair(beVerbose, assemblyId1, assemblyId2, maxJump, minScore);
+    }
+
+    // Add the hit to the sp if we just read a point
+    if (gotMatch) {
+      sp->addHit(tmp_ori,
+                 new_stra1, tmp_xlo, xln,
+                 new_stra2, tmp_ylo, yln,
+                 tmp_filled);
+
+      old_stra1 = new_stra1;
+      old_stra2 = new_stra2;
+    }
+  }
+
+  ts->add(sp);
+  ts->show(outF);
+
+  delete sp;
+  delete ts;
+
+  fclose(inpF);
+  fclose(outF);
+  return(0);
+}
diff --git a/seatac/heavychains.C b/seatac/heavychains.C
new file mode 100644
index 0000000..7445666
--- /dev/null
+++ b/seatac/heavychains.C
@@ -0,0 +1,191 @@
+// This file is part of A2Amapper.
+// Copyright (c) 2004 Applera Corporation
+// Copyright (c) 2005 The J. Craig Venter Institute
+// Author: Clark Mobarry
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <errno.h>
+#include "heavychains.H"
+
+
+
+// The following would need to parameterized for a general kD tree.
+// Could we use one function with a static variable to remember the
+// sorting direction?
+//
+int x_compar(const void *x,const void *y) {
+  // qsort() comparator: orders Match records by ascending xhi.
+  const Match *lhs = (const Match *)x;
+  const Match *rhs = (const Match *)y;
+
+  return (lhs->xhi > rhs->xhi) - (lhs->xhi < rhs->xhi);
+}
+
+int y_compar(const void *x,const void *y) {
+  // qsort() comparator: orders Match records by ascending yhi.
+  const Match *lhs = (const Match *)x;
+  const Match *rhs = (const Match *)y;
+
+  return (lhs->yhi > rhs->yhi) - (lhs->yhi < rhs->yhi);
+}
+
+
+
+void StrandPair::addHit(char direction,
+                        uint32 id1,
+                        uint32 xlo,
+                        uint32 xln,
+                        uint32 id2,
+                        uint32 ylo,
+                        uint32 yln,
+                        uint32 filled) {
+  // Builds a Match for the hit and appends it to P.  The StrandPair
+  // takes on the sequence ids of the most recently added hit.
+  Match hit;
+
+  hit.xlo = xlo;
+  hit.xhi = xlo + xln;
+  hit.ylo = ylo;
+  hit.yhi = ylo + yln;
+
+  // Use the match lengths to initialize the self scores: the shorter one.
+  hit.selfS = (yln < xln) ? yln : xln;
+
+  // Chain scores are filled in later by process().
+  hit.S   = 0.0;
+  hit.neS = 0;
+  hit.nwS = 0;
+  hit.seS = 0;
+  hit.swS = 0;
+
+  hit.filled = filled;
+  hit.ori    = direction;
+
+  iid1 = id1;
+  iid2 = id2;
+
+  if (beVerbose > 1)
+    fprintf(stderr, "heavychains: add %8d %8d %8d -- %8d %8d %8d\n", id1, hit.xlo, hit.xhi, id2, hit.ylo, hit.yhi);
+
+  Padd(&hit);
+}
+
+
+
+// Scores the hits collected so far: DPTree::treeScore() is run four times,
+// once per reflection of the coordinates. Does nothing when there are no hits.
+void StrandPair::process(void) {
+ int swapi; // Scratch value for the negate-and-swap of lo/hi below.
+
+ if (Plen > 0) {
+ if (beVerbose > 0)
+ fprintf(stderr,"HeavyChains: filtering strands "uint32FMT" "uint32FMT" "uint32FMT"\n", iid1, iid2, Plen);
+
+ DPTree *dp = NULL;
+ dp = new DPTree(Plen, P); // dp works directly on P; treeScore() re-sorts P in place.
+ dp->setParams(maxJump);
+
+ for(int quadrant=0; quadrant < 4; ++quadrant) {
+ if (beVerbose > 1)
+ fprintf(stderr,"HeavyChains: arranging process quadrant %d\n", quadrant);
+
+ if ((quadrant == 0) || (quadrant == 2)) { // Reflect in x: negate both ends and swap them.
+ for (int i=0; i<Plen; ++i) {
+ swapi = -P[i].xlo;
+ P[i].xlo = -P[i].xhi;
+ P[i].xhi = swapi;
+ }
+ } else { // Reflect in y, the same way.
+ for (int i=0; i<Plen; ++i) {
+ swapi = -P[i].ylo;
+ P[i].ylo = -P[i].yhi;
+ P[i].yhi = swapi;
+ }
+ }
+
+ if (beVerbose > 1)
+ fprintf(stderr,"HeavyChains: scoring quadrant\n");
+
+ dp->treeScore(); // Leaves a score in every P[i].S for the current reflection.
+
+ if (beVerbose>1)
+ fprintf(stderr,"HeavyChains: recording scores\n");
+
+ switch(quadrant) { // Keep this pass's scores; the next pass overwrites S.
+ case 0: for (int i=0; i < Plen; ++i) P[i].nwS = P[i].S; break;
+ case 1: for (int i=0; i < Plen; ++i) P[i].swS = P[i].S; break;
+ case 2: for (int i=0; i < Plen; ++i) P[i].seS = P[i].S; break;
+ case 3: for (int i=0; i < Plen; ++i) P[i].neS = P[i].S; break;
+ }
+
+ if (beVerbose > 1)
+ fprintf(stderr,"HeavyChains: done quadrant\n");
+ }
+ // Each axis was negated twice above, so the coordinates are back to their original values.
+ // All output information is now in the match records of P.
+ delete dp;
+ }
+}
+
+
+
+uint64
+StrandPair::print(FILE *outF,
+                  uint64 matchid) {
+  // Writes an 'M x' record to outF for each match whose forward or
+  // reverse score reaches minScore, numbering them from matchid+1 and
+  // updating the length/score statistics.  Returns the updated id.
+  for (int i=0; i<Plen; ++i) {
+    // Symmetrize the forward and backward scores.  Each quadrant score
+    // already contains the self score, so it is subtracted once.
+    double inc = P[i].neS + P[i].swS - P[i].selfS; // forward complement orientations
+    double dec = P[i].seS + P[i].nwS - P[i].selfS; // reverse complement orientations
+
+    if ((inc >= minScore) || (dec >= minScore)) {
+      int len1 = (P[i].xhi-P[i].xlo);
+      int len2 = (P[i].yhi-P[i].ylo);
+      matchid++;
+
+      if (beVerbose > 1)
+        fprintf(stderr, "heavychains: out "uint32FMTW(8)" %8d %8d -- "uint32FMTW(8)" %8d %8d\n",
+                iid1, P[i].xlo, P[i].xhi,
+                iid2, P[i].ylo, P[i].yhi);
+
+      // Test fprintf()'s return value; errno alone cannot show failure.
+      int written = fprintf(outF, "M x H"uint64FMT" . %s:"uint32FMT" %d %d %d %s:"uint32FMT" %d %d %d > /hf=%.1f /hr=%.1f\n",
+                            matchid,
+                            assemblyId1, iid1, P[i].xlo, len1, 1,
+                            assemblyId2, iid2, P[i].ylo, len2, (P[i].ori == 'f'? 1 : -1),
+                            inc, dec);
+      if (written < 0)
+        fprintf(stderr, "StrandPair::print()-- write failed: %s\n", strerror(errno));
+
+      sumlen1 += len1;
+      sumlen2 += len2;
+      maxlen1 = (maxlen1 > len1) ? maxlen1 : len1;
+      maxlen2 = (maxlen2 > len2) ? maxlen2 : len2;
+      maxScoreFwd = (maxScoreFwd > inc) ? maxScoreFwd : inc;
+      maxScoreRev = (maxScoreRev > dec) ? maxScoreRev : dec;
+    }
+  }
+
+  // Report once per strand pair, after the maxima are final.
+  if ((beVerbose > 0) && (Plen > 0))
+    fprintf(stderr, "HeavyChains: finished strands "uint32FMTW(8)" "uint32FMTW(8)" maxlen1=%f maxlen2=%f maxScoreFwd=%f maxScoreRev=%f\n",
+            iid1, iid2, maxlen1, maxlen2, maxScoreFwd, maxScoreRev);
+  return(matchid);
+}
diff --git a/seatac/heavychains.H b/seatac/heavychains.H
new file mode 100644
index 0000000..e6cf8f9
--- /dev/null
+++ b/seatac/heavychains.H
@@ -0,0 +1,462 @@
+#ifndef STRANDPAIR_H
+#define STRANDPAIR_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vector>
+using namespace std;
+
+#include "util.h"
+
+//
+// The StrandPair does the heavy chains filtering, while the TheStats
+// collects statistics on all StrandPairs. TheStats is also
+// responsible for reporting the options used by StrandPair.
+//
+
+
+struct Match {
+ int xlo; // Low x end; StrandPair::addHit() stores the hit's x start here.
+ int ylo; // Low y end; addHit() stores the hit's y start here.
+ int xhi; // addHit() stores x start plus x length; x_compar() sorts on this.
+ int yhi; // addHit() stores y start plus y length; y_compar() sorts on this.
+
+ double selfS; // The intrinsic score of the Match; addHit() sets it to the shorter of the two lengths.
+ double S; // Working score, rewritten by every DPTree::treeScore() run.
+
+ // We need two times the number of dimensions of scores. That is
+ // one score starting from each corner of the bounding box of the
+ // space. StrandPair::process() copies S into one of these after
+ // each of its four scoring passes.
+ double neS;
+ double swS;
+ double nwS;
+ double seS;
+
+ int filled; // Is this the same as selfS ? (Stored by addHit(); not read by the heavychains code in view.)
+
+ char ori; // Direction given to addHit(); StrandPair::print() writes 1 for 'f' and -1 otherwise.
+};
+
+
+
+class StrandPair {
+public:
+  // verbose is a level (0, 1, 2...), so it is taken as uint32: as a bool, every
+  // level above 1 collapsed to 1 and the 'beVerbose > 1' messages never printed.
+  // The ids are copied, truncated to 31 characters; a null id becomes "".
+  StrandPair(uint32  verbose,
+             char   *assemblyid1,
+             char   *assemblyid2,
+             int     maxjump,
+             double  minscore) {
+    beVerbose = verbose;
+    snprintf(assemblyId1, sizeof(assemblyId1), "%s", assemblyid1 ? assemblyid1 : "");  // always terminated
+    snprintf(assemblyId2, sizeof(assemblyId2), "%s", assemblyid2 ? assemblyid2 : "");
+    maxJump   = maxjump;
+    minScore  = minscore;
+
+    Plen = 0;
+    Pmax = 1024;
+    P    = new Match [Pmax];
+
+    clear();
+  };
+
+  ~StrandPair(void) {
+    if (beVerbose > 1)
+      fprintf(stderr, "StrandPair::~StrandPair()-- delete %s vs %s with %d hits\n", assemblyId1, assemblyId2, Plen);
+    delete [] P;
+  };
+
+  void addHit(char direction,
+              uint32 id1,
+              uint32 xlo,
+              uint32 xln,
+              uint32 id2,
+              uint32 ylo,
+              uint32 yln,
+              uint32 filled);
+
+  void process(void);
+
+  uint64 print(FILE *outF, uint64 matchid);
+
+  void clear(void) {      // Forget all hits and statistics; the P array is kept for reuse.
+    iid1 = ~uint32ZERO;
+    iid2 = ~uint32ZERO;
+    _next = 0L;
+    sumlen1 = 0.0;
+    sumlen2 = 0.0;
+    maxlen1 = 0.0;
+    maxlen2 = 0.0;
+    maxScoreFwd = 0.0;  // Maximum forward chain score for the strand pair.
+    maxScoreRev = 0.0;  // Maximum reverse chain score for the strand pair.
+    Plen = 0;
+  };
+
+  // The StrandPairManager (in filter-strandpair.H) is lazy and makes
+  // StrandPairs keep track of the next one.
+  //
+  StrandPair *_next;
+  StrandPair *next(void) { return(_next); };
+  void addNext(StrandPair *n) { _next = n; };
+
+  uint32 sequenceIID1(void) { return(iid1); };
+  //uint32 sequenceIID2(void) { return(iid2); };
+
+  double getsumlen1(void) const { return(sumlen1); };
+  double getsumlen2(void) const { return(sumlen2); };
+  double getmaxlen1(void) const { return(maxlen1); };
+  double getmaxlen2(void) const { return(maxlen2); };
+  double getmaxScoreFwd(void) const { return(maxScoreFwd); };
+  double getmaxScoreRev(void) const { return(maxScoreRev); };
+
+private:
+  // This used to use a vector<Match>, but DPTree wants a pointer to the
+  // array of matches
+  //
+  int Plen;    // Matches in use.
+  int Pmax;    // Matches allocated.
+  Match *P;
+
+  void Padd(Match *m) {    // Append a copy of *m, doubling the array when it is full.
+    if (Plen >= Pmax) {
+      Pmax *= 2;
+      Match *n = new Match [Pmax];
+      memcpy(n, P, sizeof(Match) * Plen);
+      delete [] P;
+      P = n;
+    }
+    memcpy(P+Plen, m, sizeof(Match));
+    Plen++;
+  };
+
+private:
+  uint32 iid1;       // Sequence ids of the most recently added hit; all ones after clear().
+  uint32 iid2;
+  uint32 beVerbose;  // Verbosity level; messages are gated on > 0 and > 1.
+  char assemblyId1[32];
+  char assemblyId2[32];
+  int maxJump; // Default maximum intra-run jump allowed in a good run.
+  double minScore; // Default minimum of bp filled in a good run.
+
+  // The following are only known after StrandPair::print().
+
+  double sumlen1;
+  double sumlen2;
+  double maxlen1;
+  double maxlen2;
+  double maxScoreFwd; // Maximum forward chain score for the strand pair.
+  double maxScoreRev; // Maximum reverse chain score for the strand pair.
+};
+
+
+
+
+
+
+class TheStats {
+public:
+  TheStats(bool verbose,
+           char *assemblyid1,
+           char *assemblyid2,
+           int maxjump,
+           double minscore) {
+    beVerbose = verbose;
+    // Note the cap 'i'.  snprintf() always terminates the copy (strncpy()
+    // does not when the id has 31 or more characters); a null id becomes "".
+    snprintf(assemblyId1, sizeof(assemblyId1), "%s", assemblyid1 ? assemblyid1 : "");
+    snprintf(assemblyId2, sizeof(assemblyId2), "%s", assemblyid2 ? assemblyid2 : "");
+    maxJump = maxjump;
+    minScore = minscore;
+    sumlen1 = 0.0;
+    sumlen2 = 0.0;
+    sumMaxLen1 = 0.0;
+    sumMaxLen2 = 0.0;
+    sumMaxScoreFwd = 0.0;
+    sumMaxScoreRev = 0.0;
+  };
+
+  void add(StrandPair *sp) {    // Fold one StrandPair's statistics into the running totals.
+    sumlen1 += sp->getsumlen1();
+    sumlen2 += sp->getsumlen2();
+    sumMaxLen1 += sp->getmaxlen1();
+    sumMaxLen2 += sp->getmaxlen2();
+    sumMaxScoreFwd += sp->getmaxScoreFwd();
+    sumMaxScoreRev += sp->getmaxScoreRev();
+  };
+
+  void show(FILE *outfile) {    // Write the parameters and totals as '/key=value' lines.
+    fprintf(outfile, "/assemblyId1=%s\n", assemblyId1);
+    fprintf(outfile, "/assemblyId2=%s\n", assemblyId2);
+    fprintf(outfile, "/heavyMaxJump=%d\n", maxJump);
+    fprintf(outfile, "/heavyMinFill=%f\n", minScore);
+    fprintf(outfile, "/heavySumLen1=%f\n", sumlen1);
+    fprintf(outfile, "/heavySumLen2=%f\n", sumlen2);
+    fprintf(outfile, "/heavySumMaxLen1=%f\n", sumMaxLen1);
+    fprintf(outfile, "/heavySumMaxLen2=%f\n", sumMaxLen2);
+    fprintf(outfile, "/heavySumMaxScoreFwd=%f\n", sumMaxScoreFwd);
+    fprintf(outfile, "/heavySumMaxScoreRev=%f\n", sumMaxScoreRev);
+  };
+
+private:
+  // Parameters to the filter
+  int beVerbose;
+  char assemblyId1[32];
+  char assemblyId2[32];
+  int maxJump;
+  double minScore;
+  double sumlen1;         // Totals accumulated by add(), reported by show().
+  double sumlen2;
+  double sumMaxLen1;
+  double sumMaxLen2;
+  double sumMaxScoreFwd;
+  double sumMaxScoreRev;
+};
+
+
+
+
+
+struct Interval {
+ int lo; // Set by DPTree::get_bbox(): smallest coordinate under the node, in the node's own dimension.
+ int hi; // Set by DPTree::get_bbox(): largest coordinate under the node, in the node's own dimension.
+ double S; // Reset to -1 by DPTree::init(), then raised by privScore() to the largest match score under the node.
+
+ Interval() {};
+ // This is an explicit redefinition of the default constructor. It leaves lo, hi and S uninitialized.
+};
+
+int x_compar(const void *x,const void *y);
+int y_compar(const void *x,const void *y);
+
+class DPTree { // Scores chains of Match records with a kd-style tree laid over the match array.
+ Interval *node; // One Interval per tree node, indexed by kd_node::intv; allocated and freed here.
+ Match *match; // The caller's match array; sorted and scored in place, never freed here.
+ int node_size; // Number of entries allocated in node[].
+ int match_size; // The number of matches stored in the tree.
+
+ // DP parameters
+ int MaxJump; // Set only by setParams(); the constructor leaves it uninitialized.
+
+ struct kd_node {
+ bool Xy; // true: this node works on x, false: on y. Flipped in each child.
+ int start,stop; // The indices to define a segment of the vector: match[start] up to, not including, match[stop].
+ int intv; // Index of this node's Interval in node[]; the children use 2*intv+1 and 2*intv+2.
+ kd_node() {};
+ // This is an explicit redefinition of the default constructor. It initializes nothing.
+
+ inline int nmatches() const {return stop-start;}
+ inline int midpoint() const {return (start+stop+1)/2;}
+ // This is the midpoint of the match list, not the midpoint of the region.
+
+ inline bool lesser_is_node() const {return nmatches() > 2;} // The lesser half holds (n+1)/2 matches: two or more once n > 2.
+ inline bool greater_is_node() const {return nmatches() > 3;} // The greater half holds n/2 matches: two or more once n > 3.
+
+ inline bool X() const {return Xy;}
+ inline bool Y() const {return !Xy;}
+
+ kd_node lesser_node() const { // Child covering match[start] up to the midpoint.
+ kd_node ret(*this); // This is an explicit invocation of the copy constructor filled with "this".
+ // Equivalent to: kd_node ret = (*this);
+ ret.intv = ret.intv*2+1;
+ ret.stop = midpoint();
+ ret.Xy = !ret.Xy;
+ return ret;
+ }
+ kd_node greater_node() const { // Child covering the midpoint up to stop.
+ kd_node ret(*this); // This is an explicit invocation of the copy constructor filled with "this".
+ // Equivalent to: kd_node ret = (*this);
+ ret.intv = ret.intv*2+2;
+ ret.start = midpoint();
+ ret.Xy = !ret.Xy;
+ return ret;
+ }
+ inline kd_node child_node(int x) const { // The child whose half contains match index x.
+ if (x < midpoint())
+ return lesser_node();
+ else
+ return greater_node();
+ }
+
+ // root is always real, and everyone else must have at least 2 points
+ inline bool is_real() const {return intv==0 || nmatches() > 1;}
+ };
+
+ kd_node root() const { // The node covering every match: interval 0, working on x.
+ kd_node ret;
+ ret.start = 0;
+ ret.stop = match_size;
+ ret.intv = 0;
+ ret.Xy = true;
+ return ret; // Return the structure by value.
+ }
+
+public:
+
+ ~DPTree() {
+ delete[] node; // Only node[] is owned; match belongs to the caller.
+ }
+
+ DPTree(unsigned siz, Match *p): node(NULL),match(p),
+ node_size(0),match_size(siz) {
+ node_size = 2;
+ for(unsigned sz = match_size; sz>1 ; sz/=2, node_size *= 2); // Double node_size once per halving of match_size.
+ node = new Interval[node_size];
+ }
+
+ inline void setParams(int mj) { // Must be called before treeScore(); nothing else sets MaxJump.
+ MaxJump = mj;
+ }
+
+ double treeScore() { // Sorts the matches, scores each one, and returns the root's S (the largest match score).
+ init(); // NOTE(review): with match_size == 0, init() still calls get_bbox(), which reads match[0] and match[-1].
+ if (match_size > 0)
+ privScore(root(),root());
+
+ return node[root().intv].S;
+ }
+
+private:
+
+ inline double pairScore(const Match &pl,const Match &ph) const { // Score ph gets by following pl, or 0.
+ const int dx = ph.xlo - pl.xlo;
+ const int dy = ph.ylo - pl.ylo; // causality difference
+
+ const int ix = ph.xlo - pl.xhi;
+ const int iy = ph.ylo - pl.yhi;
+
+ const int smaller_jump = (ix < iy)?ix:iy; // will be < 0 if they intersect
+ const int larger_jump = (ix < iy)?iy:ix; // must be < MaxJump for an interaction
+
+ int intersection = smaller_jump * (smaller_jump < 0); // The overlap as a negative number, or 0 when there is none.
+
+ return (dx >= 0 && dy >= 0 && larger_jump < MaxJump) // The condition multiplies as 1 or 0.
+ * (pl.S + intersection );
+ }
+
+ inline bool pruneScore(kd_node f,
+ const Match &p) const {
+ int d,jd; // d: p's low end minus the node's lo; jd: p's low end minus the node's hi.
+
+ if (f.X()) {
+ jd = p.xlo - node[f.intv].hi;
+ d = p.xlo - node[f.intv].lo;
+ }
+ else {
+ jd = p.ylo - node[f.intv].hi;
+ d = p.ylo - node[f.intv].lo;
+ }
+ // returns true if we really need to check this score
+ return (d >= 0 && jd < MaxJump) && (node[f.intv].S >= p.S);
+ }
+
+ double privScore(kd_node flo,kd_node fhi) {
+ // no longer double recursive -- just iterate through fhi
+ for(int x = fhi.start; x < fhi.stop; ++x) {
+ match[x].S = 0;
+ matchScore(flo,match[x]); // Raises match[x].S to the best pairScore() found under flo.
+ match[x].S += match[x].selfS;
+
+ for(kd_node tmp(fhi); tmp.is_real() ; tmp = tmp.child_node(x)) { // Walk from fhi down towards x, raising each node's S.
+ if (node[tmp.intv].S < match[x].S) node[tmp.intv].S = match[x].S;
+ }
+ }
+ return node[fhi.intv].S;
+ }
+
+ double matchScore(kd_node flo,Match &p) { // Searches flo's subtree (greater half first), raising p.S; returns p.S.
+ double score = 0;
+
+ if ( (flo.X() && node[flo.intv].lo <= p.xlo ||
+ flo.Y() && node[flo.intv].lo <= p.ylo)
+ && pruneScore(flo,p) ) {
+
+ if (flo.greater_is_node())
+ score = matchScore(flo.greater_node(),p);
+ else
+ score = pairScore(match[flo.stop-1],p); // Greater half is a single match.
+ if (p.S < score) p.S = score;
+
+ if (flo.lesser_is_node())
+ score = matchScore(flo.lesser_node(),p);
+ else
+ score = pairScore(match[flo.start],p); // Lesser half is a single match.
+
+ if (p.S < score) p.S = score;
+
+ }
+ return p.S;
+ }
+
+ void init() { // Sorts the matches, records the node extents, and resets every score to -1.
+ if (match_size > 0){ sort_nodes(root());}
+
+ int minx=0,miny=0,maxx=0,maxy=0; // initial values will be overwritten
+ get_bbox(root(),minx,miny,maxx,maxy); // NOTE(review): not guarded by match_size > 0, unlike the sort above.
+
+ for (int i=0; i < node_size; ++i) node[i].S = -1;
+ for (int i=0; i < match_size; ++i) match[i].S = -1;
+ }
+
+ void sort_nodes(kd_node fs) { // Sorts fs's range by xhi or yhi (see x_compar/y_compar), then recurses into both halves.
+ if (fs.intv >= node_size) {
+ fprintf(stderr,"overflow %d %d\n",fs.intv,node_size); // Reported only; execution continues.
+ }
+
+ qsort( match+fs.start, fs.nmatches(), sizeof(Match),
+ (fs.X()?x_compar:y_compar) );
+
+ if (fs.greater_is_node()) sort_nodes(fs.greater_node());
+ if (fs.lesser_is_node()) sort_nodes(fs.lesser_node());
+ }
+
+ void get_bbox(kd_node fs,int &minx,int &miny,int &maxx,int &maxy) { // Returns the box around fs's matches and stores fs's extent.
+ int lminx,lminy,lmaxx,lmaxy;
+ int gminx,gminy,gmaxx,gmaxy;
+
+ if (fs.lesser_is_node()) {
+ get_bbox(fs.lesser_node(),lminx,lminy,lmaxx,lmaxy);
+ }
+ else {
+ lminx = match[fs.start].xlo;
+ lmaxx = match[fs.start].xhi;
+ lminy = match[fs.start].ylo;
+ lmaxy = match[fs.start].yhi;
+ }
+
+ if (fs.greater_is_node()) {
+ get_bbox(fs.greater_node(),gminx,gminy,gmaxx,gmaxy);
+ }
+ else {
+ gminx = match[fs.stop-1].xlo;
+ gmaxx = match[fs.stop-1].xhi;
+ gminy = match[fs.stop-1].ylo;
+ gmaxy = match[fs.stop-1].yhi;
+ }
+
+ miny = (lminy < gminy)?lminy:gminy;
+ minx = (lminx < gminx)?lminx:gminx;
+ maxy = (lmaxy > gmaxy)?lmaxy:gmaxy;
+ maxx = (lmaxx > gmaxx)?lmaxx:gmaxx;
+
+ if (fs.X()) { // Only the extent in the node's own dimension is kept.
+ node[fs.intv].lo = minx;
+ node[fs.intv].hi = maxx;
+ }
+ else {
+ node[fs.intv].lo = miny;
+ node[fs.intv].hi = maxy;
+ }
+
+ }
+};
+
+
+
+
+
+#endif // STRANDPAIR_H
diff --git a/seatac/hitMatrix-sort.C b/seatac/hitMatrix-sort.C
new file mode 100644
index 0000000..714fffb
--- /dev/null
+++ b/seatac/hitMatrix-sort.C
@@ -0,0 +1,82 @@
+#include "hitMatrix.H"
+
+// Sort by dsPos
+
+inline
+void
+adjustHeap_dsPos(diagonalLine *L, uint32 p, uint32 n) {
+  // Sift the entry at index p down the max-heap (keyed on _dsPos)
+  // held in L[0..n-1].  The entry is saved field by field, larger
+  // children are pulled up over the hole, and the saved entry is
+  // dropped in where it fits.
+  //
+  uint32 savedQ = L[p]._qsPos;
+  uint32 savedD = L[p]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+  uint32 savedL = L[p]._diagonalID;
+#endif
+
+  for (uint32 kid = (p << 1) + 1; kid < n; kid = (p << 1) + 1) {
+
+    // Prefer the right child when it exists and is larger.
+    if ((kid + 1 < n) && (L[kid]._dsPos < L[kid+1]._dsPos))
+      kid++;
+
+    // Stop once the saved entry is at least as large as that child.
+    if (savedD >= L[kid]._dsPos)
+      break;
+
+    // Otherwise the child moves up and the hole moves down.
+    L[p]._qsPos = L[kid]._qsPos;
+    L[p]._dsPos = L[kid]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+    L[p]._diagonalID = L[kid]._diagonalID;
+#endif
+
+    p = kid;
+  }
+
+  // The hole is now where the saved entry belongs.
+  //
+  L[p]._qsPos = savedQ;
+  L[p]._dsPos = savedD;
+#ifndef WITHOUT_DIAGONALID
+  L[p]._diagonalID = savedL;
+#endif
+}
+
+void
+hitMatrix::sort_dsPos(void) {
+
+  // Heapsort _hits[] into ascending _dsPos order.  Zero or one hit is
+  // already sorted.
+  //
+  if (_hitsLen < 2)
+    return;
+
+  // Create the heap of lines: sift down every entry that has a
+  // child, from the last such entry back to the root.
+  //
+  uint32 parent = _hitsLen / 2;
+
+  while (parent > 0) {
+    parent--;
+    adjustHeap_dsPos(_hits, parent, _hitsLen);
+  }
+
+  // Interchange the new maximum with the element at the end of the
+  // tree, shrink the heap by one, and restore the heap property.
+  //
+  // A diagonalLine holds only plain integer fields (_qsPos, _dsPos
+  // and, unless WITHOUT_DIAGONALID is defined, _diagonalID), so
+  // copying the whole struct moves exactly those fields.
+  //
+  for (uint32 last = _hitsLen - 1; last > 0; last--) {
+    diagonalLine  largest = _hits[0];
+
+    _hits[0]    = _hits[last];
+    _hits[last] = largest;
+
+    adjustHeap_dsPos(_hits, 0, last);
+  }
+}
diff --git a/seatac/hitMatrix.C b/seatac/hitMatrix.C
new file mode 100644
index 0000000..e0c9266
--- /dev/null
+++ b/seatac/hitMatrix.C
@@ -0,0 +1,367 @@
+#include "seatac.H"
+
+
+hitMatrix::hitMatrix(uint32 qsLen, uint32 qsIdx) {
+ _qsLen = qsLen;
+ _qsIdx = qsIdx;
+
+ // Because this is doing scaffolds or chromosomes against more than
+ // 1/4 a genome, we expect a LOT of hits. Start off with a good
+ // amount of memory.
+ //
+ // At 8 bytes per diagonalLine, the 32M entries allocated here take
+ // 256MB (128M of them would be 1GB). Which works great for aligning
+ // mammalian chromosomes and stinks for microbes.
+ _hitsLen = 0;
+ _hitsMax = 32 * 1024 * 1024;
+ _hits = new diagonalLine [_hitsMax];
+}
+
+
+hitMatrix::~hitMatrix() {
+ delete [] _hits; // Releases the hit array allocated with new [].
+}
+
+
+// Utility for sorting the diagonal lines in the hitMatrix
+//
+// The two comparison functions return true if the first line
+// is less than the second line.
+
+#ifdef WITHOUT_DIAGONALID
+
+inline
+int
+compareLines(diagonalLine *A, diagonalLine *B, uint32 qsLen) {
+  // True if A sorts before B: by diagonal, then by query position.
+  uint32 diagA = qsLen - A->_qsPos - 1 + A->_dsPos;
+  uint32 diagB = qsLen - B->_qsPos - 1 + B->_dsPos;
+
+  return((diagA != diagB) ? (diagA < diagB) : (A->_qsPos < B->_qsPos));
+}
+
+inline
+int
+compareLines(uint32 l, uint32 q, diagonalLine *B, uint32 qsLen) {
+  // True if the line with diagonal l and query position q sorts
+  // before B: by diagonal, then by query position.
+  uint32 diagB = qsLen - B->_qsPos - 1 + B->_dsPos;
+  return((l != diagB) ? (l < diagB) : (q < B->_qsPos));
+}
+
+inline
+void
+adjustHeap(diagonalLine *L, int32 p, int32 n, uint32 qsLen) {
+  // Sift the entry at index p down the max-heap in L[0..n-1], ordered
+  // by compareLines().  The entry's diagonal is computed once, up
+  // front, since it is compared against a child at every level.
+  //
+  uint32 savedQ    = L[p]._qsPos;
+  uint32 savedD    = L[p]._dsPos;
+  uint32 savedDiag = qsLen - savedQ - 1 + savedD;
+
+  for (int32 kid = (p << 1) + 1; kid < n; kid = (p << 1) + 1) {
+
+    // Prefer the right child when it exists and is larger.
+    if ((kid + 1 < n) && compareLines(L + kid, L + kid + 1, qsLen))
+      kid++;
+
+    // Stop unless the saved entry sorts before that child.
+    if (!compareLines(savedDiag, savedQ, L + kid, qsLen))
+      break;
+
+    // Otherwise the child moves up over the hole and the hole
+    // moves down to where the child was.
+    L[p]._qsPos = L[kid]._qsPos;
+    L[p]._dsPos = L[kid]._dsPos;
+
+    p = kid;
+  }
+
+  // The hole is now where the saved entry belongs.
+  //
+  L[p]._qsPos = savedQ;
+  L[p]._dsPos = savedD;
+}
+
+
+#else // WITH_DIAGONALID
+
+
+inline
+int
+compareLines(diagonalLine *A, diagonalLine *B) {
+  // True if A sorts before B: by _diagonalID, then by _qsPos.
+  return((A->_diagonalID != B->_diagonalID) ? (A->_diagonalID < B->_diagonalID) : (A->_qsPos < B->_qsPos));
+}
+
+inline
+int
+compareLines(uint32 l, uint32 q, diagonalLine *B) {
+  // True if the line with diagonal id l and query position q sorts before B.
+  return((l != B->_diagonalID) ? (l < B->_diagonalID) : (q < B->_qsPos));
+}
+
+inline
+void
+adjustHeap(diagonalLine *L, int32 p, int32 n) {
+  // Sift the entry at index p down the max-heap in L[0..n-1], ordered
+  // by compareLines().  The entry is saved, larger children are
+  // pulled up over the hole, and the saved entry is dropped in where
+  // it fits.
+  //
+  uint32 savedQ = L[p]._qsPos;
+  uint32 savedD = L[p]._dsPos;
+  uint32 savedL = L[p]._diagonalID;
+
+  for (int32 kid = (p << 1) + 1; kid < n; kid = (p << 1) + 1) {
+
+    // Prefer the right child when it exists and is larger.
+    if ((kid + 1 < n) && compareLines(L + kid, L + kid + 1))
+      kid++;
+
+    // Stop unless the saved entry sorts before that child.
+    if (!compareLines(savedL, savedQ, L + kid))
+      break;
+
+    // Otherwise the child moves up over the hole and the hole
+    // moves down to where the child was.
+    L[p]._qsPos      = L[kid]._qsPos;
+    L[p]._dsPos      = L[kid]._dsPos;
+    L[p]._diagonalID = L[kid]._diagonalID;
+
+    p = kid;
+  }
+
+  // The hole is now where the saved entry belongs.
+  L[p]._qsPos      = savedQ;
+  L[p]._dsPos      = savedD;
+  L[p]._diagonalID = savedL;
+}
+
+
+#endif
+
+
+
+
+
+
+void
+hitMatrix::processMatrix(char direction, filterObj *FO) {
+ // Turns the stored hits into matches, one genome sequence at a time, and passes them to FO->addHit().
+ if (_hitsLen == 0)
+ return;
+
+ // First, sort by the dsPos. This is done so that we can find all the hits for
+ // a specific scaffold.
+ //
+ sort_dsPos();
+
+
+ merCovering IL(config._merSize); // Collects the query positions of the mers in the match being built.
+ uint32 ILlength = 0;
+
+ // Now, while there are hits left....
+ //
+ uint32 firstHit = 0; // Hits firstHit up to, not including, lastHit fall in one sequence.
+ uint32 lastHit = 0;
+ uint32 currentSeq = 0;
+
+ while (firstHit < _hitsLen) {
+
+ // Move the currentSeq until the firstHit is below it.
+ //
+ while ((currentSeq < config._genome->numberOfSequences()) &&
+ (config._genome->startOf(currentSeq) <= _hits[firstHit]._dsPos))
+ currentSeq++;
+
+ //
+ // currentSeq is now the sequence AFTER the one that we want hits in.
+ //
+
+ // Find the first hit that is in currentSeq. If this is the last sequence,
+ // then, of course, all remaining hits are in it.
+ //
+ if (currentSeq < config._genome->numberOfSequences()) {
+ lastHit = firstHit + 1;
+ while ((lastHit < _hitsLen) &&
+ (_hits[lastHit]._dsPos < config._genome->startOf(currentSeq)))
+ lastHit++;
+ } else {
+ lastHit = _hitsLen;
+ }
+
+ // Drop back one sequence; this is the sequence the hits are in.
+ //
+ currentSeq--;
+
+
+ // Adjust the hits to be relative to the start of this sequence
+ //
+ for (uint32 i=firstHit; i<lastHit; i++)
+ _hits[i]._dsPos -= config._genome->startOf(currentSeq);
+
+ // Sort them, if needed.
+ //
+ if (lastHit - firstHit > 1) {
+
+ // We cheat; heapsort isn't too friendly to sorting the middle of
+ // an array, so we make a new array in the middle!
+ //
+ diagonalLine *hitsToSort = _hits + firstHit;
+
+ // Build the heap. I initially thought this could be done at the
+ // same time as the scan for the last hit, but it can't (easily)
+ //
+ for (int32 i=(lastHit - firstHit)/2 - 1; i>=0; i--)
+#ifdef WITHOUT_DIAGONALID
+ adjustHeap(hitsToSort, i, lastHit - firstHit, _qsLen);
+#else
+ adjustHeap(hitsToSort, i, lastHit - firstHit);
+#endif
+
+ // Sort the hits by diagonal. This is the second part of
+ // heap sort -- Interchange the new maximum with the element
+ // at the end of the tree
+ //
+ for (uint32 i=lastHit - firstHit - 1; i>0; i--) {
+ uint32 q = hitsToSort[i]._qsPos;
+ uint32 d = hitsToSort[i]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+ uint32 l = hitsToSort[i]._diagonalID;
+#endif
+
+ hitsToSort[i]._qsPos = hitsToSort[0]._qsPos;
+ hitsToSort[i]._dsPos = hitsToSort[0]._dsPos;
+#ifndef WITHOUT_DIAGONALID
+ hitsToSort[i]._diagonalID = hitsToSort[0]._diagonalID;
+#endif
+
+ hitsToSort[0]._qsPos = q;
+ hitsToSort[0]._dsPos = d;
+#ifndef WITHOUT_DIAGONALID
+ hitsToSort[0]._diagonalID = l;
+#endif
+
+#ifdef WITHOUT_DIAGONALID
+ adjustHeap(hitsToSort, 0, i, _qsLen);
+#else
+ adjustHeap(hitsToSort, 0, i);
+#endif
+ }
+ }
+
+
+ // Filter them. The running match starts out as just the first hit.
+ //
+#ifdef WITHOUT_DIAGONALID
+ uint32 lastDiagonal = _qsLen - _hits[firstHit]._qsPos - 1 + _hits[firstHit]._dsPos;
+#else
+ uint32 lastDiagonal = _hits[firstHit]._diagonalID;
+#endif
+ uint32 qsLow = _hits[firstHit]._qsPos;
+ uint32 qsHigh = _hits[firstHit]._qsPos;
+ uint32 dsLow = _hits[firstHit]._dsPos;
+ uint32 dsHigh = _hits[firstHit]._dsPos;
+
+ IL.clear();
+
+ for (uint32 i=firstHit; i<lastHit; i++) {
+ //fprintf(stdout, "hit[%6u] seq=%8u qs=%5u ds=%5u\n", i, currentSeq, _hits[i]._qsPos, _hits[i]._dsPos);
+
+ //
+ // Extend if on the same diagonal, and consecutive sequence.
+ //
+ if ((lastDiagonal ==
+#ifdef WITHOUT_DIAGONALID
+ (_qsLen - _hits[i]._qsPos - 1 + _hits[i]._dsPos)
+#else
+ _hits[i]._diagonalID
+#endif
+ ) &&
+ (qsLow <= _hits[i]._qsPos) &&
+ (_hits[i]._qsPos <= qsHigh + config._merSize + config._maxGap)) {
+ if (qsLow > _hits[i]._qsPos) qsLow = _hits[i]._qsPos; // Cannot fire: the test above requires qsLow <= _qsPos.
+ if (qsHigh < _hits[i]._qsPos) qsHigh = _hits[i]._qsPos;
+ if (dsLow > _hits[i]._dsPos) dsLow = _hits[i]._dsPos;
+ if (dsHigh < _hits[i]._dsPos) dsHigh = _hits[i]._dsPos;
+ IL.addMer(_hits[i]._qsPos);
+ } else {
+
+ //
+ // Save the match. cut-n-paste with below.
+ //
+
+ ILlength = IL.sumOfLengths();
+ IL.clear();
+
+ if (ILlength >= config._minLength) { // Matches scoring below _minLength are dropped.
+ if (direction == 'r') {
+ FO->addHit(direction,
+ config._genome->IIDOf(currentSeq),
+ dsLow,
+ dsHigh - dsLow + config._merSize,
+ _qsIdx,
+ _qsLen - qsHigh - config._merSize, // presumably the start counted from the other end of the query -- TODO confirm
+ qsHigh - qsLow + config._merSize,
+ ILlength);
+ } else {
+ FO->addHit(direction,
+ config._genome->IIDOf(currentSeq),
+ dsLow,
+ dsHigh - dsLow + config._merSize,
+ _qsIdx,
+ qsLow,
+ qsHigh - qsLow + config._merSize,
+ ILlength);
+ }
+ }
+
+#ifdef WITHOUT_DIAGONALID
+ lastDiagonal = _qsLen - _hits[i]._qsPos - 1 + _hits[i]._dsPos;
+#else
+ lastDiagonal = _hits[i]._diagonalID;
+#endif
+ qsLow = _hits[i]._qsPos; // Start a new match at this hit.
+ qsHigh = _hits[i]._qsPos;
+ dsLow = _hits[i]._dsPos;
+ dsHigh = _hits[i]._dsPos;
+ IL.addMer(_hits[i]._qsPos);
+ }
+ }
+
+ // Save the final cluster? (cut-n-paste from above)
+ //
+ ILlength = IL.sumOfLengths();
+ IL.clear();
+
+ if (ILlength >= config._minLength) {
+ if (direction == 'r') {
+ FO->addHit(direction,
+ config._genome->IIDOf(currentSeq),
+ dsLow,
+ dsHigh - dsLow + config._merSize,
+ _qsIdx,
+ _qsLen - qsHigh - config._merSize,
+ qsHigh - qsLow + config._merSize,
+ ILlength);
+ } else {
+ FO->addHit(direction,
+ config._genome->IIDOf(currentSeq),
+ dsLow,
+ dsHigh - dsLow + config._merSize,
+ _qsIdx,
+ qsLow,
+ qsHigh - qsLow + config._merSize,
+ ILlength);
+ }
+ }
+
+ // All done with these hits. Move to the next set.
+ //
+ firstHit = lastHit;
+ }
+}
+
+
diff --git a/seatac/hitMatrix.H b/seatac/hitMatrix.H
new file mode 100644
index 0000000..430a9de
--- /dev/null
+++ b/seatac/hitMatrix.H
@@ -0,0 +1,112 @@
+#ifndef HITMATRIX_H
+#define HITMATRIX_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+
+#include "bio++.H"
+#include "positionDB.H"
+#include "filterObj.H"
+
+// Define this to cut the space required for storing hits by 1/3 --
+// from 12 bytes to 8 bytes -- at a slight computational expense --
+// negligible on real hardware, I hope.
+//
+// The original definition of diagonalID was
+// qsLen - qsPos - 1 + dsPos
+// but qsLen is fixed for everyone, so we could reduce it to
+// dsPos - qsPos
+// but that's not unsigned.
+//
+// Results: on a human mapping, using chromosomes as the stream and
+// the whole human as the table (so we need to actually store a large
+// number of hits), we see a savings of 2GB and a small drop in
+// runtime. Process size went from 20.7GB to 18.7GB, CPU time from
+// 20578 to 20193 seconds (833MHz EV6.8AL (21264B)).
+//
+#define WITHOUT_DIAGONALID
+
+
+// One stored hit: a query position paired with a database position.
+// Records are filled in by hitMatrix::addHits().
+//
+struct diagonalLine {
+ // Position in the query; addHits() stores its 'qi' argument here.
+ uint32 _qsPos;
+ // Position in the database; addHits() stores one entry of 'ps' here.
+ uint32 _dsPos;
+#ifndef WITHOUT_DIAGONALID
+ // Cached diagonal, qsLen - qsPos - 1 + dsPos.  Only stored when
+ // WITHOUT_DIAGONALID is not defined.
+ uint32 _diagonalID;
+#endif
+};
+
+
+// Collects the hits found for one query sequence and later turns
+// them into matches reported to a filterObj.
+//
+class hitMatrix {
+public:
+ // qsLen is the length of the query, qsIdx its index (see the
+ // private members below).
+ hitMatrix(uint32 qsLen,
+ uint32 qsIdx);
+ ~hitMatrix();
+
+ // Append 'cn' hits: query position 'qi' paired with each of the
+ // first 'cn' database positions in 'ps'.
+ void addHits(uint32 qi,
+ uint64 *ps,
+ uint64 cn);
+
+ // NOTE(review): presumably these sort _hits by diagonal and by
+ // database position; the definitions are not in this header.
+ void sort_diagonal(void);
+ void sort_dsPos(void);
+
+ // Process the stored hits and add the resulting matches to FO.
+ // 'direction' is 'f' or 'r' as passed by doSearch().
+ void processMatrix(char direction, filterObj *FO);
+
+private:
+ uint32 _qsLen; // Seq Len of Q
+ uint32 _qsIdx; // Index of Q in the FastA
+
+ // Instead of building the lines during add(), we store
+ // the information used to build lines, and then build them
+ // in chain(). This was done to reduce simultaneous memory
+ // usage, as the lineArrayMap and etc take up considerable space.
+ //
+ // _hitsLen is the number of records in use, _hitsMax the number
+ // allocated; addHits() grows the array when it would fill up.
+ uint32 _hitsLen;
+ uint32 _hitsMax;
+ diagonalLine *_hits;
+};
+
+
+// Append 'cn' hits to the matrix: query position 'qi' paired with
+// each of the database positions ps[0..cn-1].
+//
+// The hit array is grown (roughly doubled, plus room for the new
+// hits) whenever the new hits would fill it.  Hit counts are stored
+// in 32 bits, so the process exits with an error if the array would
+// need more than 2^32-1 entries, or if the allocation fails.
+//
+inline
+void
+hitMatrix::addHits(uint32  qi,
+                   uint64 *ps,
+                   uint64  cn) {
+
+  if ((_hitsLen + cn) >= _hitsMax) {
+
+    // Do the growth arithmetic in 64 bits; 'cn' is 64-bit and the
+    // old code silently truncated it (and could wrap _hitsMax).
+    //
+    uint64 newMax = (uint64)_hitsMax + (uint64)_hitsMax + cn;
+
+    if (newMax > (uint64)0xffffffff) {
+      fprintf(stderr, "hitMatrix::addHits()-- too many hits to store in %s at line %d.\n", __FILE__, __LINE__);
+      fprintf(stderr, "hitMatrix::addHits()-- have " uint32FMT " hits, tried to add " uint64FMT " more\n", _hitsLen, cn);
+      exit(1);
+    }
+
+    _hitsMax = (uint32)newMax;
+
+    diagonalLine *h = 0L;
+    try {
+      h = new diagonalLine [_hitsMax];
+    } catch (std::bad_alloc const &) {
+      fprintf(stderr, "hitMatrix::addHits()-- caught std::bad_alloc in %s at line %d.\n", __FILE__, __LINE__);
+      fprintf(stderr, "hitMatrix::addHits()-- have " uint32FMT " hits, tried to add " uint64FMT " more\n", _hitsLen, cn);
+      exit(1);
+    }
+
+    // Carry the existing hits over to the new array.
+    //
+    for (uint32 z=0; z<_hitsLen; z++)
+      h[z] = _hits[z];
+
+    delete [] _hits;
+
+    _hits = h;
+  }
+
+  for (uint64 i=0; i<cn; i++) {
+    _hits[_hitsLen]._qsPos      = (uint32)(qi);
+    _hits[_hitsLen]._dsPos      = (uint32)(ps[i]);
+#ifndef WITHOUT_DIAGONALID
+    _hits[_hitsLen]._diagonalID = (uint32)(_qsLen - qi - 1 + ps[i]);
+#endif
+    _hitsLen++;
+  }
+}
+
+
+#endif // HITMATRIX_H
diff --git a/seatac/posix.H b/seatac/posix.H
new file mode 100644
index 0000000..e69de29
diff --git a/seatac/seatac.C b/seatac/seatac.C
new file mode 100644
index 0000000..4a3d5ca
--- /dev/null
+++ b/seatac/seatac.C
@@ -0,0 +1,269 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+#include "seatac.H"
+
+
+// Shared data
+//
+// These are the definitions of the globals declared extern in
+// seatac.H; they are shared between main(), the loader thread and
+// the search threads.
+//
+configuration config;
+// Query sequences; read by loaderThread().
+seqCache *qsFASTA = 0L;
+// The position table built (or loaded) by main() and searched by
+// doSearch().
+positionDB *positions = 0L;
+volatile uint32 numberOfQueries = 0;
+// output[i] is set by a search thread when query i is finished;
+// main() writes and deletes the entries in index order.
+filterObj **output = 0L;
+// Held while inputHead/inputTail and the input[] slots are updated.
+pthread_mutex_t inputTailMutex;
+// input[] is filled by the loader at inputHead and drained by the
+// search threads at inputTail.
+seqInCore **input = 0L;
+volatile uint32 inputHead = 0;
+volatile uint32 inputTail = 0;
+// Index of the next result main() is waiting to write.
+volatile uint32 outputPos = 0;
+// One statistics string per search thread, allocated by the thread.
+char *threadStats[MAX_THREADS] = { 0L };
+
+
+
+
+
+#ifdef _AIX
+// New-handler installed by main() on AIX: report the failure on
+// stderr, then raise std::bad_alloc so callers see an exception.
+//
+static
+void
+aix_new_handler() {
+  fputs("aix_new_handler()-- Memory allocation failed.\n", stderr);
+  throw std::bad_alloc();
+}
+#endif
+
+
+// seatac driver.
+//
+// Reads the configuration, opens the query sequences, builds (or
+// loads) the position table, starts the loader and search threads,
+// and then acts as the writer: results are written to the output in
+// query order as the search threads publish them in output[].
+//
+// Returns 0 on success; exits with status 1 on usage errors, when
+// too many search threads are requested, or when the output file
+// cannot be opened or written.
+//
+int
+main(int argc, char **argv) {
+
+#ifdef _AIX
+  // By default, AIX Visual Age C++ new() returns 0L; this turns on
+  // exceptions.
+  //
+  std::set_new_handler(aix_new_handler);
+#endif
+
+  // Read the configuration from the command line
+  //
+  if (argc < 2) {
+    config.usage(argv[0]);
+    exit(1);
+  }
+  config.read(argc, argv);
+
+  // Each search thread writes threadStats[i]; that array has only
+  // MAX_THREADS slots, so refuse to start more threads than that.
+  //
+  if (config._numSearchThreads > MAX_THREADS) {
+    fprintf(stderr, "ERROR: " uint32FMT " search threads requested, but at most %d are supported.\n",
+            config._numSearchThreads, MAX_THREADS);
+    exit(1);
+  }
+
+  config._startTime = getTime();
+
+  // Open and init the query sequence
+  //
+  qsFASTA         = new seqCache(config._qsFileName);
+
+  numberOfQueries = qsFASTA->getNumberOfSequences();
+  output          = new filterObj * [numberOfQueries];
+  input           = new seqInCore * [numberOfQueries];
+  inputHead       = 0;
+  inputTail       = 0;
+
+  for (uint32 i=numberOfQueries; i--; ) {
+    output[i] = 0L;
+    input[i]  = 0L;
+  }
+
+  config._initTime = getTime();
+
+  config._genome = new seqStream(config._dbFileName);
+
+  // Create the chunk, returning a positionDB.  Threads will use both
+  // chain and positions to build hitMatrices.
+  //
+  if ((config._tableFileName) && (fileExists(config._tableFileName))) {
+    if (config._tableBuildOnly) {
+      fprintf(stderr, "All done. Table '%s' already build.\n", config._tableFileName);
+      exit(0);
+    } else {
+      fprintf(stderr, "Loading positionDB state from '%s'\n", config._tableFileName);
+      positions = new positionDB(config._tableFileName, config._merSize, config._merSkip, 0);
+    }
+  } else {
+
+    existDB *maskDB = 0L;
+    if (config._maskFileName) {
+      if (config._beVerbose)
+        fprintf(stderr, "Building maskDB from '%s'\n", config._maskFileName);
+      maskDB = new existDB(config._maskFileName, config._merSize, existDBcanonical | existDBcompressHash | existDBcompressBuckets, 0, ~uint32ZERO);
+    }
+
+    existDB *onlyDB = 0L;
+    if (config._onlyFileName) {
+      if (config._beVerbose)
+        fprintf(stderr, "Building onlyDB from '%s'\n", config._onlyFileName);
+      onlyDB = new existDB(config._onlyFileName, config._merSize, existDBcanonical | existDBcompressHash | existDBcompressBuckets, 0, ~uint32ZERO);
+    }
+
+    merStream *MS = new merStream(new kMerBuilder(config._merSize),
+                                  config._genome,
+                                  true, false);
+
+    positions = new positionDB(MS, config._merSize, config._merSkip, maskDB, onlyDB, 0L, 0, 0, 0, 0, config._beVerbose);
+
+    delete MS;
+
+    delete maskDB;
+    delete onlyDB;
+
+    if (config._tableFileName) {
+      if (config._beVerbose)
+        fprintf(stderr, "Dumping positions table to '%s'\n", config._tableFileName);
+
+      positions->saveState(config._tableFileName);
+
+      if (config._tableBuildOnly)
+        exit(0);
+    }
+  }
+
+  config._buildTime = getTime();
+
+
+  //
+  // Initialize threads
+  //
+  pthread_attr_t   threadAttr;
+  pthread_t        threadID;
+
+  pthread_mutex_init(&inputTailMutex, NULL);
+
+  pthread_attr_init(&threadAttr);
+  pthread_attr_setscope(&threadAttr, PTHREAD_SCOPE_SYSTEM);
+  pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED);
+  pthread_attr_setschedpolicy(&threadAttr, SCHED_OTHER);
+
+  // Start the deadlock detection threads
+  //
+#ifdef __alpha
+  fprintf(stderr, "Deadlock detection enabled!\n");
+  pthread_create(&threadID, &threadAttr, deadlockDetector, 0L);
+  pthread_create(&threadID, &threadAttr, deadlockChecker, 0L);
+#endif
+
+  // Start the loader thread
+  //
+  pthread_create(&threadID, &threadAttr, loaderThread, 0L);
+
+  // Start the search threads
+  //
+  for (uint32 i=0; i<config._numSearchThreads; i++) {
+    threadStats[i] = 0L;
+    pthread_create(&threadID, &threadAttr, searchThread, (void *)(unsigned long)i);
+  }
+
+
+  // Open output file.  resultName is what we call the output in
+  // error messages; _outputFileName is NULL when writing to stdout.
+  //
+  FILE        *resultFILE = stdout;
+  char const  *resultName = "(stdout)";
+
+  if (config._outputFileName) {
+    resultName = config._outputFileName;
+
+    // open() reports failure through its return value; errno is only
+    // meaningful after a failure.
+    //
+    int rf = open(config._outputFileName,
+                  O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+    if (rf < 0) {
+      fprintf(stderr, "Couldn't open the output file '%s'?\n%s\n", config._outputFileName, strerror(errno));
+      exit(1);
+    }
+
+    resultFILE = fdopen(rf, "w");
+    if (resultFILE == 0L) {
+      fprintf(stderr, "Couldn't open the output file '%s'?\n%s\n", config._outputFileName, strerror(errno));
+      exit(1);
+    }
+  }
+
+
+  // Dump our information to the output file.
+  //
+  config.writeATACheader(resultFILE);
+
+
+
+  // Initialize the statistics collection object
+  //
+  statObj  *stats = new statObj(config._filterObj, config._filteropts);
+
+  // Wait for threads to produce output
+  //
+  outputPos = 0;
+
+  // The match id of each output record.
+  //
+  uint64  matchID = 0;
+
+  double  zeroTime = getTime() - 0.00000001;
+
+  while (outputPos < numberOfQueries) {
+    if (output[outputPos]) {
+      if (config._beVerbose && ((outputPos & 0x1ff) == 0x1ff)) {
+        fprintf(stderr,
+                "O:" uint32FMTW(7) " S:" uint32FMTW(7) " I:" uint32FMTW(7) " T:" uint32FMTW(7) " (%5.1f%%; %8.3f/sec) Finish in %5.2f seconds.\r",
+                outputPos,
+                inputTail,
+                inputHead,
+                numberOfQueries,
+                100.0 * outputPos / numberOfQueries,
+                outputPos / (getTime() - zeroTime),
+                (numberOfQueries - outputPos) / (outputPos / (getTime() - zeroTime)));
+        fflush(stderr);
+      }
+
+      errno = 0;
+      matchID = output[outputPos]->output(resultFILE, matchID);
+      if (errno) {
+        fprintf(stderr, "Couldn't write to the output file '%s'.\n%d: %s\n",
+                resultName, errno, strerror(errno));
+        exit(1);
+      }
+
+      // Add this set of results to the statistics collector
+      //
+      stats->add(output[outputPos]);
+
+      //stats->show(stderr);
+
+      delete input[outputPos];
+      delete output[outputPos];
+
+      input[outputPos]  = 0L;
+      output[outputPos] = 0L;
+
+      outputPos++;
+    } else {
+      nanosleep(&config._writerSleep, 0L);
+    }
+  }
+
+  if (config._beVerbose) {
+    fprintf(stderr, "\n" uint32FMTW(7) " sequences (%5.1f%%; %8.3f/sec) %5.2f seconds.\n",
+            numberOfQueries,
+            100.0 * outputPos / numberOfQueries,
+            outputPos / (getTime() - zeroTime),
+            getTime() - zeroTime);
+  }
+
+  // Print statistics
+  //
+  stats->show(resultFILE);
+  delete stats;
+
+  // fclose() flushes buffered output, so a failure here means the
+  // results may be incomplete.
+  //
+  if (fclose(resultFILE) != 0)
+    fprintf(stderr, "Couldn't close to the output file '%s'.\n%s\n", resultName, strerror(errno));
+
+  config._searchTime = getTime();
+
+  // Clean up
+  //
+  delete positions;
+
+  pthread_attr_destroy(&threadAttr);
+  pthread_mutex_destroy(&inputTailMutex);
+
+  delete [] input;
+  delete [] output;
+
+  return(0);
+}
diff --git a/seatac/seatac.H b/seatac/seatac.H
new file mode 100644
index 0000000..a1efe74
--- /dev/null
+++ b/seatac/seatac.H
@@ -0,0 +1,168 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <math.h>
+
+// At one time, this was needed for pthread.h or semaphore.h
+typedef unsigned short ushort;
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "bio++.H"
+
+#include "seqCache.H"
+
+#include "existDB.H"
+#include "positionDB.H"
+
+#include "hitMatrix.H"
+
+#include "filterObj.H"
+#include "statObj.H"
+
+// Iterates over the mers of one query sequence.  doSearch() builds
+// one per search direction and calls getMer() until it returns
+// false.
+//
+class encodedQuery {
+private:
+ char const *_seq;
+ uint32 _seqLen;
+ uint32 _merSize;
+ bool _rc;
+
+ uint32 _seqPos;
+
+ // NOTE(review): presumably counts down until a full valid mer has
+ // been accumulated -- confirm against the implementation.
+ int32 _timeUntilValid;
+
+ uint64 _substring;
+ uint64 _mermask;
+public:
+ // seq/seqLen: the query bases and their count; k: the mer size;
+ // rc: true when searching the reverse direction (doSearch()
+ // passes its 'rc' flag here).
+ encodedQuery(char const *seq,
+ uint32 seqLen,
+ uint32 k,
+ bool rc);
+ ~encodedQuery();
+
+ // Fetch the next mer and its position; the caller loops while
+ // this returns true.
+ bool getMer(uint64 &mer, uint32 &pos);
+};
+
+
+
+
+//
+// A singleton for working with the command line parameters.
+//
+#define MAX_THREADS 64
+
+
+// Holds every command line parameter and run-time setting.  A single
+// global instance, 'config', is shared by all threads.
+//
+class configuration {
+public:
+ bool _beVerbose;
+
+ uint32 _merSize;
+ uint32 _merSkip;
+ uint32 _numSearchThreads;
+
+ // Which search directions searchThread() runs for each query.
+ bool _doReverse;
+ bool _doForward;
+
+ uint32 _maxDiagonal;
+ uint32 _maxGap;
+ uint32 _qsOverlap;
+ uint32 _dsOverlap;
+
+ // Matches covering fewer than this many bases are not reported
+ // (see hitMatrix::processMatrix()).
+ uint32 _minLength;
+
+ char *_dbFileName;
+ char *_qsFileName;
+ char *_maskFileName;
+ char *_onlyFileName;
+ char *_outputFileName;
+ char *_statsFileName;
+
+ // If set, the position table is loaded from this file when it
+ // exists, and saved to it after building otherwise.  With
+ // _tableBuildOnly the program exits once the table exists.
+ char *_tableFileName;
+ bool _tableBuildOnly;
+
+ seqStream *_genome;
+
+ // Filter parameters
+ //
+ char *_filtername;
+ char *_filteropts;
+ sharedObj *_filterObj;
+
+ // Wall clock times
+ //
+ double _startTime;
+ double _initTime;
+ double _buildTime;
+ double _searchTime;
+ double _totalTime;
+
+ // Loader parameters
+ //
+ uint32 _loaderHighWaterMark;
+ struct timespec _loaderSleep;
+ bool _loaderWarnings;
+
+ // Search parameters
+ //
+ struct timespec _searchSleep;
+
+ // Output parameters
+ //
+ uint32 _writerHighWaterMark;
+ struct timespec _writerSleep;
+ bool _writerWarnings;
+
+ configuration();
+ ~configuration();
+
+ void usage(char *name);
+ void read(int argc, char **argv);
+ void writeATACheader(FILE *out);
+
+ // Store 't' (seconds, possibly fractional) into 'ts': tv_sec gets
+ // floor(t) and tv_nsec the remaining fraction in nanoseconds.
+ void setTime(struct timespec *ts, double t) {
+ ts->tv_sec = (time_t)floor(t);
+ ts->tv_nsec = (long)((t - ts->tv_sec) * 1e9);
+ };
+};
+
+
+
+
+
+
+
+// Shared data
+//
+extern configuration config;
+
+extern seqCache *qsFASTA; // Used exclusively by thr-loader.C
+
+extern positionDB *positions;
+
+extern volatile uint32 numberOfQueries;
+
+extern filterObj **output;
+
+extern pthread_mutex_t inputTailMutex;
+extern seqInCore **input;
+extern volatile uint32 inputHead;
+extern volatile uint32 inputTail;
+
+extern volatile uint32 outputPos;
+
+extern char *threadStats[MAX_THREADS];
+
+void *deadlockDetector(void *U);
+void *deadlockChecker(void *U);
+
+void *loaderThread(void *U);
+void *searchThread(void *U);
diff --git a/seatac/sharedObj.H b/seatac/sharedObj.H
new file mode 100644
index 0000000..000334d
--- /dev/null
+++ b/seatac/sharedObj.H
@@ -0,0 +1,72 @@
+#ifndef SHAREDOBJ_H
+#define SHAREDOBJ_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <errno.h>
+
+// A wrapper around a shared object.
+//
+// Responsible for opening, accessing and closing a shared object.
+//
+
+// Wraps one dlopen()ed shared object: opens it on construction,
+// looks up symbols on request, and closes it on destruction.
+//
+class sharedObj {
+public:
+
+  // Open the shared object at 'p'.  Exits with status 1 if it cannot
+  // be opened.
+  //
+  sharedObj(char const *p) {
+    path = new char [strlen(p) + 1];
+    strcpy(path, p);
+
+    handle = dlopen(path, RTLD_NOW);
+    if (handle == 0L) {
+      fprintf(stderr, "ERROR: Failed to open shared object '%s'\n%s\n", path, dlerror());
+      exit(1);
+    }
+  };
+
+  // Close the shared object; a failure to close is only a warning.
+  //
+  ~sharedObj() {
+    if (dlclose(handle)) {
+      fprintf(stderr, "WARNING: Failed to close shared object '%s'\n%s\n", path, dlerror());
+    }
+    delete [] path;
+  };
+
+  // Return true if 'name' resolves to a non-null address in the
+  // shared object.  A missing symbol is an answer, not an error, so
+  // this never exits.
+  //
+  bool exists(char const *name) {
+    return(dlsym(handle, name) != 0L);
+  }
+
+  // Return the address of symbol 'name'.  Exits with status 1 if the
+  // lookup fails or the symbol resolves to a null address.
+  //
+  void *get(char const *name) {
+
+    // dlsym() does not report errors through errno.  The documented
+    // protocol is: clear any pending error with dlerror(), call
+    // dlsym(), then ask dlerror() whether that call failed.
+    //
+    dlerror();
+
+    void        *ptr = dlsym(handle, name);
+    char const  *err = dlerror();
+
+    if (err != 0L) {
+      fprintf(stderr, "ERROR: Failed to find symbol '%s' in shared object '%s'\n%s\n", name, path, err);
+      exit(1);
+    }
+    if (ptr == 0L) {
+      fprintf(stderr, "ERROR: Symbol '%s' not present in shared object '%s'\n", name, path);
+      exit(1);
+    }
+
+    return(ptr);
+  };
+
+private:
+  char *path;     // Our own copy of the path given to the constructor
+  void *handle;   // The dlopen() handle
+};
+
+
+#endif // SHAREDOBJ_H
diff --git a/seatac/statObj.H b/seatac/statObj.H
new file mode 100644
index 0000000..801fbe5
--- /dev/null
+++ b/seatac/statObj.H
@@ -0,0 +1,73 @@
+#ifndef STATOBJ_H
+#define STATOBJ_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sharedObj.H"
+
+
+// Statistics collector.  All work is forwarded to four functions
+// looked up in a shared object; with no shared object every
+// operation is a no-op.
+//
+class statObj {
+public:
+ // 'so' may be null; 'soOpts' may be null and is copied.
+ statObj(sharedObj *so, char *soOpts);
+ ~statObj();
+
+ // Pass the filter's handle to the shared object's add function,
+ // if one was loaded.
+ void add(filterObj *FO) {
+ if (soadd)
+ (*soadd)(handle, FO->handle);
+ };
+
+ // Ask the shared object to write its statistics to 'file', if a
+ // show function was loaded.
+ void show(FILE *file) {
+ if (soshow)
+ (*soshow)(handle, file);
+ };
+
+private:
+ char *soOpts;
+ sharedObj *so;
+
+ // Whatever the shared object's constructor function returned.
+ void *handle;
+
+ // Entry points looked up in the shared object by the constructor.
+ void * (*soconstruct)(char *);
+ void (*sodestruct)(void *);
+ void (*soadd)(void *, void *);
+ void (*soshow)(void *, FILE *);
+};
+
+
+// Build a statistics collector.  The option string, if given, is
+// copied.  If a shared object is supplied, its four entry points are
+// resolved and its constructor is called with our copy of the
+// options; otherwise everything is left null.
+//
+inline
+statObj::statObj(sharedObj *so_, char *op_) {
+
+  // Start from a fully empty object.
+  //
+  soOpts      = 0L;
+  so          = so_;
+  handle      = 0L;
+  soconstruct = 0L;
+  soadd       = 0L;
+  soshow      = 0L;
+  sodestruct  = 0L;
+
+  // Keep a private copy of the options.
+  //
+  if (op_ != 0L) {
+    size_t  optLen = strlen(op_) + 1;
+
+    soOpts = new char [optLen];
+    strcpy(soOpts, op_);
+  }
+
+  if (so == 0L)
+    return;
+
+  // Resolve the entry points, then let the shared object construct
+  // its own state.
+  //
+  soconstruct = (void* (*)(char*))        so->get("constructStats");
+  sodestruct  = (void  (*)(void*))        so->get("destructStats");
+  soadd       = (void  (*)(void*, void*)) so->get("addStats");
+  soshow      = (void  (*)(void*, FILE*)) so->get("showStats");
+
+  handle = soconstruct(soOpts);
+}
+
+
+// Let the shared object (if any) release its state, then free our
+// copy of the options.
+//
+inline
+statObj::~statObj() {
+  if (sodestruct != 0L) {
+    sodestruct(handle);
+  }
+
+  delete [] soOpts;
+}
+
+
+#endif // STATOBJ_H
diff --git a/seatac/summarizeAtacStats.pl b/seatac/summarizeAtacStats.pl
new file mode 100644
index 0000000..dc91b19
--- /dev/null
+++ b/seatac/summarizeAtacStats.pl
@@ -0,0 +1,155 @@
+#!/usr/local/bin/perl
+
+
+
+
+# generates summary of the meryl min phase
+#
+# For every file matching min*stats, report the user time, system
+# time and maximum RSS found in it.
+
+# The column labels are strings, so they must be printed with %s;
+# with %d every label came out as 0.
+printf("\n%-30s %5s %5s %8s\n",
+       "meryl min", "user", "sys", "maxRSS");
+
+open(F, "ls min*stats |");
+while (<F>) {
+    chomp $_;
+    my $file = $_;
+
+    my $ut = 0;    # userTime
+    my $st = 0;    # systemTime
+    my $mr = 0;    # maxrss
+
+    open(G, "< $file");
+    while (<G>) {
+        if (m/userTime:\s+(\d+)/) {
+            $ut = $1;
+        }
+        if (m/systemTime:\s+(\d+)/) {
+            $st = $1;
+        }
+        if (m/maxrss:\s+(\d+)/) {
+            $mr = $1;
+        }
+    }
+    close(G);
+
+    printf("%-30s %5d %5d %8d\n",
+           $file, $ut, $st, $mr);
+}
+close(F);
+
+
+
+
+
+
+# generates summary of the build phase
+#
+# One row per *build*out file: times, memory, mer counts and the
+# sizes allocated for buckets and positions.
+
+printf("\n%-30s %5s %5s %5s %8s %10s %10s %10s %8s %8s\n",
+       "seatac build", "user", "sys", "wall", "maxRSS", "totMer", "distinctMer", "uniqueMer", "bktSize", "posnSize");
+
+open(F, "ls *build*out |");
+while (<F>) {
+    chomp $_;
+    my $file = $_;
+
+    # Resource usage.
+    my ($ut, $st, $mr, $bt) = (0, 0, 0, 0);
+
+    # Mer counts and table sizes.
+    my ($tm, $dm, $um, $b, $p) = (0, 0, 0, 0, 0);
+
+    open(G, "< $file");
+    while (<G>) {
+        $ut = $1  if (m/userTime:\s+(\d+)/);
+        $st = $1  if (m/systemTime:\s+(\d+)/);
+        $mr = $1  if (m/maxrss:\s+(\d+)/);
+        $bt = $1  if (m/build:\s+(\d+)/);
+
+        $tm = $1  if (m/Found\s+(\d+)\s+total/);
+        $dm = $1  if (m/Found\s+(\d+)\s+distinct/);
+        $um = $1  if (m/Found\s+(\d+)\s+unique/);
+        $b  = $1  if (m/Allocated\s+(\d+)\s*KB\s+for\s+buckets/);
+        $p  = $1  if (m/Allocated\s+(\d+)\s*KB\s+for\s+positions/);
+    }
+    close(G);
+
+    printf("%-30s %5d %5d %5d %8d %10d %10d %10d %8dKB %8dKB\n",
+           $file, $ut, $st, $bt, $mr, $tm, $dm, $um, $b, $p);
+}
+close(F);
+
+# generates summary of the search phase
+#
+# One row per *segment*stats file: times, the ratio of user time to
+# search and total time, and maximum RSS.
+
+printf("\n%-50s %5s %5s %5s %5s %5s %9s %9s %10s\n",
+       "seatac search", "user", "sys", "build", "srch", "total", "usr/srch", "usr/totl", "maxRSS");
+
+open(F, "ls *segment*stats |");
+while (<F>) {
+    chomp $_;
+    my $file = $_;
+
+    my $ut = 0;
+    my $st = 0;
+    my $mr = 0;
+
+    my $btt = 0;
+    my $stt = 0;
+    my $ttt = 0;
+
+    open(G, "< $file");
+    while (<G>) {
+        if (m/userTime:\s+(\d+)/) {
+            $ut = $1;
+        }
+        if (m/systemTime:\s+(\d+)/) {
+            $st = $1;
+        }
+        if (m/maxrss:\s+(\d+)/) {
+            $mr = $1;
+        }
+        if (m/build:\s+(\d+)/) {
+            $btt = $1;
+        }
+        if (m/search:\s+(\d+)/) {
+            $stt = $1;
+        }
+        if (m/total:\s+(\d+)/) {
+            $ttt = $1;
+        }
+    }
+    close(G);
+
+    # A stats file with no (or a zero) search/total time would make
+    # the divisions below die with "Illegal division by zero" and
+    # abort the whole report; show a ratio of 0 for it instead.
+    my $perSearch = ($stt != 0) ? $ut / $stt : 0;
+    my $perTotal  = ($ttt != 0) ? $ut / $ttt : 0;
+
+    printf("%-50s %5d %5d %5d %5d %5d %9.6f %9.6f %10d\n",
+           $file, $ut, $st, $btt, $stt, $ttt, $perSearch, $perTotal, $mr);
+}
+close(F);
diff --git a/seatac/thr-deadlock.C b/seatac/thr-deadlock.C
new file mode 100644
index 0000000..dc284f1
--- /dev/null
+++ b/seatac/thr-deadlock.C
@@ -0,0 +1,77 @@
+#include "seatac.H"
+
+#ifdef __alpha
+
+// Define this to kill the process with a vengeance instead of
+// gracefully exiting. exit() tries to free memory, and thus gets
+// caught in the deadlock -- but is useful for debugging.
+//
+#define KILL_INSTEAD_OF_EXIT
+
+#ifdef KILL_INSTEAD_OF_EXIT
+#include <signal.h>
+#endif
+
+// Handshake flags between deadlockDetector() and deadlockChecker().
+// The detector sets deadlockTested before, and deadlockPassed after,
+// a trial allocation; the checker resets both.  They are written by
+// one thread and polled by another, so they are declared volatile
+// like the other cross-thread counters in this program.
+//
+volatile uint32 deadlockTested = 0;
+volatile uint32 deadlockPassed = 0;
+
+// Thread body: forever, wait until the checker has reset both
+// flags, then perform a small allocate/free.  deadlockTested is set
+// before the attempt and deadlockPassed after it, so a stuck
+// allocator leaves "tested but not passed" for the checker to find.
+// Never returns.
+//
+void*
+deadlockDetector(void *) {
+
+  fprintf(stderr, "Hello! I'm a deadlockDetector!\n");
+
+  for (;;) {
+
+    // Wait for the deadlock checker to reset things
+    //
+    while ((deadlockTested == 1) || (deadlockPassed == 1))
+      sleep(4);
+
+    deadlockTested = 1;
+
+    char *probe = new char [16];
+    delete [] probe;
+
+    deadlockPassed = 1;
+  }
+
+  return(0L);  //  Never reached
+}
+
+// Thread body: forever, wait for the detector to start a test, give
+// it five more seconds, and abort the process if the test has not
+// completed by then.  Otherwise reset the flags for the next round.
+// Never returns normally.
+//
+void*
+deadlockChecker(void *) {
+
+  fprintf(stderr, "Hello! I'm a deadlockChecker!\n");
+
+  for (;;) {
+
+    // Wait for the tester to test
+    //
+    while (deadlockTested == 0)
+      sleep(5);
+
+    // Give it another five seconds to return
+    //
+    sleep(5);
+
+    if (deadlockPassed == 0) {
+      fprintf(stderr, "\n\n\nESTmapper/search-- Deadlock detected! Aborting the process!\n\n");
+      fflush(stderr);
+#ifdef KILL_INSTEAD_OF_EXIT
+      kill(getpid(), SIGKILL);
+#endif
+      exit(1);
+    }
+
+    //fprintf(stderr, "Deadlock OK\n");
+
+    // Reset the testing/checking flags
+    //
+    deadlockPassed = 0;
+    deadlockTested = 0;
+  }
+
+  return(0L);  //  Never reached
+}
+
+#endif
diff --git a/seatac/thr-loader.C b/seatac/thr-loader.C
new file mode 100644
index 0000000..efeca9a
--- /dev/null
+++ b/seatac/thr-loader.C
@@ -0,0 +1,81 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+
+#include "seatac.H"
+
+// Define this to print a message whenever a sequence is loaded.
+// Useful for testing the loader with large sequences (scaffolds,
+// chromosomes).
+//
+//#define VERBOSE_LOADER
+
+// Format of the warning loaderThread() prints when it raises its
+// high water mark.  The arguments are the new high water mark and
+// the new sleep time in seconds.  Two variants exist because the
+// conversion for the first argument differs when TRUE64BIT is
+// defined.
+//
+#ifdef TRUE64BIT
+char const *loadDesc = "WARNING: Loader ran dry. Increasing limit to %u sequences, decreasing sleep to %f.\n";
+#else
+char const *loadDesc = "WARNING: Loader ran dry. Increasing limit to %lu sequences, decreasing sleep to %f.\n";
+#endif
+
+// Thread body: load query sequences into input[] until all
+// numberOfQueries have been loaded.
+//
+// The loader stays at most _loaderHighWaterMark sequences ahead of
+// the search threads, sleeping when it is that far ahead.  If it
+// wakes up to find the queue (nearly) empty it raises the high water
+// mark by 10% and shortens its sleep by 10%.
+//
+// Exits the process if a sequence cannot be loaded.
+//
+void*
+loaderThread(void *) {
+  uint32               waterLevel = 0;
+  seqInCore           *B          = 0L;
+  bool                 slept      = false;
+
+  while (inputHead < numberOfQueries) {
+
+    // We fill the input as fast as we can, up to the high water
+    // mark, then we take a little snooze to let the workers catch up.
+    //
+    pthread_mutex_lock(&inputTailMutex);
+    waterLevel = inputHead - inputTail;
+    pthread_mutex_unlock(&inputTailMutex);
+
+    // Warn if we're too small.
+    //
+    if ((slept) && (waterLevel <= 1)) {
+
+      uint32 i = (uint32) (0.1 * config._loaderHighWaterMark);
+      if (i == 0)
+        i = 1;
+      config._loaderHighWaterMark += i;
+
+      config.setTime(&config._loaderSleep,
+                     0.9 * ((double)config._loaderSleep.tv_sec + (double)config._loaderSleep.tv_nsec * 1e-9));
+
+      if (config._loaderWarnings)
+        fprintf(stderr, loadDesc,
+                config._loaderHighWaterMark,
+                ((double)config._loaderSleep.tv_sec + (double)config._loaderSleep.tv_nsec * 1e-9));
+    }
+
+    // Sleep, if we need to, otherwise, get the next sequence and
+    // push it onto the input list at inputHead.  The search thread
+    // that takes the sequence is responsible for deleting it.
+    //
+    if (waterLevel >= config._loaderHighWaterMark) {
+      slept = true;
+      nanosleep(&config._loaderSleep, 0L);
+    } else {
+      slept = false;
+
+#ifdef VERBOSE_LOADER
+      fprintf(stderr, "Loading sequence %u (tail = %u)\n", inputHead, inputTail);
+#endif
+
+      try {
+        B = qsFASTA->getSequenceInCore();
+      } catch (std::bad_alloc const &) {
+        fprintf(stderr, "loaderThread()-- Failed to load next query sequence\ncaught bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+        exit(1);
+      }
+
+      // Search threads treat a null slot as "not loaded yet" and
+      // retry it forever, so publishing a null sequence would hang
+      // the whole run.  Fail loudly instead.
+      //
+      if (B == 0L) {
+        fprintf(stderr, "loaderThread()-- Failed to load next query sequence\nno sequence returned in %s at line %d\n", __FILE__, __LINE__);
+        exit(1);
+      }
+
+      pthread_mutex_lock(&inputTailMutex);
+      input[inputHead] = B;
+      inputHead++;
+      pthread_mutex_unlock(&inputTailMutex);
+    }
+  }
+
+  return(0L);
+}
diff --git a/seatac/thr-search.C b/seatac/thr-search.C
new file mode 100644
index 0000000..cd9602f
--- /dev/null
+++ b/seatac/thr-search.C
@@ -0,0 +1,173 @@
+#include "seatac.H"
+
+// Format of the per-thread summary written by searchThread().  The
+// arguments are: the thread index (long), three 64-bit unsigned
+// counters (computed, blocked on input, blocked on output), and
+// three doubles (encode, search and process time).
+//
+// The spaces between the literals and the format macros are
+// required: since C++11 a macro name glued to a string literal is
+// parsed as a user-defined literal suffix.
+//
+char const *srchGbye = "[%ld] computed: " uint64FMTW(8) " blocked: " uint64FMTW(4) "/" uint64FMTW(4) " encodeTime: %7.2f searchTime: %7.2f processTime: %7.2f\n";
+
+// Per-thread scratch space for searching: a reusable buffer for the
+// positions returned by the position table, plus running totals of
+// the time spent in each phase.
+//
+class searcherState {
+public:
+  uint64   posnMax;      //  Entries allocated in posn
+  uint64   posnLen;      //  Entries in use in posn
+  uint64  *posn;
+
+  double   encodeTime;
+  double   maskTime;
+  double   searchTime;
+  double   processTime;
+
+  //  Start with room for 16384 positions and all timers at zero.
+  //  (Members are initialized in declaration order, so posnMax is
+  //  set before it sizes the array.)
+  //
+  searcherState() :
+    posnMax(16384),
+    posnLen(0),
+    posn(new uint64 [ posnMax ]),
+    encodeTime(0.0),
+    maskTime(0.0),
+    searchTime(0.0),
+    processTime(0.0) {
+  };
+
+  ~searcherState() {
+    delete [] posn;
+  };
+};
+
+
+// Search one query sequence in one direction.
+//
+//   state - this thread's scratch buffer and timers (updated)
+//   seq   - the query sequence
+//   idx   - index of the query, handed to the hitMatrix
+//   rc    - true to search the reverse direction; hits are then
+//           reported with direction 'r', otherwise 'f'
+//   FO    - receives the matches
+//
+// The time spent encoding, searching and processing is added to the
+// matching counters in 'state'.
+//
+void
+doSearch(searcherState       *state,
+         seqInCore           *seq,
+         uint32               idx,
+         bool                 rc,
+         filterObj           *FO) {
+  double  phaseStart = 0.0;
+
+  // Build and mask the query
+  //
+  phaseStart = getTime();
+  encodedQuery *query = new encodedQuery(seq->sequence(), seq->sequenceLength(), config._merSize, rc);
+  state->encodeTime += getTime() - phaseStart;
+
+  // Get the hits
+  //
+  phaseStart = getTime();
+  hitMatrix *matrix = new hitMatrix(seq->sequenceLength(), idx);
+
+  uint64  mer   = uint64ZERO;
+  uint32  pos   = uint32ZERO;
+  uint64  count = 0;
+
+  while (query->getMer(mer, pos) == true) {
+    bool found = positions->getExact(mer, state->posn, state->posnMax, state->posnLen, count);
+
+    if (found)
+      matrix->addHits(pos, state->posn, state->posnLen);
+  }
+
+  state->searchTime += getTime() - phaseStart;
+
+  // Begin processing
+  //
+  char const direction = (rc) ? 'r' : 'f';
+
+  phaseStart = getTime();
+  matrix->processMatrix(direction, FO);
+  state->processTime += getTime() - phaseStart;
+
+  delete matrix;
+  delete query;
+}
+
+
+
+// Thread body for one search worker.  'U' carries the thread index
+// (an integer cast to a pointer by main()).
+//
+// Repeatedly claims the next loaded query from input[], searches it
+// in the configured directions, filters the hits and publishes the
+// result in output[idx] for the writer.  When all queries have been
+// claimed, a summary line is left in threadStats[index].
+//
+void*
+searchThread(void *U) {
+  long const     tid         = (long)U;
+  size_t const   statsLength = 1025;
+
+  uint32         idx      = 0;
+  seqInCore     *seq      = 0L;
+  uint32         blockedI = 0;
+  uint32         blockedO = 0;
+  uint32         computed = 0;
+
+  searcherState *state    = new searcherState;
+
+  // Allocate and fill out the thread stats -- this ensures that we
+  // always have stats (even if they're bogus).
+  //
+  // srchGbye formats its three counters as 64-bit values, so they
+  // must be passed as 64-bit values.
+  //
+  threadStats[tid] = new char [statsLength];
+  snprintf(threadStats[tid], statsLength, srchGbye,
+           tid,
+           (uint64)0, (uint64)0, (uint64)0,
+           0.0, 0.0, 0.0);
+
+  while (inputTail < numberOfQueries) {
+
+    // Grab the next sequence.
+    //
+    pthread_mutex_lock(&inputTailMutex);
+    idx = inputTail;
+    if (idx < numberOfQueries) {
+      seq = input[idx];
+      input[idx] = 0L;
+      if (seq)
+        inputTail++;
+    }
+    pthread_mutex_unlock(&inputTailMutex);
+
+    // Still need to check that the index is valid.  Another thread
+    // could (and does) steal execution between the while and the
+    // mutex lock.
+    //
+    if (idx < numberOfQueries) {
+
+      // If there is no sequence, oh boy, we are in bad shape.  Sleep a
+      // little bit to let the loader catch up, then try again.
+      //
+      if (seq == 0L) {
+        //if (config._loaderWarnings)
+        //  fprintf(stderr, "%lu Blocked by input.\n", (uint64)U);
+        blockedI++;
+        nanosleep(&config._searchSleep, 0L);
+      } else {
+
+        // If our idx is too far away from the output thread, sleep
+        // a little bit.  We keep the idx and seq that we have obtained,
+        // though.
+        //
+        while (idx > (outputPos + config._writerHighWaterMark)) {
+          if (config._writerWarnings)
+            fprintf(stderr, uint64FMT " Blocked by output (idx = " uint32FMT ", outputPos = " uint32FMT ").\n", (uint64)tid, idx, (uint32)outputPos);
+          blockedO++;
+          nanosleep(&config._searchSleep, 0L);
+        }
+
+        // Construct a filter object
+        //
+        filterObj *FO = new filterObj(config._filterObj, config._filteropts);
+
+        // Do searches.
+        //
+        if (config._doForward)
+          doSearch(state, seq, idx, false, FO);
+        if (config._doReverse)
+          doSearch(state, seq, idx, true, FO);
+
+        // Do filtering.
+        //
+        FO->filter();
+
+        // Signal that we are done.
+        //
+        output[idx] = FO;
+        computed++;
+
+        delete seq;
+
+      } // end of seq != 0L
+    } // end of idx < numberOfQueries
+  } // end of inputTail < numberOfQueries
+
+
+  // OK, now fill out the real thread stats
+  //
+  snprintf(threadStats[tid], statsLength, srchGbye,
+           tid,
+           (uint64)computed, (uint64)blockedI, (uint64)blockedO,
+           state->encodeTime,
+           state->searchTime,
+           state->processTime);
+
+  delete state;
+
+  return(0L);
+}
diff --git a/sim4db/Make.include b/sim4db/Make.include
new file mode 100644
index 0000000..8aef26a
--- /dev/null
+++ b/sim4db/Make.include
@@ -0,0 +1,17 @@
+# -*- makefile -*-
+#
+# Build rules for sim4db.
+# NOTE(review): '$/' appears to be the path of this directory, set
+# by the including makefile -- confirm against Make.rules.
+
+# Absolute paths (with trailing slash) of the libraries we use.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+
+# One C++ source file, one executable.
+$/.CXX_SRCS := $/sim4th.C
+$/.CXX_EXES := $/sim4db
+
+$/.CLEAN := $/*.o
+
+$/sim4db: $/sim4th.o
+
+# The executable also depends on the four static libraries.
+${$/.CXX_EXES}: ${LIBSIM4/}libsim4.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Add the library directories to the include path for objects and
+# dependency files in this directory.
+$(eval $/%.d $/%.o: CXXFLAGS+= -I${LIBSIM4/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBUTL/})
diff --git a/sim4db/sim4th.C b/sim4db/sim4th.C
new file mode 100644
index 0000000..085bb4d
--- /dev/null
+++ b/sim4db/sim4th.C
@@ -0,0 +1,601 @@
+// This file is part of sim4db.
+// Copyright (c) 2005 Brian Walenz
+// Author: Brian Walenz
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received (LICENSE.txt) a copy of the GNU General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#include <signal.h>
+#include <math.h>
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "bio++.H"
+#include "sim4.H"
+#include "sweatShop.H"
+
+// XXX Both loader and loaderAll leave the last gen sequence undeleted!
+
+// File-scope state shared by main(), the loader functions and writer().
+
+// Script input; created in main() only when a script file name was given,
+// and read by getNextScript().
+readBuffer *scriptFile = 0L;
+
+// Sequence sources opened in main(): genomic (GENs) and cDNA (ESTs).
+seqCache *GENs = 0L;
+seqCache *ESTs = 0L;
+
+// Loader bookkeeping. ~uint32ZERO marks "nothing loaded yet"; the loaders
+// test for that value on their first call. lastGENseq is the genomic
+// sequence most recently fetched by loader() / loaderAll().
+uint32 lastGENiid = ~uint32ZERO;
+uint32 lastESTiid = ~uint32ZERO;
+seqInCore *lastGENseq = 0L;
+
+// File descriptors returned by openOutputFile() for the polish output and
+// for the optional yes/no file.
+int fOutput = 0;
+int fYesNo = 0;
+
+// File names taken from the command line in main(); 0L when not supplied.
+// NOTE(review): touchFileName is set by main() but not otherwise used in this file.
+char *cdnaFileName = 0L;
+char *scriptFileName = 0L;
+char *databaseFileName = 0L;
+char *outputFileName = 0L;
+char *yesnoFileName = 0L;
+char *touchFileName = 0L;
+
+// When true (and no script is given), main() uses loaderPairwise.
+bool pairwise = false;
+
+// beVerbose is passed to sweatShop::run().
+// NOTE(review): beYesNo is never read or written elsewhere in this file.
+bool beVerbose = false;
+bool beYesNo = false;
+
+// Number of workers given to the sweatShop, and the cache size handed to the
+// cDNA seqCache constructor.
+uint32 numThreads = 2;
+uint32 loaderCacheSize = 1024;
+
+// Alignment parameters, filled in by main() and used by worker() and writer().
+sim4parameters sim4params;
+
+// Read the next non-blank line from the global scriptFile and decode it into
+// the pieces needed to build a sim4command object.
+//
+// A script line looks like
+//
+// [-f|-r] -e ESTid -D GENid GENlo GENhi
+//
+// -f Forward only
+// -r Reverse only
+// -D genSeqIID genLo genHi
+// -e estSeqIID
+//
+// Only the second character of each word is examined; unknown words are
+// silently ignored. The output arguments are modified only for the options
+// that actually appear on the line, so callers must preset their defaults.
+//
+// Returns the raw line in storage obtained with new char[] (the caller owns
+// it and must release it with delete []), or 0L when the script is exhausted.
+//
+// The line buffer grows as needed; earlier versions used a fixed 128-byte
+// buffer and overflowed it on longer lines. An option whose values are
+// missing is reported on stderr and skipped instead of reading past the end
+// of the word list.
+//
+char*
+getNextScript(uint32 &ESTiid,
+              uint32 &GENiid, uint32 &GENlo, uint32 &GENhi,
+              bool &doForward,
+              bool &doReverse) {
+
+  char x = scriptFile->read();
+
+  // Skip any white space in the file
+  //
+  while ((scriptFile->eof() == false) && (whitespaceSymbol[x]))
+    x = scriptFile->read();
+
+  // Exit if we're all done.
+  //
+  if (scriptFile->eof())
+    return(0L);
+
+  uint32 linePos = 0;
+  uint32 lineMax = 128;
+  char *line = new char [lineMax];
+
+  // Copy the line from the readBuffer into our storage, doubling the
+  // storage whenever the next character plus the terminating NUL would
+  // not fit.
+  //
+  while ((scriptFile->eof() == false) && (x != '\n')) {
+    if (linePos + 1 >= lineMax) {
+      uint32 biggerMax = lineMax * 2;
+      char *bigger = new char [biggerMax];
+
+      memcpy(bigger, line, linePos);
+      delete [] line;
+
+      line = bigger;
+      lineMax = biggerMax;
+    }
+
+    line[linePos++] = x;
+    x = scriptFile->read();
+  }
+  line[linePos] = 0;
+
+  // Decode the line
+  //
+  // The checks below rely on getWord() returning 0L for the index just past
+  // the last word, exactly as the loop condition already does. The &&
+  // chains stop at the first missing word, so no index beyond that one is
+  // ever requested.
+  //
+  uint32 argWords = 0;
+  splitToWords words(line);
+
+  while (words.getWord(argWords)) {
+    switch (words.getWord(argWords)[1]) {
+      case 'f':
+        doForward = true;
+        doReverse = false;
+        break;
+      case 'r':
+        doForward = false;
+        doReverse = true;
+        break;
+      case 'D':
+        if (words.getWord(argWords + 1) &&
+            words.getWord(argWords + 2) &&
+            words.getWord(argWords + 3)) {
+          GENiid = strtouint32(words.getWord(++argWords), 0L);
+          GENlo = strtouint32(words.getWord(++argWords), 0L);
+          GENhi = strtouint32(words.getWord(++argWords), 0L);
+        } else {
+          fprintf(stderr, "getNextScript()-- '%s' needs three values in script line '%s'; ignored.\n",
+                  words.getWord(argWords), line);
+        }
+        break;
+      case 'e':
+        if (words.getWord(argWords + 1)) {
+          ESTiid = strtouint32(words.getWord(++argWords), 0L);
+        } else {
+          fprintf(stderr, "getNextScript()-- '%s' needs a value in script line '%s'; ignored.\n",
+                  words.getWord(argWords), line);
+        }
+        break;
+      default:
+        //fprintf(stderr, "Unknown option '%s'\n", words.getWord(argWords));
+        break;
+    }
+
+    argWords++;
+  }
+
+  return(line);
+}
+
+
+
+
+// One unit of work passed from a loader, through worker(), to writer().
+//
+// input      the alignment command built by the loader
+// script     the script line that produced this work (0L outside script mode)
+// output     the polishes produced by worker()
+// gendelete  a genomic sequence for writer() to delete, if any
+// estdelete  the cDNA copy for writer() to delete
+//
+class sim4thWork {
+public:
+  sim4command    *input;
+  char           *script;
+  sim4polishList *output;
+  seqInCore      *gendelete;
+  seqInCore      *estdelete;
+
+  // Everything starts out empty.
+  sim4thWork() :
+    input(0L),
+    script(0L),
+    output(0L),
+    gendelete(0L),
+    estdelete(0L) {
+  }
+};
+
+
+// Script-driven loader: turn the next script line into a sim4thWork.
+// Returns 0L when the script has no more lines. The argument is unused.
+//
+void*
+loader(void *U) {
+  uint32 estID = 0;
+  uint32 genID = 0;
+  uint32 genBgn = 0;
+  uint32 genEnd = 0;
+  bool fwd = true;
+  bool rev = true;
+
+  sim4thWork *work = new sim4thWork();
+
+  work->script = getNextScript(estID, genID, genBgn, genEnd, fwd, rev);
+
+  // No more script lines: nothing to hand out.
+  //
+  if (work->script == 0L) {
+    delete work;
+    return(0L);
+  }
+
+  // Reuse the genomic sequence from the previous call when the script asks
+  // for the same one again.
+  //
+  seqInCore *genSeq = lastGENseq;
+
+  if (lastGENiid != genID) {
+
+    // A different genomic sequence is wanted, so register the old one for
+    // deletion. Technically, we're deleting it on the state AFTER it's
+    // used, but we can't guarantee that the earlier state is still around.
+    // The writer does the delete, so by the time it gets to this work it
+    // has already written everyone that used the old sequence, which kind
+    // of implies that everyone that needs it is already computed.
+    //
+    work->gendelete = lastGENseq;
+
+    genSeq = GENs->getSequenceInCore(genID);
+
+    lastGENiid = genID;
+    lastGENseq = genSeq;
+  }
+
+  // The cache can, and does, overwrite the EST sequence we care about, so
+  // work on a private copy and let the writer delete it.
+  //
+  seqInCore *estSeq = ESTs->getSequenceInCore(estID)->copy();
+
+  work->estdelete = estSeq;
+  work->input = new sim4command(estSeq, genSeq, genBgn, genEnd, fwd, rev);
+
+  return(work);
+}
+
+
+// Pairwise loader: align cDNA i to genomic i, for i = 0, 1, 2, ...
+// Returns 0L as soon as either input runs out of sequences.
+//
+void*
+loaderPairwise(void *) {
+
+  // Both counters hold ~uint32ZERO until the first call.
+  //
+  if (lastGENiid == ~uint32ZERO)
+    lastGENiid = 0;
+  if (lastESTiid == ~uint32ZERO)
+    lastESTiid = 0;
+
+  // Keep going only while both inputs still have a sequence left.
+  //
+  if (!((lastGENiid < GENs->getNumberOfSequences()) &&
+        (lastESTiid < ESTs->getNumberOfSequences())))
+    return(0L);
+
+  sim4thWork *work = new sim4thWork();
+
+  // Fetch the next genomic sequence, and a private copy of the next cDNA.
+  // Both are registered on the work so the writer deletes them.
+  //
+  seqInCore *gen = GENs->getSequenceInCore(lastGENiid++);
+  seqInCore *est = ESTs->getSequenceInCore(lastESTiid++)->copy();
+
+  work->gendelete = gen;
+  work->estdelete = est;
+
+  // Align against the whole genomic sequence, in both orientations.
+  //
+  work->input = new sim4command(est,
+                                gen, 0, gen->sequenceLength(),
+                                true, true);
+
+  return(work);
+}
+
+
+// All-vs-all loader: for each genomic sequence in turn, hand out one work
+// item per cDNA sequence. Returns 0L once the genomic sequences are
+// exhausted. State is kept in the globals lastGENiid, lastESTiid and
+// lastGENseq.
+//
+void*
+loaderAll(void *) {
+
+ sim4thWork *p = new sim4thWork();
+
+ // Previous implementations "Ping-pong'd" through the ESTs. The
+ // idea being we would use the cache on the ends. We can't easily
+ // do that here, so we always go forward.
+
+ // Flip around the end, if needed.
+ // This is also taken on the very first call, because lastESTiid starts
+ // at ~uint32ZERO. The genomic sequence just finished (0L the first time)
+ // is handed to this work item so the writer deletes it.
+ if (lastESTiid >= ESTs->getNumberOfSequences()) {
+ lastESTiid = 0;
+ p->gendelete = lastGENseq;
+ lastGENseq = 0L;
+
+ if (lastGENiid == ~uint32ZERO) // happens on the first time through
+ lastGENiid = 0;
+ else
+ lastGENiid++;
+ }
+
+ // If we've run out of sequences, we're done!
+ // Only p itself is deleted here; a sequence stored in p->gendelete by the
+ // flip above is not deleted (see the XXX note near the top of the file).
+ if (lastGENiid >= GENs->getNumberOfSequences()) {
+ delete p;
+ return(0L);
+ }
+
+ // Update the genomic sequence?
+ // lastGENseq is 0L right after a flip, so this loads the next one.
+ if (lastGENseq == 0L) {
+ lastGENseq = GENs->getSequenceInCore(lastGENiid);
+ }
+
+ // Grab the EST sequence
+ // A copy is taken and registered in estdelete for the writer to delete.
+ p->estdelete = ESTs->getSequenceInCore(lastESTiid++)->copy();
+
+ // build the command
+ // The whole genomic sequence is searched, with both direction flags true.
+ p->input = new sim4command(p->estdelete,
+ lastGENseq, 0, lastGENseq->sequenceLength(),
+ true, true);
+
+ return(p);
+}
+
+
+
+
+// Compute one work item: run a fresh Sim4 object, configured from the
+// global sim4params, on the item's input and store the resulting polish
+// list in the item's output. U and T are unused.
+//
+void
+worker(void *U, void *T, void *S) {
+  sim4thWork *work = (sim4thWork *)S;
+  Sim4 *aligner = new Sim4(&sim4params);
+
+  work->output = aligner->run(work->input);
+
+  delete aligner;
+}
+
+
+// Write all len bytes of buf to fd. Short writes are continued and writes
+// interrupted by a signal (EINTR) are retried. On any other failure an
+// error naming the file ('what' describes it, 'name' is its path) is printed
+// and the program exits with status 1.
+//
+static
+void
+writeOrDie(int fd, const char *buf, size_t len, const char *what, const char *name) {
+  while (len > 0) {
+    ssize_t n = write(fd, buf, len);
+
+    if (n <= 0) {
+      if ((n < 0) && (errno == EINTR))
+        continue;
+
+      fprintf(stderr, "Couldn't write the %s file '%s': %s\n",
+              what, name, (n < 0) ? strerror(errno) : "no bytes written");
+      exit(1);
+    }
+
+    buf += n;
+    len -= (size_t)n;
+  }
+}
+
+
+// Output stage: write every polish of a finished work item to fOutput,
+// optionally append a yes/no line to fYesNo, then release everything the
+// work item owns (including any sequences registered for deletion) and the
+// item itself. U is unused.
+//
+// Write failures are detected from the return value of write(), not from
+// errno, and short writes are completed. The yes/no line is built in a
+// buffer sized from the script line, so long script lines cannot overflow
+// it, and a missing script (the loaders that do not read a script leave it
+// 0L) is written as an empty string instead of being passed to %s.
+//
+void
+writer(void *U, void *S) {
+  sim4thWork *p = (sim4thWork *)S;
+
+  sim4polishList &L4 = *(p->output);
+
+  for (uint32 i=0; L4[i]; i++) {
+    char *o = L4[i]->s4p_polishToString(sim4params.getOutputFormat());
+
+    writeOrDie(fOutput, o, strlen(o), "output", outputFileName);
+
+    delete [] o;
+  }
+
+  if (yesnoFileName) {
+    const char *script = (p->script) ? p->script : "";
+
+    // 64 spare bytes cover " -Y ", two numbers, a space, the newline and
+    // the NUL; snprintf() truncates rather than overflows in any case.
+    size_t strMax = strlen(script) + 64;
+    char *str = new char [strMax];
+
+    if (L4[0])
+      snprintf(str, strMax, "%s -Y " uint32FMT " " uint32FMT "\n",
+               script, L4[0]->_percentIdentity, L4[0]->_querySeqIdentity);
+    else
+      snprintf(str, strMax, "%s -N 0 0\n", script);
+
+    writeOrDie(fYesNo, str, strlen(str), "yes/no", yesnoFileName);
+
+    delete [] str;
+  }
+
+  // Release this compute
+
+  delete p->input;
+  delete [] p->script;
+  delete p->output;
+  delete p->gendelete;
+  delete p->estdelete;
+  delete p;
+}
+
+
+
+// Open 'name' for writing and return its file descriptor.
+//
+// name == 0L  no file was requested; returns 0 without opening anything.
+//             Callers must not write to that descriptor.
+// name == "-" returns the descriptor of stdout.
+// otherwise   the file is created or truncated, with read/write permission
+//             for user, group and other (subject to the umask).
+//
+// If the file cannot be opened, an error is printed and the program exits
+// with status 1. Failure is detected from the value returned by open();
+// errno is consulted only to describe the failure.
+//
+int
+openOutputFile(char *name) {
+
+  if (name == 0L)
+    return(0);
+
+  if (strcmp(name, "-") == 0)
+    return(fileno(stdout));
+
+  int f = open(name,
+               O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+               S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+
+  if (f < 0)
+    fprintf(stderr, "Couldn't open the output file '%s': %s\n", name, strerror(errno)), exit(1);
+
+  return(f);
+}
+
+
+
+// Return the value that follows option argv[arg], advancing arg onto it.
+// If the option is the last word on the command line, report it and exit
+// with status 1 instead of handing a null pointer to atoi(), atof() or a
+// parameter setter.
+//
+static
+char *
+optionValue(int argc, char **argv, int &arg) {
+  if (arg + 1 >= argc) {
+    fprintf(stderr, "Option '%s' requires a value.\n", argv[arg]);
+    exit(1);
+  }
+
+  return(argv[++arg]);
+}
+
+
+// sim4db driver. Parses the command line into sim4params and the file-scope
+// settings, opens the inputs and outputs, then runs a sweatShop with the
+// loader chosen by the options (script, pairwise, or all-vs-all), worker()
+// and writer().
+//
+// Options are matched on a prefix (see the strncmp() lengths), in the order
+// tested below. -genomic, -cdna and -o are required; if one is missing, or
+// an unknown option was seen, the usage text is printed and the exit status
+// is 1.
+//
+// NOTE(review): -touch is parsed into touchFileName but nothing in this
+// function creates that file, and the usage text mentions -V, which is not
+// handled here.
+//
+int
+main(int argc, char **argv) {
+
+  int arg = 1;
+  int err = 0;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-alignments", 4) == 0) {
+      sim4params.setPrintAlignments(true);
+
+    } else if (strncmp(argv[arg], "-alwaysprint", 4) == 0) {
+      sim4params.setFindAllExons(true);
+      sim4params.setAlwaysReport(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-cdna", 3) == 0) {
+      cdnaFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-cut", 3) == 0) {
+      double x = atof(optionValue(argc, argv, arg));
+      if (x < 0.0) {
+        fprintf(stderr, "WARNING: -cut adjusted to 0.0 (you gave %f)!\n", x);
+        x = 0.0;
+      }
+      if (x > 1.0) {
+        fprintf(stderr, "WARNING: -cut adjusted to 1.0 (you gave %f)!\n", x);
+        x = 1.0;
+      }
+      sim4params.setPolyTailPercent(x);
+
+    } else if (strncmp(argv[arg], "-genomic", 4) == 0) {
+      databaseFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-minc", 5) == 0) {
+      sim4params.setFindAllExons(true);
+      sim4params.setMinCoverage(atoi(optionValue(argc, argv, arg)) / 100.0);
+
+    } else if (strncmp(argv[arg], "-mini", 5) == 0) {
+      sim4params.setFindAllExons(true);
+      sim4params.setMinPercentExonIdentity(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-minl", 5) == 0) {
+      sim4params.setFindAllExons(true);
+      sim4params.setMinCoverageLength(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-nod", 4) == 0) {
+      sim4params.setIncludeDefLine(false);
+
+    } else if (strncmp(argv[arg], "-non", 4) == 0) {
+      sim4params.setDontForceCanonicalSplicing(true);
+
+    } else if (strncmp(argv[arg], "-f", 2) == 0) {
+      sim4params.setForceStrandPrediction(true);
+
+    } else if (strncmp(argv[arg], "-o", 2) == 0) {
+      outputFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-po", 3) == 0) {
+      sim4params.setIgnorePolyTails(false);
+
+    } else if (strncmp(argv[arg], "-sc", 3) == 0) {
+      scriptFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-sp", 3) == 0) {
+      sim4params.setSpliceModel(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-pa", 3) == 0) {
+      pairwise = true;
+
+    } else if (strncmp(argv[arg], "-to", 3) == 0) {
+      touchFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-verbose", 2) == 0) {
+      beVerbose = true;
+
+    } else if (strncmp(argv[arg], "-YN", 3) == 0) {
+      yesnoFileName = optionValue(argc, argv, arg);
+
+    } else if (strncmp(argv[arg], "-threads", 3) == 0) {
+      numThreads = strtouint32(optionValue(argc, argv, arg), 0L);
+
+    } else if (strncmp(argv[arg], "-H", 2) == 0) {
+      sim4params.setRelinkWeight(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-K", 2) == 0) {
+      sim4params.setMSPThreshold1(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-C", 2) == 0) {
+      sim4params.setMSPThreshold2(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-Z", 2) == 0) {
+      sim4params.setSpacedSeed(optionValue(argc, argv, arg));
+
+    } else if (strncmp(argv[arg], "-Ma", 3) == 0) {
+      sim4params.setMSPLimitAbsolute(atoi(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-Mp", 3) == 0) {
+      sim4params.setMSPLimitPercent(atof(optionValue(argc, argv, arg)));
+
+    } else if (strncmp(argv[arg], "-interspecies", 2) == 0) {
+      sim4params.setInterspecies(true);
+
+    } else if (strcmp(argv[arg], "-gff3") == 0) {
+      sim4params.setOutputFormat(S4P_POLISH_GFF3);
+
+    } else {
+      fprintf(stderr, "Unknown option '%s'.\n", argv[arg]);
+      err++;
+    }
+
+    arg++;
+  }
+
+  if ((err) ||
+      (cdnaFileName == 0L) ||
+      (databaseFileName == 0L) ||
+      (outputFileName == 0L)) {
+    fprintf(stderr, "usage: %s -genomic g.fasta -cdna c.fasta -output o.sim4db [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -v print status to stderr while running\n");
+    fprintf(stderr, " -V print script lines (stderr) as they are processed\n");
+    fprintf(stderr, " -YN print script lines (to given file) as they are processed, annotated with yes/no\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -cdna use these cDNA sequences\n");
+    fprintf(stderr, " -genomic use these genomic sequences\n");
+    fprintf(stderr, " -script use this script file\n");
+    fprintf(stderr, " -pairwise do pairs of sequences\n");
+    fprintf(stderr, " -output write output to this file\n");
+    fprintf(stderr, " -touch create this file when the program finishes execution\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -threads Use n threads.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -mincoverage iteratively find all exon models with the specified\n");
+    fprintf(stderr, " minimum PERCENT COVERAGE\n");
+    fprintf(stderr, " -minidentity iteratively find all exon models with the specified\n");
+    fprintf(stderr, " minimum PERCENT EXON IDENTITY\n");
+    fprintf(stderr, " -minlength iteratively find all exon models with the specified\n");
+    fprintf(stderr, " minimum ABSOLUTE COVERAGE (number of bp matched)\n");
+    fprintf(stderr, " -alwaysreport always report <number> exon models, even if they\n");
+    fprintf(stderr, " are below the quality thresholds\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " If no mincoverage or minidentity or minlength is given, only\n");
+    fprintf(stderr, " the best exon model is returned.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " You will probably want to specify ALL THREE of mincoverage,\n");
+    fprintf(stderr, " minidentity and minlength! Don't assume the default values\n");
+    fprintf(stderr, " are what you want!\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " You will DEFINITELY want to specify at least one of mincoverage,\n");
+    fprintf(stderr, " minidentity and minlength with alwaysreport! If you don't, mincoverage\n");
+    fprintf(stderr, " will be set to 90 and minidentity to 95 -- to reduce the number of\n");
+    fprintf(stderr, " spurious matches when a good match is found.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -nodeflines don't include the defline in the output\n");
+    fprintf(stderr, " -alignments print alignments\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -polytails DON'T mask poly-A and poly-T tails.\n");
+    fprintf(stderr, " -cut Trim marginal exons if A/T %% > x (poly-AT tails)\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -noncanonical Don't force canonical splice sites\n");
+    fprintf(stderr, " -splicemodel Use the following splice model: 0 - original sim4;\n");
+    fprintf(stderr, " 1 - GeneSplicer; 2 - Glimmer (default: 0)\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -forcestrand Force the strand prediction to always be\n");
+    fprintf(stderr, " 'forward' or 'reverse'\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -interspecies Use sim4cc for inter-species alignments\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " The following are for use only by immortals.\n");
+    fprintf(stderr, " -Z set the (spaced) seed pattern\n");
+    fprintf(stderr, " -H set the relink weight factor\n");
+    fprintf(stderr, " -K set the first MSP threshold\n");
+    fprintf(stderr, " -C set the second MSP threshold\n");
+    fprintf(stderr, " -Ma set the limit of the number of MSPs allowed\n");
+    fprintf(stderr, " -Mp same, as percentage of bases in cDNA\n");
+    fprintf(stderr, " NOTE: If used, both -Ma and -Mp must be specified!\n");
+    exit(1);
+  }
+
+  // Open input files
+  //
+  GENs = new seqCache(databaseFileName);
+  ESTs = new seqCache(cdnaFileName, loaderCacheSize, false);
+
+  // Open the output file
+  fOutput = openOutputFile(outputFileName);
+  fYesNo = openOutputFile(yesnoFileName);
+
+  sweatShop *ss = 0L;
+
+  err = sim4params.setSpliceMutex();
+  if (err) {
+    fprintf(stderr, "sim4th::main()-- Failed to initialize splice mutex: %s.\n", strerror(err));
+    exit(1);
+  }
+
+
+  // If we have a script, read work from there, otherwise,
+  // do an all-vs-all.
+  //
+  if (scriptFileName) {
+    scriptFile = new readBuffer(scriptFileName);
+    ss = new sweatShop(loader,
+                       worker,
+                       writer);
+  } else if (pairwise) {
+    ss = new sweatShop(loaderPairwise,
+                       worker,
+                       writer);
+  } else {
+    ss = new sweatShop(loaderAll,
+                       worker,
+                       writer);
+  }
+
+  ss->setNumberOfWorkers(numThreads);
+  ss->run(0L, beVerbose);
+
+  delete ss;
+
+  // Only close the file if it isn't stdout
+  //
+  if (strcmp(outputFileName, "-") != 0)
+    close(fOutput);
+
+  if (yesnoFileName)
+    close(fYesNo);
+
+  delete scriptFile;
+
+  delete ESTs;
+  delete GENs;
+
+  exit(0);
+}
diff --git a/sim4dbutils/LOG b/sim4dbutils/LOG
new file mode 100644
index 0000000..0ab9490
--- /dev/null
+++ b/sim4dbutils/LOG
@@ -0,0 +1,186 @@
+
+After trimming:
+
+dsc154p:/home/walenzbp/projects/sim4reader> perl ../../dbEST-20020331/intronstats.pl < trimmed.out
+
+int: 206
+sma: 195(First:16,Last:23,oneF:92,oneL:64,oneB:0)
+big 1319(First:330,Last:346,One:643)
+tot 4297418
+ff=4,fc=12 lf=12,lc=11
+
+
+sim4begin
+3529019[872-0-0] 1363[2586779-3425929] <538-0-96-forward-forward>
+edef=>CRA|162000089143028 /altid=gi|11947691 /dataset=dbest /taxon=9606 /org=Homo sapiens /date=12/21/2000 /altid=gb_acc|BF673796.1 /organ=prosta
+te /tissue_type= /length=872 /clone_end=5' /def=602135941F1 NIH_MGC_83 Homo sapiens cDNA clone IMAGE:4272299 5', mRNA sequence.
+ddef=>CRA|GA_x54KRE8RWM2:1..4526107 /organism=Homo sapiens /order=1 /ga_uid=181000064531106 /len=4526107
+1-53 (2001-2053) <53-0-100> ->
+54-136 (2322-2404) <83-0-100> ->
+137-281 (2499-2643) <144-0-99> ->
+282-337 (3107-3162) <54-0-96> ->
+338-507 (3337-3509) <167-0-95> ->
+508-553 (822566-822611) <37-0-77>
+gcttcttcctctttctcgactccatcttcgcggtagctgggaccgccgttcag
+gcttcttcctctttctcgactccatcttcgcggtagctgggaccgccgttcag
+tcgccaatatgcagctctttgtccgcgcccaggagctacacaccttcgaggtgaccggccaggaaacggtcgcccagatcaag
+tcgccaatatgcagctctttgtccgcgcccaggagctacacaccttcgaggtgaccggccaggaaacggtcgcccagatcaag
+gctcatgtagcctcactggagggcattgccccggaagatcaagtcgtgctcctggcaggcgcgcccctggaggatgaggccactctgggccagtgcggggtggaggccctgaTtaccctggaagtagcaggccgcatgcttggag
+gctcatgtagcctcactggagggcattgccccggaagatcaagtcgtgctcctggcaggcgcgcccctggaggatgaggccactctgggccagtgcggggtggaggccctgaCtaccctggaagtagcaggccgcatgcttggag
+gtaaagtccatggttccctggcccgtgcCTgaaaagtgagaggtcagactcctaag
+gtaaagtccatggttccctggcccgtgcTGgaaaagtgagaggtcagactcctaag
+gtggccaaacaggagaagaagaCgaagaagacaggtcgggctaagcgg-ggatgcagtacaaccggcgcttGtgtcaacgtGgtgcccacctttggcaagaagaagggccccaatgccaactcttaagtcttt-gtaattctggc
+tt-ctctaataaaaaagc-acttagttca
+gtggccaaacaggagaagaagaAgaagaagacaggtcgggctaagcggCggatgcagtacaaccggcgctt-tgtcaacgtTgtgcccacctttggcaagaagaagggccccaatgccaactcttaagtctttTgtaattctggc
+ttTctctaataaaaaagcCacttagttca
+Gcc-AAAAaaaaaaaaaaaaaaaaaaaaaagtggg-ggGgggCCgCga
+TccGTCTCaaaaaaaaaaaaaaaaaaaaaagtgggAggCgggA-g-ga
+sim4end
+
+
+
+On the latest run (with 85%, 10k filtering):
+
+int: 206
+sma: 147(First:10,Last:20,oneF:66,oneL:51,oneB:0)
+big 1184(First:316,Last:325,One:543)
+tot 4297418
+ff=3,fc=7 lf=10,lc=10
+
+
+
+
+There are still ~1300 matches with big introns. See:
+
+-rw-rw-r-- 1 walenz assembly 982915 Jun 24 18:22 big-exon-after-big-intron
+-rw-rw-r-- 1 walenz assembly 738089 Jun 24 18:22 big-exon-after-big-oneintron
+-rw-rw-r-- 1 walenz assembly 45228 Jun 24 18:22 sma-exon-after-big-intron
+-rw-rw-r-- 1 walenz assembly 161196 Jun 24 18:22 sma-exon-after-big-oneintron
+
+
+
+
+
+sim4begin
+3252894[607-0-0] 90[582719-1347738] <605-0-99-forward-forward>
+edef=>CRA|107000020413693 /altid=gi|9345515 /dataset=dbest /taxon=9606 /org=Homo sapiens /date=07/21
+/2000 /altid=gb_acc|BE409065.1 /organ=placenta /tissue_type=choriocarcinoma /length=607 /clone_end=5
+' /def=601301223F1 NIH_MGC_21 Homo sapiens cDNA clone IMAGE:3635909 5', mRNA sequence.
+ddef=>CRA|GA_x9V1BB6:1..4925599 /organism=Homo sapiens /order=1 /ga_uid=332442982 /len=4925599
+1-172 (2001-2172) <172-0-100> ->
+173-340 (756341-756508) <167-0-99> ->
+341-401 (758205-758265) <61-0-100> ->
+402-424 (758575-758597) <23-0-100> ->
+425-500 (759007-759082) <76-0-100> ->
+501-607 (762918-763025) <106-0-97>
+tgctgcctgtgtagttgcagccgcggccgcctcccgccagctcgcctcggggaacaggacgcgcgtgagctcaggcgtccccgccccagcttttctcgga
+accatgaaccccaactgcgcccggtgcggcaagatcgtgtatcccacggagaaggtgaactgtctggataag
+tgctgcctgtgtagttgcagccgcggccgcctcccgccagctcgcctcggggaacaggacgcgcgtgagctcaggcgtccccgccccagcttttctcgga
+accatgaaccccaactgcgcccggtgcggcaagatcgtgtatcccacggagaaggtgaactgtctggataag
+cccgccgcctgcgcgggggagcccagcacagaccgccgccgggaccccgagtcgcgcaccccagccccaccgGccaccccgcgcgccatggaccccaagg
+accgcaagaagatccagttctcggtgcccgcgccccctagccagctcgacccccgccaggtggagatg
+cccgccgcctgcgcgggggagcccagcacagaccgccgccgggaccccgagtcgcgcaccccagccccaccgCccaccccgcgcgccatggaccccaagg
+accgcaagaagatccagttctcggtgcccgcgccccctagccagctcgacccccgccaggtggagatg
+atccggcgcaggagaccaacgcctgccatgctgttccggctctcagagcactcctcaccag
+atccggcgcaggagaccaacgcctgccatgctgttccggctctcagagcactcctcaccag
+aggaggaagcctccccccaccag
+aggaggaagcctccccccaccag
+agagcctcaggagaggggcaccatctcaagtcgaagagacccaacccctgtgcctacacaccaccttcgctgaaag
+agagcctcaggagaggggcaccatctcaagtcgaagagacccaacccctgtgcctacacaccaccttcgctgaaag
+ctgtgcagcgcattgctgagtctcacctgcagtctatcagcaatttgaatgagaaccaggc-tcagaggaggaggatgagctgggggagcttcgggagct
+gg-ttatcA
+ctgtgcagcgcattgctgagtctcacctgcagtctatcagcaatttgaatgagaaccaggcCtcagaggaggaggatgagctgggggagcttcgggagct
+ggGttatc-
+sim4end
+
+
+sim4begin
+1618397[849-0-0] 1420[13169688-14270773] <765-0-98-complement-unknown>
+edef=>CRA|225000001589124 /altid=gi|15746938 /dataset=dbest /taxon=9606 /org=Homo sapiens /date=09/2
+5/2001 /altid=gb_acc|BI755360.1 /organ=brain /tissue_type= /length=849 /clone_end=5' /def=603024964F
+1 NIH_MGC_114 Homo sapiens cDNA clone IMAGE:5195750 5', mRNA sequence.
+ddef=>CRA|GA_x54KRE8WCJ9:1..15664065 /organism=Homo sapiens /order=1 /ga_uid=181000064676425 /len=15
+664065
+1-120 (1988-2108) <117-0-96> <-
+121-259 (2551-2688) <135-0-97> <-
+260-385 (94537-94662) <125-0-99> <-
+386-629 (222351-222595) <242-0-98> ==
+703-849 (1098940-1099085) <146-0-99>
+ctggtttcttcG-tgaaccactggaattcagccatggggactgcagaggcttcacagctcaggatgcccttctgCcGgactgaaacaccagtgttcttgg
+cttttgagatatagggaggat
+ctggtttcttcCTtgaaccactggaattcagccatggggactgcagaggcttcacagctcaggatgcccttctgAcCgactgaaacaccagtgttcttgg
+cttttgagatatagggaggat
+agttGacagtgatttGtGactttccgcacatcgggcgcagcgacatcgttcaaGgcgctgcattcgtactccccggactggtctcgcttgatgtcagaga
+tctccaggtactcatcctcacttacaaagccctggcctt
+agttTacagtgattt-tTactttccgcacatcgggcgcagcgacatcgttcaaCgcgctgcattcgtactccccggactggtctcgcttgatgtcagaga
+tctccaggtactcatcctcacttacaaagccctggcctt
+cGttgactgacaggtgtctccatgtcacagttggctctggtctgccaatagcaagacacagcagggtcacactgcttccctcattcacagtgatgtctga
+ggagatattcatgatctgaggaggaa
+cCttgactgacaggtgtctccatgtcacagttggctctggtctgccaatagcaagacacagcagggtcacactgcttccctcattcacagtgatgtctga
+ggagatattcatgatctgaggaggaa
+cttgcactattaggtgaacccgggacgttttgggatgattgtctgtctgcacagagcaggtgtacggaccttcgtcatacacatccacattttggatcat
+gatgctgtactgggttggtgtattgaccaggatgatcacacgagggtctatggaccacttgtcattcccagcgtagaggatggtgctgcggtttagccag
+gccacccgggttacccggtcatctatggtacacctg-agGgTggc
+cttgcactattaggtgaacccgggacgttttgggatgattgtctgtctgcacagagcaggtgtacggaccttcgtcatacacatccacattttggatcat
+gatgctgtactgggttggtgtattgaccaggatgatcacacgagggtctatggaccacttgtcattcccagcgtagaggatggtgctgcggtttagccag
+gccacccgggttacccggtcatctatggtacacctgCagTgAggc
+cctgggatgaagagcagggcagttgtcgccgagaagacgacccagtaggcaggatggtacatctcgacgctgcggtgctctcagCctgccgggcttgcta
+ctgcttctgctgctgctaccgctgctgccttcctctgtgctgaattc
+cctgggatgaagagcagggcagttgtcgccgagaagacgacccagtaggcaggatggtacatctcgacgctgcggtgctctcag-ctgccgggcttgcta
+ctgcttctgctgctgctaccgctgctgccttcctctgtgctgaattc
+sim4end
+
+
+sim4begin
+2694118[754-0-0] 1420[13169772-14270767] <550-0-97-complement-unknown>
+edef=>CRA|222000001431581 /altid=gi|15437350 /dataset=dbest /taxon=9606 /org=Homo sapiens /date=09/0
+5/2001 /altid=gb_acc|BI550038.1 /organ=brain /tissue_type=hippocampus /length=754 /clone_end=5' /def
+=603192502F1 NIH_MGC_95 Homo sapiens cDNA clone IMAGE:5263800 5', mRNA sequence.
+ddef=>CRA|GA_x54KRE8WCJ9:1..15664065 /organism=Homo sapiens /order=1 /ga_uid=181000064676425 /len=15
+664065
+1-30 (1994-2024) <30-0-96> <-
+31-166 (2467-2602) <133-0-97> ==
+286-536 (222259-222511) <246-0-97> ==
+610-754 (1098856-1099000) <141-0-97>
+gtgttc-tggcttttgagatatagggaggat
+gtgttcTtggcttttgagatatagggaggat
+aGgtttacagtgattttAacttCccgcacatcgggcgcagcgacatcgttcaacgcgctgcattcgtactccccggact-gtctcgcttgatgtcagaga
+tctccaggtactcatcctcacttacaaagccctggcc
+a-gtttacagtgattttTacttTccgcacatcgggcgcagcgacatcgttcaacgcgctgcattcgtactccccggactGgtctcgcttgatgtcagaga
+tctccaggtactcatcctcacttacaaagccctggcc
+ggaGGAa-cttgcactattaggtgaacccgggacgttttgggatgattgtctgtctgcacagagcaggtgtacggaccttcgtcatacacatccacattt
+tggatcatgatgctgtactgggttggtgtattgaccaggatgatcacacgagggtctatggaccacttgtcattcccagcgtagaggatggtgctgcggt
+ttagccaggccacccgggttacccggtcatctatggtacacctg-agGgTggc
+ggaCTTaCcttgcactattaggtgaacccgggacgttttgggatgattgtctgtctgcacagagcaggtgtacggaccttcgtcatacacatccacattt
+tggatcatgatgctgtactgggttggtgtattgaccaggatgatcacacgagggtctatggaccacttgtcattcccagcgtagaggatggtgctgcggt
+ttagccaggccacccgggttacccggtcatctatggtacacctgCagTgAggc
+cctgggatgaagagcagggcagttgtcgccgagaagacgacccagtaggcaggatggtacatctcgacgctgcggtgctctcagctgccgggcttgctac
+tgcttctgctgctgctaccgctgctgccttcctctgtgctCCGCt
+cctgggatgaagagcagggcagttgtcgccgagaagacgacccagtaggcaggatggtacatctcgacgctgcggtgctctcagctgccgggcttgctac
+tgcttctgctgctgctaccgctgctgccttcctctgtgctGAATt
+sim4end
+
+
+
+sim4begin
+134996[500-0-0] 1442[3243352-11655873] <385-0-96-complement-unknown>
+edef=>CRA|1000482720785 /altid=gi|4189471 /dataset=dbest /taxon=9606 /org=Homo sapiens /date=03/18/1
+999 /altid=gb_acc|AI379618.1 /organ=mixed (see below) /tissue_type=Pooled human melanocyte, fetal he
+art, and pregnant uterus /length=500 /clone_end=3' /def=tc58d12.x1 Soares_NhHMPu_S1 Homo sapiens cDN
+A clone IMAGE:2068823 3' similar to TR:Q13538 Q13538 ORF2: FUNCTION UNKNOWN. ;, mRNA sequence.
+ddef=>CRA|GA_x54KRE902N0:1..24267006 /organism=Homo sapiens /order=1 /ga_uid=181000064731840 /len=24
+267006
+27-82 (6826861-6826916) <50-0-89> ==
+162-205 (6826997-6827040) <40-0-88> <-
+206-500 (8410225-8410521) <295-0-99>
+tactcCtggtgaagatgctGCGaacattgttgaCatgaTaacaaaggatttagaat
+tactcTtggtgaagatgctATTaacattgttgaGatgaCaacaaaggatttagaat
+taaaatgctatcaaacagcaTcA-catActacagaAaaatctttc
+taaaatgctatcaaacagca-cTGcatGctacagaGaaatctttc
+atgaaaaagagtcaatcgattcaagctt-cattgttgcctttattttaagaaattaccacaaccaccccaaccttcagcaaccaccatcctgatcagtcc
+acaggcatcaacatggaccgaacaccctccaccagcaaaaagattagaacttgctgaaggcttagtttattgttagcattt-cttagcaacaaagtattt
+ttaataaaagtttttaatttaatgatttgtttgacataatgctattacacatttagtagactacagtatggtataagcagaacttttacatacatta
+atgaaaaagagtcaatcgattcaagcttCcattgttgcctttattttaagaaattaccacaaccaccccaaccttcagcaaccaccatcctgatcagtcc
+acaggcatcaacatggaccgaacaccctccaccagcaaaaagattagaacttgctgaaggcttagtttattgttagcatttCcttagcaacaaagtattt
+ttaataaaagtttttaatttaatgatttgtttgacataatgctattacacatttagtagactacagtatggtataagcagaacttttacatacatta
+sim4end
diff --git a/sim4dbutils/Make.include b/sim4dbutils/Make.include
new file mode 100644
index 0000000..e5c6876
--- /dev/null
+++ b/sim4dbutils/Make.include
@@ -0,0 +1,120 @@
+# -*- makefile -*-
+
+# Build description for the sim4db polish utilities: one executable per
+# source file listed below, each linked against the four project libraries.
+#
+# NOTE(review): "$/" is the make variable named "/"; it is not set in this
+# fragment -- presumably the including build system sets it to this
+# directory. Confirm against Make.rules.
+
+# Directories of the libraries, resolved relative to $/.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+
+# All sources in this directory. s4p_overlap.C has no executable of its own;
+# it is linked into comparePolishes and removeRedundant below.
+src := $/cleanPolishes.C \
+ $/fixPolishesIID.C \
+ $/comparePolishes.C \
+ $/convertToAtac.C \
+ $/convertToExtent.C \
+ $/convertPolishes.C \
+ $/detectChimera.C \
+ $/depthOfPolishes.C \
+ $/filterPolishes.C \
+ $/headPolishes.C \
+ $/mappedCoverage.C \
+ $/mergePolishes.C \
+ $/parseSNP.C \
+ $/pickBestPolish.C \
+ $/pickBestPair.C \
+ $/pickUniquePolish.C \
+ $/plotCoverageVsIdentity.C \
+ $/removeDuplicate.C \
+ $/sortPolishes.C \
+ $/summarizePolishes.C \
+ $/uniqPolishes.C \
+ $/vennPolishes.C \
+ $/realignPolishes.C \
+ $/removeRedundant.C \
+ $/reportAlignmentDifferences.C \
+ $/s4p_overlap.C
+
+# Split the sources by suffix. Every entry in src ends in .C, so the .c
+# (plain C) list is empty.
+$/.C_SRCS :=${filter %.c,${src}}
+$/.CXX_SRCS :=${filter %.C,${src}}
+
+# Object lists derived from the two source lists.
+obj_c := ${$/.C_SRCS:.c=.o}
+obj_C := ${$/.CXX_SRCS:.C=.o}
+
+# always using c++ to link
+$/.CXX_EXES := $/cleanPolishes \
+ $/fixPolishesIID \
+ $/comparePolishes \
+ $/convertToAtac \
+ $/convertToExtent \
+ $/convertPolishes \
+ $/detectChimera \
+ $/depthOfPolishes \
+ $/filterPolishes \
+ $/headPolishes \
+ $/mappedCoverage \
+ $/mergePolishes \
+ $/parseSNP \
+ $/pickBestPolish \
+ $/pickBestPair \
+ $/pickUniquePolish \
+ $/plotCoverageVsIdentity \
+ $/removeDuplicate \
+ $/sortPolishes \
+ $/summarizePolishes \
+ $/uniqPolishes \
+ $/vennPolishes \
+ $/realignPolishes \
+ $/removeRedundant \
+ $/reportAlignmentDifferences
+
+# Files listed for cleaning.
+$/.CLEAN := $/*.o
+
+# Add the library directories to the include path for the .d and .o targets
+# under $/ only (pattern-specific CFLAGS and CXXFLAGS).
+$(eval $/%.d $/%.o: CFLAGS+=-I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBSIM4/})
+$(eval $/%.d $/%.o: CXXFLAGS+=-I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBSIM4/})
+
+# Objects each executable is linked from.
+# NOTE(review): plotIntronSize and trimSequencesBasedOnMatches have rules
+# here but appear in neither src nor $/.CXX_EXES above.
+$/filterPolishes : $/filterPolishes.o
+$/headPolishes : $/headPolishes.o
+$/mappedCoverage : $/mappedCoverage.o
+$/mergePolishes : $/mergePolishes.o
+$/sortPolishes : $/sortPolishes.o
+$/pickBestPolish : $/pickBestPolish.o
+$/pickBestPair : $/pickBestPair.o
+$/pickUniquePolish : $/pickUniquePolish.o
+$/cleanPolishes : $/cleanPolishes.o
+$/fixPolishesIID : $/fixPolishesIID.o
+$/plotIntronSize : $/plotIntronSize.o
+$/plotCoverageVsIdentity : $/plotCoverageVsIdentity.o
+$/parseSNP : $/parseSNP.o
+$/comparePolishes : $/comparePolishes.o $/s4p_overlap.o
+$/convertToAtac : $/convertToAtac.o
+$/convertToExtent : $/convertToExtent.o
+$/convertPolishes : $/convertPolishes.o
+$/depthOfPolishes : $/depthOfPolishes.o
+$/detectChimera : $/detectChimera.o
+$/trimSequencesBasedOnMatches : $/trimSequencesBasedOnMatches.o
+$/uniqPolishes : $/uniqPolishes.o
+$/summarizePolishes : $/summarizePolishes.o
+$/removeDuplicate : $/removeDuplicate.o
+$/vennPolishes : $/vennPolishes.o
+$/realignPolishes : $/realignPolishes.o
+$/removeRedundant : $/removeRedundant.o $/s4p_overlap.o
+$/reportAlignmentDifferences : $/reportAlignmentDifferences.o
+
+# Yeah, not everyone needs all these libraries. Live with it.
+#
+# NOTE(review): $/.C_EXES is never assigned in this fragment, so only the
+# C++ executables receive these prerequisites unless it is set elsewhere.
+${$/.C_EXES} ${$/.CXX_EXES}: ${LIBSIM4/}libsim4.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+
+# Run the test cases for parseSNP
+#
+# Runs sim4db on the fasta files in the test/ directory next to parseSNP,
+# feeds the result to parseSNP, and diffs the parsed output against the
+# stored correct output. Any difference is printed and fails the target
+# with exit status 13; otherwise the generated files are removed.
+#
+# The shell variable t (the test directory) is set with "t=...;" at the
+# start of each recipe line and referenced as $$t so that the shell, not
+# make, expands it. Written as $t, make substitutes its own (empty)
+# variable t and every path starts at the filesystem root; the
+# "t=value command" form does not help either, because the shell expands
+# the command's arguments before that assignment takes effect.
+#
+# NOTE(review): ${SIM4DB/} is not set in this fragment; it must come from
+# the including makefiles.
+$/.parseSNP-test: $/parseSNP ${SIM4DB/}sim4db
+	@ echo See page 85 in Big Bad Bri IR#2 for notes on parseSNP-test.
+	@ t=`dirname $<`/test; ${SIM4DB/}sim4db -align -cdna $$t/parsesnp-snp.fasta -genomic $$t/parsesnp-gen.fasta -o $$t/parsesnp-sim4.out
+	@ t=`dirname $<`/test; $< -O $$t/parsesnp-good -F $$t/parsesnp-fail < $$t/parsesnp-sim4.out
+	@-t=`dirname $<`/test; diff $$t/parsesnp-good $$t/parsesnp-correct-parsed > $$t/parsesnp-diffs
+	@ t=`dirname $<`/test; \
+	if test -s $$t/parsesnp-diffs ; then \
+	  echo "parseSNP tests FAILED" ; \
+	  cat $$t/parsesnp-diffs ; \
+	  exit 13 ; \
+	else \
+	  echo "parseSNP tests passed" ; \
+	  rm -f $$t/parsesnp-good $$t/parsesnp-fail $$t/parsesnp-sim4.out $$t/parsesnp-diffs $$t/parsesnp-gen.fastaidx $$t/parsesnp-snp.fastaidx; \
+	fi
+
diff --git a/sim4dbutils/README b/sim4dbutils/README
new file mode 100644
index 0000000..db1955c
--- /dev/null
+++ b/sim4dbutils/README
@@ -0,0 +1,25 @@
+Five sim4-related utilities:
+
+------------------------------------------------------------
+filterPolishes.c
+ Filters polishes by percent identity and composite. Writes output to
+ stdout or a file, takes input from stdin.
+
+------------------------------------------------------------
+mergePolishes.C
+ Merges multiple sim4db outputs. The output must be from the
+ same genomic file, with different cDNA files.
+
+------------------------------------------------------------
+pickBestPolish.c
+ Picks the best polish. Input is stdin, output is stdout. No options.
+
+------------------------------------------------------------
+sortPolishes.c
+ Sorts polishes by ESTid or GENid. Input is stdin, output is stdout.
+ Takes "-n N" to set the maximum number of polishes in the input.
+
+------------------------------------------------------------
+stripPolishes.c
+ Removes deflines and alignments from a polish file.
+
diff --git a/sim4dbutils/cleanPolishes-20020626.C b/sim4dbutils/cleanPolishes-20020626.C
new file mode 100644
index 0000000..408ccbf
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-20020626.C
@@ -0,0 +1,302 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4reader.h"
+
+#define SHOWTRIMMING
+
+//  Usage text: a printf-style format whose single %s takes the program name.
+//  main() compares only the first two characters of each option, so the
+//  "-save" spelling on the first line and "-savesplits" select the same flag.
+char const *usage =
+"usage: %s [-save splitFile] [-threshold t]\n"
+" -threshold Introns bigger than this are split into two matches (default = 150000).\n"
+" -savesplits Saves a before/after of each split match.\n"
+" All matches are printed to stdout (untrimmed and trimmed).\n"
+"\n";
+
+
+
+//  Returns true when the sequence s looks low-complexity: its three most
+//  frequent adjacent base pairs (dinucleotides) account for more than 75% of
+//  all adjacent pairs.  Returns false for a null, empty or one-base string.
+//  Bases other than A/C/G/T (either case) are all counted as one "other" class.
+//
+//  NOTE(review): sequences with more than 50 pairs are never reported as low
+//  complexity (the original forced their quality to zero) -- confirm this
+//  cutoff is intended and not an inverted comparison.
+//
+bool
+lowComplexityExon(char *s) {
+  int     cnt[5][5] = {{0}};   //  cnt[x][y] = occurrences of base x followed by y
+  int     map[256]  = {0};     //  base -> 1..4; anything else stays 0
+  int     len       = 0;       //  number of adjacent pairs examined
+  int     a=0, b=0, c=0;       //  three largest pair counts, a >= b >= c
+
+  //  A string with fewer than two characters has no pairs.  Testing s[0]
+  //  first also avoids reading s[1], past the terminator of an empty string.
+  if ((s == 0L) || (s[0] == 0) || (s[1] == 0))
+    return(false);
+
+  map['A'] = map['a'] = 1;
+  map['C'] = map['c'] = 2;
+  map['G'] = map['g'] = 3;
+  map['T'] = map['t'] = 4;
+
+  //  Index through unsigned char: plain char may be signed, and a byte
+  //  >= 0x80 would otherwise become a negative index into map[].
+  for (const unsigned char *p = (const unsigned char *)s; p[1]; p++) {
+    cnt[map[p[0]]][map[p[1]]]++;
+    len++;
+  }
+
+  for (int i=0; i<5; i++) {
+    for (int j=0; j<5; j++) {
+      if (a < cnt[i][j]) {
+        c = b;
+        b = a;
+        a = cnt[i][j];
+      } else if (b < cnt[i][j]) {
+        c = b;
+        b = cnt[i][j];
+      } else if (c < cnt[i][j]) {
+        c = cnt[i][j];
+      }
+    }
+  }
+
+  //  Long sequences are exempt; see the NOTE above.
+  if (len > 50)
+    return(false);
+
+  return((double)(a+b+c) / (double)(len) > 0.75);
+}
+
+
+
+
+//  Counters describing how the input polishes were classified.
+struct splitStats {
+  int totMatches;    //  polishes read
+  int oneExon;       //  single-exon polishes (nothing to split)
+  int smaIntron;     //  largest intron is below the size cutoff
+  int junkFirst;     //  no good exon before the largest intron
+  int junkLast;      //  no good exon after the largest intron
+  int junkBoth;      //  no good exon on either side
+  int splitOnGap;    //  largest intron is flagged INTRON_GAP
+  int goodQual;      //  big intron, but strand and identities look fine
+  int flanking;      //  everything else
+};
+
+
+//  Writes the counters to stderr on one line, terminated by eol ('\r' for
+//  the in-place progress display, '\n' for the final summary).
+static
+void
+reportStats(const struct splitStats *st, char eol) {
+  fprintf(stderr,
+          "tot: %7d one: %7d sma: %7d jnkF: %7d jnkL: %7d jnkB: %7d onGap: %7d good: %7d flank: %7d%c",
+          st->totMatches, st->oneExon, st->smaIntron, st->junkFirst, st->junkLast,
+          st->junkBoth, st->splitOnGap, st->goodQual, st->flanking, eol);
+}
+
+
+//  Opens name for writing, or reports the failure and exits.  The result of
+//  fopen() is tested directly; errno is only consulted for the message.
+static
+FILE *
+openForWriting(const char *name) {
+  errno = 0;
+  FILE *F = fopen(name, "w");
+  if (F == 0L) {
+    fprintf(stderr, "Can't open '%s' for writing\n%s\n", name, strerror(errno));
+    exit(1);
+  }
+  return(F);
+}
+
+
+//  True if exon i of p covers at least 50bp of the EST, is at least 88%
+//  identical, and is not low complexity.
+static
+bool
+goodExon(sim4polish *p, int i) {
+  return((p->exons[i].estTo - p->exons[i].estFrom + 1 >= 50) &&
+         (p->exons[i].percentIdentity >= 88) &&
+         (lowComplexityExon(p->exons[i].estAlignment) == false));
+}
+
+
+//  Reads sim4 polishes from stdin and writes every polish whose largest
+//  intron is at least 100000 to one of the spl.* files in the current
+//  directory, printing running and final counts to stderr.
+//
+//  NOTE(review): despite the usage text, nothing is written to stdout, the
+//  -savesplits file is opened but never written, and the -threshold value is
+//  parsed but unused (the cutoff is the fixed 100000 below).  Confirm which
+//  behavior is wanted before wiring the options in.
+//
+int
+main(int argc, char ** argv) {
+  FILE              *splitFile   = 0L;
+  int                intronLimit = 150000;
+  struct splitStats  st          = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+  sim4polish        *p;
+
+  for (int arg=1; arg < argc; arg++) {
+    bool isSave   = (strncmp(argv[arg], "-savesplits", 2) == 0);
+    bool isThresh = (strncmp(argv[arg], "-threshold",  2) == 0);
+
+    //  Unrecognized arguments are ignored.
+    if ((isSave == false) && (isThresh == false))
+      continue;
+
+    //  Both options take a value; refuse to read past the end of argv.
+    if (arg + 1 >= argc) {
+      fprintf(stderr, usage, argv[0]);
+      fprintf(stderr, "error: option '%s' needs a value.\n", argv[arg]);
+      exit(1);
+    }
+
+    arg++;
+
+    if (isSave)
+      splitFile = openForWriting(argv[arg]);
+    else
+      intronLimit = atoi(argv[arg]);
+  }
+
+  (void)intronLimit;   //  parsed but not used; see the NOTE above
+
+  FILE *junkF  = openForWriting("spl.junkfirst");
+  FILE *junkL  = openForWriting("spl.junklast");
+  FILE *junkB  = openForWriting("spl.junkboth");
+  FILE *splGap = openForWriting("spl.splitGap");
+  FILE *good   = openForWriting("spl.good");
+  FILE *flank  = openForWriting("spl.flanking");
+
+  while ((p = readPolish(stdin)) != 0L) {
+    if (p->numExons == 1) {
+      st.oneExon++;
+    } else {
+
+      //  Find the big intron.  We assume there is only one big intron.
+      //
+      int biggestIntron = 0;
+      int intronSplit   = 0;   //  index of the first exon after that intron
+      int intronOri     = 0;
+
+      for (int exA=0, exB=1; exB < p->numExons; exA++, exB++) {
+        int dist = p->exons[exB].genFrom - p->exons[exA].genTo + 1;
+        if (dist > biggestIntron) {
+          biggestIntron = dist;
+          intronSplit   = exB;
+          intronOri     = p->exons[exA].intronOrientation;
+        }
+      }
+
+      if (intronOri == 0) {
+        fprintf(stderr, "didn't find the largest intron? (got zero)?\n");
+        exit(1);
+      }
+
+      if (intronOri == INTRON_NONE) {
+        fprintf(stderr, "biggest intron isn't an intron? (got none)?\n");
+        exit(1);
+      }
+
+      if (biggestIntron < 100000) {
+        st.smaIntron++;
+      } else {
+
+        //  A side of the split is junk when none of its exons passes
+        //  goodExon().
+        //
+        bool killFirst = true;
+        bool killLast  = true;
+
+        for (int i=0; i<intronSplit; i++)
+          if (goodExon(p, i))
+            killFirst = false;
+
+        for (int i=intronSplit; i<p->numExons; i++)
+          if (goodExon(p, i))
+            killLast = false;
+
+        if (killFirst && killLast) {
+          //  We shouldn't ever want to kill both sides.
+          st.junkBoth++;
+          fprintf(junkB, "==============================JUNK FIRST AND LAST?\n");
+          printPolish(junkB, p);
+        } else if (killFirst) {
+          st.junkFirst++;
+          printPolish(junkF, p);
+          fprintf(junkF, "==============================\n");
+        } else if (killLast) {
+          st.junkLast++;
+          printPolish(junkL, p);
+          fprintf(junkL, "==============================\n");
+        } else if (intronOri == INTRON_GAP) {
+          st.splitOnGap++;
+          printPolish(splGap, p);
+          fprintf(splGap, "==============================\n");
+        } else {
+
+          //  If there is a valid strand prediction and all exons are >= 95%
+          //  identity, except the first and last, which can be >= 90%, the
+          //  match looks good and just has a large intron: file it under
+          //  spl.good.  Anything else goes to spl.flanking.
+          //
+          bool validStrand = ((p->strandOrientation == STRAND_POSITIVE) ||
+                              (p->strandOrientation == STRAND_NEGATIVE));
+
+          bool qualIsC = true;
+          if (p->exons[0].percentIdentity < 90)
+            qualIsC = false;
+          if (p->exons[p->numExons-1].percentIdentity < 90)
+            qualIsC = false;
+          for (int exA=1; exA < p->numExons-1; exA++)
+            if (p->exons[exA].percentIdentity < 95)
+              qualIsC = false;
+
+          if (validStrand && qualIsC) {
+            printPolish(good, p);
+            fprintf(good, "==============================\n");
+            st.goodQual++;
+          } else {
+            st.flanking++;
+            printPolish(flank, p);
+            fprintf(flank, "==============================\n");
+          }
+        }
+
+      }  //  Has a big intron
+    }  //  More than one exon
+
+    //  Refresh the progress line every 3759 polishes.
+    st.totMatches++;
+    if ((st.totMatches % 3759) == 0)
+      reportStats(&st, '\r');
+
+    destroyPolish(p);
+  }
+
+  fclose(junkF);
+  fclose(junkL);
+  fclose(junkB);
+  fclose(splGap);
+  fclose(good);
+  fclose(flank);
+
+  if (splitFile != 0L)
+    fclose(splitFile);
+
+  reportStats(&st, '\n');
+
+  return(0);
+}
diff --git a/sim4dbutils/cleanPolishes-experiments/dbEST-intronSize-histogram b/sim4dbutils/cleanPolishes-experiments/dbEST-intronSize-histogram
new file mode 100644
index 0000000..2055714
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/dbEST-intronSize-histogram
@@ -0,0 +1,45484 @@
+3326570
+1192472
+569706
+381713
+233305
+156017
+130697
+91773
+65684
+54975
+48528
+36169
+34014
+30591
+21371
+22180
+17177
+18572
+14307
+11074
+9758
+9740
+10932
+7420
+9510
+6956
+7878
+7106
+5811
+4556
+6366
+5035
+4344
+4265
+5352
+5675
+4935
+4286
+3514
+3827
+2544
+2666
+2852
+2739
+1453
+1793
+2416
+1649
+1588
+2023
+1183
+1777
+1338
+1416
+1517
+1087
+1330
+1482
+1153
+1325
+966
+825
+1003
+810
+972
+988
+677
+1127
+1394
+1335
+884
+897
+439
+407
+772
+1037
+746
+405
+425
+620
+919
+305
+458
+610
+692
+764
+563
+328
+454
+695
+327
+367
+342
+671
+415
+590
+1137
+263
+175
+329
+459
+307
+203
+125
+533
+153
+246
+290
+136
+100
+246
+150
+194
+132
+113
+115
+325
+253
+186
+140
+185
+103
+110
+59
+157
+72
+131
+166
+201
+148
+76
+66
+46
+134
+208
+38
+38
+34
+72
+98
+37
+69
+53
+30
+81
+90
+80
+44
+108
+40
+65
+239
+24
+101
+41
+122
+136
+123
+63
+60
+58
+50
+20
+64
+38
+100
+52
+46
+20
+50
+36
+113
+30
+46
+54
+20
+3
+80
+15
+44
+32
+11
+9
+40
+18
+52
+26
+64
+16
+24
+28
+12
+6
+10
+14
+13
+26
+12
+94
+7
+30
+12
+8
+14
+10
+15
+18
+10
+6
+4
+40
+6
+20
+10
+6
+18
+26
+4
+2
+16
+14
+9
+20
+58
+14
+51
+18
+14
+4
+12
+26
+32
+18
+22
+8
+6
+4
+8
+2
+8
+26
+14
+10
+4
+12
+6
+16
+6
+4
+6
+2
+0
+8
+4
+16
+20
+2
+20
+9
+2
+2
+4
+4
+2
+32
+16
+4
+2
+4
+10
+8
+16
+36
+2
+0
+2
+8
+0
+2
+2
+0
+6
+52
+34
+24
+9
+1
+0
+16
+4
+6
+2
+0
+2
+10
+0
+2
+8
+12
+12
+92
+0
+2
+8
+2
+12
+2
+6
+0
+0
+4
+0
+2
+10
+2
+0
+2
+2
+12
+2
+0
+0
+2
+0
+0
+7
+0
+4
+80
+10
+0
+0
+2
+6
+2
+2
+4
+2
+2
+0
+2
+0
+2
+0
+0
+2
+2
+10
+0
+0
+2
+4
+2
+4
+0
+6
+0
+0
+14
+0
+2
+2
+4
+4
+0
+2
+16
+2
+6
+2
+2
+4
+4
+2
+0
+2
+4
+0
+0
+4
+4
+4
+0
+2
+2
+2
+2
+0
+0
+0
+0
+2
+72
+0
+4
+2
+2
+0
+0
+0
+0
+0
+2
+6
+0
+2
+6
+2
+0
+10
+2
+0
+0
+2
+2
+0
+0
+2
+0
+4
+0
+0
+0
+0
+0
+0
+0
+4
+2
+0
+2
+0
+0
+0
+2
+0
+0
+0
+0
+4
+4
+0
+0
+0
+0
+2
+0
+0
+0
+0
+6
+2
+0
+0
+0
+0
+0
+0
+2
+0
+4
+2
+0
+0
+8
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+2
+0
+0
+2
+6
+2
+0
+2
+2
+0
+0
+4
+0
+0
+2
+0
+0
+0
+2
+2
+6
+1
+0
+4
+2
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+2
+2
+0
+14
+0
+8
+8
+0
+4
+2
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+12
+0
+0
+4
+0
+2
+0
+0
+2
+0
+0
+0
+2
+4
+0
+0
+0
+0
+4
+6
+0
+0
+0
+0
+2
+0
+2
+6
+0
+2
+0
+2
+2
+2
+2
+0
+2
+0
+0
+2
+2
+0
+4
+0
+0
+2
+0
+0
+0
+0
+4
+0
+0
+4
+2
+2
+0
+0
+6
+0
+0
+0
+0
+0
+0
+2
+2
+0
+2
+0
+0
+0
+0
+2
+6
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+14
+0
+0
+2
+0
+0
+4
+0
+2
+2
+0
+0
+0
+0
+0
+0
+0
+12
+0
+0
+2
+2
+4
+0
+0
+0
+2
+0
+0
+0
+2
+0
+6
+0
+0
+0
+0
+10
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+8
+2
+0
+2
+2
+0
+0
+4
+0
+0
+2
+2
+0
+2
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+4
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+4
+0
+0
+0
+2
+0
+0
+0
+4
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+2
+0
+4
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+6
+0
+0
+0
+0
+0
+0
+2
+2
+0
+0
+0
+0
+0
+0
+2
+0
+2
+4
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+4
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+18
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+2
+2
+0
+4
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+2
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+2
+0
+4
+0
+0
+0
+0
+2
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+2
+4
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+20
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+20
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+6
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+2
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+4
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh-gnuplot b/sim4dbutils/cleanPolishes-experiments/evalThresh-gnuplot
new file mode 100644
index 0000000..8e0f462
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh-gnuplot
@@ -0,0 +1,23 @@
+# Plot columns 3-9 of evalThresh.dat twice: auto-scaled, then with y limited to [0:10000].
+set terminal postscript color
+set output "evalThresh.ps"
+plot \
+ "evalThresh.dat" using 3 title "allSmallIntron/100" with lines, \
+ "evalThresh.dat" using 4 title "good" with lines, \
+ "evalThresh.dat" using 5 title "probably good" with lines, \
+ "evalThresh.dat" using 6 title "junkExonsLeft" with lines, \
+ "evalThresh.dat" using 7 title "junkExonsRight" with lines, \
+ "evalThresh.dat" using 8 title "junkExonsBoth" with lines, \
+ "evalThresh.dat" using 9 title "intronOnGap" with lines
+plot [][0:10000] \
+ "evalThresh.dat" using 3 title "allSmallIntrons/100" with lines, \
+ "evalThresh.dat" using 4 title "good" with lines, \
+ "evalThresh.dat" using 5 title "probably good" with lines, \
+ "evalThresh.dat" using 6 title "junkExonsLeft" with lines, \
+ "evalThresh.dat" using 7 title "junkExonsRight" with lines, \
+ "evalThresh.dat" using 8 title "junkExonsBoth" with lines, \
+ "evalThresh.dat" using 9 title "intronOnGap" with lines
+# Series not attached to either plot; kept as comments since a bare plot item is not a command.
+# "evalThresh.dat" using 2 title "oneExon/100" with lines,
+# "evalThresh.dat" using 10 title "total/100" with lines
+
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh-plot.pl b/sim4dbutils/cleanPolishes-experiments/evalThresh-plot.pl
new file mode 100644
index 0000000..51d3a6a
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh-plot.pl
@@ -0,0 +1,25 @@
+# Condense evalThresh.pl.out into one tab-separated row per report on stdout:
+# the "at least Nbp" value, then the nine counters in the order listed below.
+open(F, "< evalThresh.pl.out") or die "can't open evalThresh.pl.out: $!\n";
+
+# Patterns for the counter labels, in the order they are read after the header.
+my @labels = ('oneExon', 'allSmall.*', 'good', 'probably\sgood', 'junkExonsLeft',
+              'junkExonsRight', 'junkExonsBoth', 'intronOnGap', 'total');
+
+while (!eof(F)) {
+    $_ = <F>;
+    if (m/at least (\d+)bp/) {
+        print "$1\t";
+        $_ = <F>;                       # discard the line after the header
+        for my $i (0 .. $#labels) {
+            $_ = <F>;
+            if (defined($_) && m/$labels[$i]:\s+(\d+)/) {
+                print $1, ($i < $#labels ? "\t" : "");
+            } else {
+                print STDERR "no ", $i + 1, "\n";   # same "no N" diagnostics as before
+            }
+        }
+        print "\n";
+    }
+}
+close(F);
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh.dat b/sim4dbutils/cleanPolishes-experiments/evalThresh.dat
new file mode 100644
index 0000000..34148ce
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh.dat
@@ -0,0 +1,49 @@
+100 26569.68 15241.88 125574 36280 19490 18815 28 15100 43964.43
+200 26569.68 16348.56 55684 15887 11615 10569 21 10843 43964.43
+300 26569.68 16694.3 34761 9604 9384 8675 19 7602 43964.43
+400 26569.68 16908.98 22957 6061 7761 7483 19 4296 43964.43
+500 26569.68 17006.93 16879 4490 7095 7000 19 3299 43964.43
+600 26569.68 17069.78 12632 3424 6726 6544 19 3152 43964.43
+700 26569.68 17117.3 9407 2600 6440 6268 19 3011 43964.43
+800 26569.68 17149.12 7388 2165 6173 6005 19 2813 43964.43
+900 26569.68 17177.45 5703 1682 5981 5792 17 2555 43964.43
+1000 26569.68 17199.78 4589 1405 5785 5653 17 2048 43964.43
+1100 26569.68 17217.13 3601 1147 5655 5492 17 1850 43964.43
+1200 26569.68 17229.34 2983 867 5566 5353 17 1755 43964.43
+1300 26569.68 17238.96 2464 751 5463 5268 17 1616 43964.43
+1400 26569.68 17247.11 2136 691 5202 5192 17 1526 43964.43
+1500 26569.68 17252.9 1914 621 5135 5123 17 1375 43964.43
+1600 26569.68 17260.33 1550 512 5064 5062 17 1237 43964.43
+1700 26569.68 17265.02 1373 449 5009 4997 17 1128 43964.43
+1800 26569.68 17269.57 1195 410 4957 4918 17 1021 43964.43
+1900 26569.68 17272.81 1087 375 4912 4854 17 949 43964.43
+2000 26569.68 17275.65 990 363 4860 4792 17 888 43964.43
+2100 26569.68 17277.99 952 340 4822 4748 17 797 43964.43
+2200 26569.68 17279.99 890 329 4777 4706 17 757 43964.43
+2300 26569.68 17282.22 807 316 4733 4673 17 707 43964.43
+2400 26569.68 17283.81 751 309 4695 4640 17 682 43964.43
+2500 26569.68 17285.59 713 296 4653 4608 17 629 43964.43
+2600 26569.68 17286.87 681 289 4624 4577 17 600 43964.43
+2700 26569.68 17288.16 654 277 4588 4547 17 576 43964.43
+2800 26569.68 17289.58 627 266 4547 4509 17 551 43964.43
+2900 26569.68 17291.15 569 253 4504 4483 17 534 43964.43
+3000 26569.68 17292.29 547 248 4469 4452 17 513 43964.43
+3100 26569.68 17294.12 511 222 4433 4392 17 488 43964.43
+3200 26569.68 17294.98 497 218 4393 4371 16 482 43964.43
+3300 26569.68 17296.39 458 206 4368 4351 16 437 43964.43
+3400 26569.68 17297.57 449 205 4329 4330 16 389 43964.43
+3500 26569.68 17298.32 442 203 4302 4300 16 380 43964.43
+3600 26569.68 17299.23 431 199 4273 4270 16 363 43964.43
+3700 26569.68 17300.17 412 198 4247 4239 15 347 43964.43
+3800 26569.68 17300.84 406 194 4218 4217 15 341 43964.43
+3900 26569.68 17301.68 401 191 4187 4190 15 323 43964.43
+4000 26569.68 17302.68 379 172 4158 4162 15 321 43964.43
+4100 26569.68 17303.31 366 171 4142 4137 15 313 43964.43
+4200 26569.68 17303.89 364 168 4116 4116 15 307 43964.43
+4300 26569.68 17304.48 361 168 4094 4089 15 300 43964.43
+4400 26569.68 17305.03 358 167 4076 4064 15 292 43964.43
+4500 26569.68 17305.51 355 167 4055 4043 15 289 43964.43
+4600 26569.68 17306.05 351 166 4038 4014 15 286 43964.43
+4700 26569.68 17306.66 349 161 4015 3992 15 277 43964.43
+4800 26569.68 17307.21 348 159 3987 3973 15 272 43964.43
+4900 26569.68 17307.92 343 156 3959 3943 15 267 43964.43
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh.pl b/sim4dbutils/cleanPolishes-experiments/evalThresh.pl
new file mode 100644
index 0000000..f22d16d
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh.pl
@@ -0,0 +1,9 @@
+#!/bin/perl
+# Run splitMatches on /part3/polishes-good once per threshold, 10000 to 490000 in steps of 10000.
+$| = 1;  # autoflush, so each banner is written before the child's own output
+
+for ($threshold = 10000; $threshold < 500000; $threshold += 10000) {
+    print "THRESHOLD = $threshold\n";
+    system("./splitMatches -qquiet -threshold $threshold < /part3/polishes-good") == 0
+        or warn "splitMatches failed at threshold $threshold (status $?)\n";
+}
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh.pl.out b/sim4dbutils/cleanPolishes-experiments/evalThresh.pl.out
new file mode 100644
index 0000000..dd8544e
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh.pl.out
@@ -0,0 +1,637 @@
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 10000bp long.
+
+oneExon: 2656968
+allSmallExons: 1524188
+good: 125574
+probably good: 36280
+junkExonsLeft: 19490
+junkExonsRight: 18815
+junkExonsBoth: 28
+intronOnGap: 15100
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 20000bp long.
+
+oneExon: 2656968
+allSmallExons: 1634856
+good: 55684
+probably good: 15887
+junkExonsLeft: 11615
+junkExonsRight: 10569
+junkExonsBoth: 21
+intronOnGap: 10843
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 30000bp long.
+
+oneExon: 2656968
+allSmallExons: 1669430
+good: 34761
+probably good: 9604
+junkExonsLeft: 9384
+junkExonsRight: 8675
+junkExonsBoth: 19
+intronOnGap: 7602
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 40000bp long.
+
+oneExon: 2656968
+allSmallExons: 1690898
+good: 22957
+probably good: 6061
+junkExonsLeft: 7761
+junkExonsRight: 7483
+junkExonsBoth: 19
+intronOnGap: 4296
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 50000bp long.
+
+oneExon: 2656968
+allSmallExons: 1700693
+good: 16879
+probably good: 4490
+junkExonsLeft: 7095
+junkExonsRight: 7000
+junkExonsBoth: 19
+intronOnGap: 3299
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 60000bp long.
+
+oneExon: 2656968
+allSmallExons: 1706978
+good: 12632
+probably good: 3424
+junkExonsLeft: 6726
+junkExonsRight: 6544
+junkExonsBoth: 19
+intronOnGap: 3152
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 70000bp long.
+
+oneExon: 2656968
+allSmallExons: 1711730
+good: 9407
+probably good: 2600
+junkExonsLeft: 6440
+junkExonsRight: 6268
+junkExonsBoth: 19
+intronOnGap: 3011
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 80000bp long.
+
+oneExon: 2656968
+allSmallExons: 1714912
+good: 7388
+probably good: 2165
+junkExonsLeft: 6173
+junkExonsRight: 6005
+junkExonsBoth: 19
+intronOnGap: 2813
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 90000bp long.
+
+oneExon: 2656968
+allSmallExons: 1717745
+good: 5703
+probably good: 1682
+junkExonsLeft: 5981
+junkExonsRight: 5792
+junkExonsBoth: 17
+intronOnGap: 2555
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 100000bp long.
+
+oneExon: 2656968
+allSmallExons: 1719978
+good: 4589
+probably good: 1405
+junkExonsLeft: 5785
+junkExonsRight: 5653
+junkExonsBoth: 17
+intronOnGap: 2048
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 110000bp long.
+
+oneExon: 2656968
+allSmallExons: 1721713
+good: 3601
+probably good: 1147
+junkExonsLeft: 5655
+junkExonsRight: 5492
+junkExonsBoth: 17
+intronOnGap: 1850
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 120000bp long.
+
+oneExon: 2656968
+allSmallExons: 1722934
+good: 2983
+probably good: 867
+junkExonsLeft: 5566
+junkExonsRight: 5353
+junkExonsBoth: 17
+intronOnGap: 1755
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 130000bp long.
+
+oneExon: 2656968
+allSmallExons: 1723896
+good: 2464
+probably good: 751
+junkExonsLeft: 5463
+junkExonsRight: 5268
+junkExonsBoth: 17
+intronOnGap: 1616
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 140000bp long.
+
+oneExon: 2656968
+allSmallExons: 1724711
+good: 2136
+probably good: 691
+junkExonsLeft: 5202
+junkExonsRight: 5192
+junkExonsBoth: 17
+intronOnGap: 1526
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 150000bp long.
+
+oneExon: 2656968
+allSmallExons: 1725290
+good: 1914
+probably good: 621
+junkExonsLeft: 5135
+junkExonsRight: 5123
+junkExonsBoth: 17
+intronOnGap: 1375
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 160000bp long.
+
+oneExon: 2656968
+allSmallExons: 1726033
+good: 1550
+probably good: 512
+junkExonsLeft: 5064
+junkExonsRight: 5062
+junkExonsBoth: 17
+intronOnGap: 1237
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 170000bp long.
+
+oneExon: 2656968
+allSmallExons: 1726502
+good: 1373
+probably good: 449
+junkExonsLeft: 5009
+junkExonsRight: 4997
+junkExonsBoth: 17
+intronOnGap: 1128
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 180000bp long.
+
+oneExon: 2656968
+allSmallExons: 1726957
+good: 1195
+probably good: 410
+junkExonsLeft: 4957
+junkExonsRight: 4918
+junkExonsBoth: 17
+intronOnGap: 1021
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 190000bp long.
+
+oneExon: 2656968
+allSmallExons: 1727281
+good: 1087
+probably good: 375
+junkExonsLeft: 4912
+junkExonsRight: 4854
+junkExonsBoth: 17
+intronOnGap: 949
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 200000bp long.
+
+oneExon: 2656968
+allSmallExons: 1727565
+good: 990
+probably good: 363
+junkExonsLeft: 4860
+junkExonsRight: 4792
+junkExonsBoth: 17
+intronOnGap: 888
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 210000bp long.
+
+oneExon: 2656968
+allSmallExons: 1727799
+good: 952
+probably good: 340
+junkExonsLeft: 4822
+junkExonsRight: 4748
+junkExonsBoth: 17
+intronOnGap: 797
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 220000bp long.
+
+oneExon: 2656968
+allSmallExons: 1727999
+good: 890
+probably good: 329
+junkExonsLeft: 4777
+junkExonsRight: 4706
+junkExonsBoth: 17
+intronOnGap: 757
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 230000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728222
+good: 807
+probably good: 316
+junkExonsLeft: 4733
+junkExonsRight: 4673
+junkExonsBoth: 17
+intronOnGap: 707
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 240000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728381
+good: 751
+probably good: 309
+junkExonsLeft: 4695
+junkExonsRight: 4640
+junkExonsBoth: 17
+intronOnGap: 682
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 250000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728559
+good: 713
+probably good: 296
+junkExonsLeft: 4653
+junkExonsRight: 4608
+junkExonsBoth: 17
+intronOnGap: 629
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 260000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728687
+good: 681
+probably good: 289
+junkExonsLeft: 4624
+junkExonsRight: 4577
+junkExonsBoth: 17
+intronOnGap: 600
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 270000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728816
+good: 654
+probably good: 277
+junkExonsLeft: 4588
+junkExonsRight: 4547
+junkExonsBoth: 17
+intronOnGap: 576
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 280000bp long.
+
+oneExon: 2656968
+allSmallExons: 1728958
+good: 627
+probably good: 266
+junkExonsLeft: 4547
+junkExonsRight: 4509
+junkExonsBoth: 17
+intronOnGap: 551
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 290000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729115
+good: 569
+probably good: 253
+junkExonsLeft: 4504
+junkExonsRight: 4483
+junkExonsBoth: 17
+intronOnGap: 534
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 300000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729229
+good: 547
+probably good: 248
+junkExonsLeft: 4469
+junkExonsRight: 4452
+junkExonsBoth: 17
+intronOnGap: 513
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 310000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729412
+good: 511
+probably good: 222
+junkExonsLeft: 4433
+junkExonsRight: 4392
+junkExonsBoth: 17
+intronOnGap: 488
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 320000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729498
+good: 497
+probably good: 218
+junkExonsLeft: 4393
+junkExonsRight: 4371
+junkExonsBoth: 16
+intronOnGap: 482
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 330000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729639
+good: 458
+probably good: 206
+junkExonsLeft: 4368
+junkExonsRight: 4351
+junkExonsBoth: 16
+intronOnGap: 437
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 340000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729757
+good: 449
+probably good: 205
+junkExonsLeft: 4329
+junkExonsRight: 4330
+junkExonsBoth: 16
+intronOnGap: 389
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 350000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729832
+good: 442
+probably good: 203
+junkExonsLeft: 4302
+junkExonsRight: 4300
+junkExonsBoth: 16
+intronOnGap: 380
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 360000bp long.
+
+oneExon: 2656968
+allSmallExons: 1729923
+good: 431
+probably good: 199
+junkExonsLeft: 4273
+junkExonsRight: 4270
+junkExonsBoth: 16
+intronOnGap: 363
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 370000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730017
+good: 412
+probably good: 198
+junkExonsLeft: 4247
+junkExonsRight: 4239
+junkExonsBoth: 15
+intronOnGap: 347
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 380000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730084
+good: 406
+probably good: 194
+junkExonsLeft: 4218
+junkExonsRight: 4217
+junkExonsBoth: 15
+intronOnGap: 341
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 390000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730168
+good: 401
+probably good: 191
+junkExonsLeft: 4187
+junkExonsRight: 4190
+junkExonsBoth: 15
+intronOnGap: 323
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 400000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730268
+good: 379
+probably good: 172
+junkExonsLeft: 4158
+junkExonsRight: 4162
+junkExonsBoth: 15
+intronOnGap: 321
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 410000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730331
+good: 366
+probably good: 171
+junkExonsLeft: 4142
+junkExonsRight: 4137
+junkExonsBoth: 15
+intronOnGap: 313
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 420000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730389
+good: 364
+probably good: 168
+junkExonsLeft: 4116
+junkExonsRight: 4116
+junkExonsBoth: 15
+intronOnGap: 307
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 430000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730448
+good: 361
+probably good: 168
+junkExonsLeft: 4094
+junkExonsRight: 4089
+junkExonsBoth: 15
+intronOnGap: 300
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 440000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730503
+good: 358
+probably good: 167
+junkExonsLeft: 4076
+junkExonsRight: 4064
+junkExonsBoth: 15
+intronOnGap: 292
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 450000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730551
+good: 355
+probably good: 167
+junkExonsLeft: 4055
+junkExonsRight: 4043
+junkExonsBoth: 15
+intronOnGap: 289
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 460000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730605
+good: 351
+probably good: 166
+junkExonsLeft: 4038
+junkExonsRight: 4014
+junkExonsBoth: 15
+intronOnGap: 286
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 470000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730666
+good: 349
+probably good: 161
+junkExonsLeft: 4015
+junkExonsRight: 3992
+junkExonsBoth: 15
+intronOnGap: 277
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 480000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730721
+good: 348
+probably good: 159
+junkExonsLeft: 3987
+junkExonsRight: 3973
+junkExonsBoth: 15
+intronOnGap: 272
+total: 4396443
+REALLY QUIET MODE ENABLED -- NO matches are output!
+A big intron is one that is at least 490000bp long.
+
+oneExon: 2656968
+allSmallExons: 1730792
+good: 343
+probably good: 156
+junkExonsLeft: 3959
+junkExonsRight: 3943
+junkExonsBoth: 15
+intronOnGap: 267
+total: 4396443
+THRESHOLD = 10000
+THRESHOLD = 20000
+THRESHOLD = 30000
+THRESHOLD = 40000
+THRESHOLD = 50000
+THRESHOLD = 60000
+THRESHOLD = 70000
+THRESHOLD = 80000
+THRESHOLD = 90000
+THRESHOLD = 100000
+THRESHOLD = 110000
+THRESHOLD = 120000
+THRESHOLD = 130000
+THRESHOLD = 140000
+THRESHOLD = 150000
+THRESHOLD = 160000
+THRESHOLD = 170000
+THRESHOLD = 180000
+THRESHOLD = 190000
+THRESHOLD = 200000
+THRESHOLD = 210000
+THRESHOLD = 220000
+THRESHOLD = 230000
+THRESHOLD = 240000
+THRESHOLD = 250000
+THRESHOLD = 260000
+THRESHOLD = 270000
+THRESHOLD = 280000
+THRESHOLD = 290000
+THRESHOLD = 300000
+THRESHOLD = 310000
+THRESHOLD = 320000
+THRESHOLD = 330000
+THRESHOLD = 340000
+THRESHOLD = 350000
+THRESHOLD = 360000
+THRESHOLD = 370000
+THRESHOLD = 380000
+THRESHOLD = 390000
+THRESHOLD = 400000
+THRESHOLD = 410000
+THRESHOLD = 420000
+THRESHOLD = 430000
+THRESHOLD = 440000
+THRESHOLD = 450000
+THRESHOLD = 460000
+THRESHOLD = 470000
+THRESHOLD = 480000
+THRESHOLD = 490000
diff --git a/sim4dbutils/cleanPolishes-experiments/evalThresh.ps b/sim4dbutils/cleanPolishes-experiments/evalThresh.ps
new file mode 100644
index 0000000..6efbb7f
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/evalThresh.ps
@@ -0,0 +1,1217 @@
+%!PS-Adobe-2.0
+%%Title: evalThresh.ps
+%%Creator: gnuplot 3.7 patchlevel 0
+%%CreationDate: Fri Jun 28 11:19:16 2002
+%%DocumentFonts: (atend)
+%%BoundingBox: 50 50 554 770
+%%Orientation: Landscape
+%%Pages: (atend)
+%%EndComments
+/gnudict 256 dict def
+gnudict begin
+/Color true def
+/Solid false def
+/gnulinewidth 5.000 def
+/userlinewidth gnulinewidth def
+/vshift -46 def
+/dl {10 mul} def
+/hpt_ 31.5 def
+/vpt_ 31.5 def
+/hpt hpt_ def
+/vpt vpt_ def
+/M {moveto} bind def
+/L {lineto} bind def
+/R {rmoveto} bind def
+/V {rlineto} bind def
+/vpt2 vpt 2 mul def
+/hpt2 hpt 2 mul def
+/Lshow { currentpoint stroke M
+ 0 vshift R show } def
+/Rshow { currentpoint stroke M
+ dup stringwidth pop neg vshift R show } def
+/Cshow { currentpoint stroke M
+ dup stringwidth pop -2 div vshift R show } def
+/UP { dup vpt_ mul /vpt exch def hpt_ mul /hpt exch def
+ /hpt2 hpt 2 mul def /vpt2 vpt 2 mul def } def
+/DL { Color {setrgbcolor Solid {pop []} if 0 setdash }
+ {pop pop pop Solid {pop []} if 0 setdash} ifelse } def
+/BL { stroke gnulinewidth 2 mul setlinewidth } def
+/AL { stroke gnulinewidth 2 div setlinewidth } def
+/UL { gnulinewidth mul /userlinewidth exch def } def
+/PL { stroke userlinewidth setlinewidth } def
+/LTb { BL [] 0 0 0 DL } def
+/LTa { AL [1 dl 2 dl] 0 setdash 0 0 0 setrgbcolor } def
+/LT0 { PL [] 1 0 0 DL } def
+/LT1 { PL [4 dl 2 dl] 0 1 0 DL } def
+/LT2 { PL [2 dl 3 dl] 0 0 1 DL } def
+/LT3 { PL [1 dl 1.5 dl] 1 0 1 DL } def
+/LT4 { PL [5 dl 2 dl 1 dl 2 dl] 0 1 1 DL } def
+/LT5 { PL [4 dl 3 dl 1 dl 3 dl] 1 1 0 DL } def
+/LT6 { PL [2 dl 2 dl 2 dl 4 dl] 0 0 0 DL } def
+/LT7 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 1 0.3 0 DL } def
+/LT8 { PL [2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 2 dl 4 dl] 0.5 0.5 0.5 DL } def
+/Pnt { stroke [] 0 setdash
+ gsave 1 setlinecap M 0 0 V stroke grestore } def
+/Dia { stroke [] 0 setdash 2 copy vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke
+ Pnt } def
+/Pls { stroke [] 0 setdash vpt sub M 0 vpt2 V
+ currentpoint stroke M
+ hpt neg vpt neg R hpt2 0 V stroke
+ } def
+/Box { stroke [] 0 setdash 2 copy exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke
+ Pnt } def
+/Crs { stroke [] 0 setdash exch hpt sub exch vpt add M
+ hpt2 vpt2 neg V currentpoint stroke M
+ hpt2 neg 0 R hpt2 vpt2 V stroke } def
+/TriU { stroke [] 0 setdash 2 copy vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke
+ Pnt } def
+/Star { 2 copy Pls Crs } def
+/BoxF { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath fill } def
+/TriUF { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath fill } def
+/TriD { stroke [] 0 setdash 2 copy vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke
+ Pnt } def
+/TriDF { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath fill} def
+/DiaF { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath fill } def
+/Pent { stroke [] 0 setdash 2 copy gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore Pnt } def
+/PentF { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath fill grestore } def
+/Circle { stroke [] 0 setdash 2 copy
+ hpt 0 360 arc stroke Pnt } def
+/CircleF { stroke [] 0 setdash hpt 0 360 arc fill } def
+/C0 { BL [] 0 setdash 2 copy moveto vpt 90 450 arc } bind def
+/C1 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C2 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C3 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C4 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C5 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc
+ 2 copy moveto
+ 2 copy vpt 180 270 arc closepath fill
+ vpt 0 360 arc } bind def
+/C6 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C7 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 270 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C8 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C9 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 270 450 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C10 { BL [] 0 setdash 2 copy 2 copy moveto vpt 270 360 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 90 180 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C11 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 180 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 270 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C12 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C13 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 0 90 arc closepath fill
+ 2 copy moveto
+ 2 copy vpt 180 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/C14 { BL [] 0 setdash 2 copy moveto
+ 2 copy vpt 90 360 arc closepath fill
+ vpt 0 360 arc } bind def
+/C15 { BL [] 0 setdash 2 copy vpt 0 360 arc closepath fill
+ vpt 0 360 arc closepath } bind def
+/Rec { newpath 4 2 roll moveto 1 index 0 rlineto 0 exch rlineto
+ neg 0 rlineto closepath } bind def
+/Square { dup Rec } bind def
+/Bsquare { vpt sub exch vpt sub exch vpt2 Square } bind def
+/S0 { BL [] 0 setdash 2 copy moveto 0 vpt rlineto BL Bsquare } bind def
+/S1 { BL [] 0 setdash 2 copy vpt Square fill Bsquare } bind def
+/S2 { BL [] 0 setdash 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S3 { BL [] 0 setdash 2 copy exch vpt sub exch vpt2 vpt Rec fill Bsquare } bind def
+/S4 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S5 { BL [] 0 setdash 2 copy 2 copy vpt Square fill
+ exch vpt sub exch vpt sub vpt Square fill Bsquare } bind def
+/S6 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S7 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt vpt2 Rec fill
+ 2 copy vpt Square fill
+ Bsquare } bind def
+/S8 { BL [] 0 setdash 2 copy vpt sub vpt Square fill Bsquare } bind def
+/S9 { BL [] 0 setdash 2 copy vpt sub vpt vpt2 Rec fill Bsquare } bind def
+/S10 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt Square fill
+ Bsquare } bind def
+/S11 { BL [] 0 setdash 2 copy vpt sub vpt Square fill 2 copy exch vpt sub exch vpt2 vpt Rec fill
+ Bsquare } bind def
+/S12 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill Bsquare } bind def
+/S13 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy vpt Square fill Bsquare } bind def
+/S14 { BL [] 0 setdash 2 copy exch vpt sub exch vpt sub vpt2 vpt Rec fill
+ 2 copy exch vpt sub exch vpt Square fill Bsquare } bind def
+/S15 { BL [] 0 setdash 2 copy Bsquare fill Bsquare } bind def
+/D0 { gsave translate 45 rotate 0 0 S0 stroke grestore } bind def
+/D1 { gsave translate 45 rotate 0 0 S1 stroke grestore } bind def
+/D2 { gsave translate 45 rotate 0 0 S2 stroke grestore } bind def
+/D3 { gsave translate 45 rotate 0 0 S3 stroke grestore } bind def
+/D4 { gsave translate 45 rotate 0 0 S4 stroke grestore } bind def
+/D5 { gsave translate 45 rotate 0 0 S5 stroke grestore } bind def
+/D6 { gsave translate 45 rotate 0 0 S6 stroke grestore } bind def
+/D7 { gsave translate 45 rotate 0 0 S7 stroke grestore } bind def
+/D8 { gsave translate 45 rotate 0 0 S8 stroke grestore } bind def
+/D9 { gsave translate 45 rotate 0 0 S9 stroke grestore } bind def
+/D10 { gsave translate 45 rotate 0 0 S10 stroke grestore } bind def
+/D11 { gsave translate 45 rotate 0 0 S11 stroke grestore } bind def
+/D12 { gsave translate 45 rotate 0 0 S12 stroke grestore } bind def
+/D13 { gsave translate 45 rotate 0 0 S13 stroke grestore } bind def
+/D14 { gsave translate 45 rotate 0 0 S14 stroke grestore } bind def
+/D15 { gsave translate 45 rotate 0 0 S15 stroke grestore } bind def
+/DiaE { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V closepath stroke } def
+/BoxE { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V closepath stroke } def
+/TriUE { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V closepath stroke } def
+/TriDE { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V closepath stroke } def
+/PentE { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ closepath stroke grestore } def
+/CircE { stroke [] 0 setdash
+ hpt 0 360 arc stroke } def
+/Opaque { gsave closepath 1 setgray fill grestore 0 setgray closepath } def
+/DiaW { stroke [] 0 setdash vpt add M
+ hpt neg vpt neg V hpt vpt neg V
+ hpt vpt V hpt neg vpt V Opaque stroke } def
+/BoxW { stroke [] 0 setdash exch hpt sub exch vpt add M
+ 0 vpt2 neg V hpt2 0 V 0 vpt2 V
+ hpt2 neg 0 V Opaque stroke } def
+/TriUW { stroke [] 0 setdash vpt 1.12 mul add M
+ hpt neg vpt -1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt 1.62 mul V Opaque stroke } def
+/TriDW { stroke [] 0 setdash vpt 1.12 mul sub M
+ hpt neg vpt 1.62 mul V
+ hpt 2 mul 0 V
+ hpt neg vpt -1.62 mul V Opaque stroke } def
+/PentW { stroke [] 0 setdash gsave
+ translate 0 hpt M 4 {72 rotate 0 hpt L} repeat
+ Opaque stroke grestore } def
+/CircW { stroke [] 0 setdash
+ hpt 0 360 arc Opaque stroke } def
+/BoxFill { gsave Rec 1 setgray fill grestore } def
+end
+%%EndProlog
+%%Page: 1 1
+gnudict begin
+gsave
+50 50 translate
+0.100 0.100 scale
+90 rotate
+0 -5040 translate
+0 setgray
+newpath
+(Helvetica) findfont 140 scalefont setfont
+1.000 UL
+LTb
+742 280 M
+63 0 V
+6157 0 R
+-63 0 V
+658 280 M
+(0) Rshow
+742 936 M
+63 0 V
+6157 0 R
+-63 0 V
+658 936 M
+(20000) Rshow
+742 1592 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(40000) Rshow
+742 2248 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(60000) Rshow
+742 2904 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(80000) Rshow
+742 3560 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(100000) Rshow
+742 4216 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(120000) Rshow
+742 4872 M
+63 0 V
+6157 0 R
+-63 0 V
+-6241 0 R
+(140000) Rshow
+742 280 M
+0 63 V
+0 4529 R
+0 -63 V
+742 140 M
+(0) Cshow
+1364 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(5) Cshow
+1986 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(10) Cshow
+2608 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(15) Cshow
+3230 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(20) Cshow
+3852 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(25) Cshow
+4474 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(30) Cshow
+5096 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(35) Cshow
+5718 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(40) Cshow
+6340 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(45) Cshow
+6962 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(50) Cshow
+1.000 UL
+LTb
+742 280 M
+6220 0 V
+0 4592 V
+-6220 0 V
+742 280 L
+1.000 UL
+LT0
+6311 4739 M
+(allSmallExons/100) Rshow
+6395 4739 M
+399 0 V
+742 780 M
+124 36 V
+125 12 V
+124 7 V
+125 3 V
+124 2 V
+124 1 V
+125 1 V
+124 1 V
+125 1 V
+124 1 V
+124 0 V
+125 0 V
+124 1 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 1 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 1 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+1.000 UL
+LT1
+6311 4599 M
+(good) Rshow
+6395 4599 M
+399 0 V
+742 4399 M
+866 2106 L
+991 1420 L
+124 -387 V
+1240 834 L
+1364 694 L
+1488 589 L
+125 -67 V
+124 -55 V
+125 -36 V
+124 -33 V
+124 -20 V
+125 -17 V
+124 -11 V
+125 -7 V
+124 -12 V
+124 -6 V
+125 -6 V
+124 -3 V
+125 -4 V
+124 -1 V
+124 -2 V
+125 -3 V
+124 -1 V
+125 -2 V
+124 -1 V
+124 -1 V
+125 0 V
+124 -2 V
+125 -1 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 0 V
+125 -1 V
+124 0 V
+124 0 V
+125 -1 V
+124 0 V
+125 -1 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 -1 V
+125 0 V
+124 0 V
+1.000 UL
+LT2
+6311 4459 M
+(probably good) Rshow
+6395 4459 M
+399 0 V
+742 1470 M
+866 801 L
+991 595 L
+1115 479 L
+125 -52 V
+124 -35 V
+124 -27 V
+125 -14 V
+124 -16 V
+125 -9 V
+124 -8 V
+124 -10 V
+125 -3 V
+124 -2 V
+125 -3 V
+124 -3 V
+124 -2 V
+125 -2 V
+124 -1 V
+125 0 V
+124 -1 V
+124 0 V
+125 -1 V
+124 0 V
+125 0 V
+124 -1 V
+124 0 V
+125 0 V
+124 -1 V
+125 0 V
+124 -1 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 -1 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 -1 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+1.000 UL
+LT3
+6311 4319 M
+(junkExonsLeft) Rshow
+6395 4319 M
+399 0 V
+742 919 M
+866 661 L
+991 588 L
+124 -53 V
+125 -22 V
+124 -12 V
+124 -10 V
+125 -9 V
+124 -6 V
+125 -6 V
+124 -5 V
+124 -2 V
+125 -4 V
+124 -8 V
+125 -3 V
+124 -2 V
+124 -2 V
+125 -1 V
+124 -2 V
+125 -2 V
+124 -1 V
+124 -1 V
+125 -2 V
+124 -1 V
+125 -1 V
+124 -1 V
+124 -2 V
+125 -1 V
+124 -1 V
+125 -1 V
+124 -2 V
+124 -1 V
+125 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+125 -1 V
+124 0 V
+124 -1 V
+125 -1 V
+124 0 V
+125 -1 V
+124 -1 V
+124 0 V
+125 -1 V
+124 -1 V
+1.000 UL
+LT4
+6311 4179 M
+(junkExonsRight) Rshow
+6395 4179 M
+399 0 V
+742 897 M
+866 627 L
+991 565 L
+124 -40 V
+125 -15 V
+124 -15 V
+124 -9 V
+125 -9 V
+124 -7 V
+125 -5 V
+124 -5 V
+124 -4 V
+125 -3 V
+124 -3 V
+125 -2 V
+124 -2 V
+124 -2 V
+125 -3 V
+124 -2 V
+125 -2 V
+124 -1 V
+124 -2 V
+125 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+125 -1 V
+124 -2 V
+124 -1 V
+125 0 V
+124 -1 V
+125 -1 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+125 0 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+125 0 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 -1 V
+1.000 UL
+LT5
+6311 4039 M
+(junkExonsBoth) Rshow
+6395 4039 M
+399 0 V
+742 281 M
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 -1 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+125 0 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+1.000 UL
+LT6
+6311 3899 M
+(intronOnGap) Rshow
+6395 3899 M
+399 0 V
+742 775 M
+866 636 L
+991 529 L
+1115 421 L
+125 -33 V
+124 -5 V
+124 -4 V
+125 -7 V
+124 -8 V
+125 -17 V
+124 -6 V
+124 -3 V
+125 -5 V
+124 -3 V
+125 -5 V
+124 -4 V
+124 -4 V
+125 -4 V
+124 -2 V
+125 -2 V
+124 -3 V
+124 -1 V
+125 -2 V
+124 -1 V
+125 -1 V
+124 -1 V
+124 -1 V
+125 -1 V
+124 0 V
+125 -1 V
+124 -1 V
+124 0 V
+125 -2 V
+124 -1 V
+125 -1 V
+124 0 V
+124 -1 V
+125 0 V
+124 0 V
+125 0 V
+124 -1 V
+124 0 V
+125 0 V
+124 0 V
+125 -1 V
+124 0 V
+124 0 V
+125 0 V
+124 0 V
+stroke
+grestore
+end
+showpage
+%%Page: 2 2
+gnudict begin
+gsave
+50 50 translate
+0.100 0.100 scale
+90 rotate
+0 -5040 translate
+0 setgray
+newpath
+(Helvetica) findfont 140 scalefont setfont
+1.000 UL
+LTb
+658 280 M
+63 0 V
+6241 0 R
+-63 0 V
+574 280 M
+(0) Rshow
+658 1198 M
+63 0 V
+6241 0 R
+-63 0 V
+-6325 0 R
+(2000) Rshow
+658 2117 M
+63 0 V
+6241 0 R
+-63 0 V
+-6325 0 R
+(4000) Rshow
+658 3035 M
+63 0 V
+6241 0 R
+-63 0 V
+-6325 0 R
+(6000) Rshow
+658 3954 M
+63 0 V
+6241 0 R
+-63 0 V
+-6325 0 R
+(8000) Rshow
+658 4872 M
+63 0 V
+6241 0 R
+-63 0 V
+-6325 0 R
+(10000) Rshow
+658 280 M
+0 63 V
+0 4529 R
+0 -63 V
+658 140 M
+(0) Cshow
+1288 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(5) Cshow
+1919 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(10) Cshow
+2549 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(15) Cshow
+3180 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(20) Cshow
+3810 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(25) Cshow
+4440 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(30) Cshow
+5071 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(35) Cshow
+5701 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(40) Cshow
+6332 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(45) Cshow
+6962 280 M
+0 63 V
+0 4529 R
+0 -63 V
+0 -4669 R
+(50) Cshow
+1.000 UL
+LTb
+658 280 M
+6304 0 V
+0 4592 V
+-6304 0 V
+658 280 L
+1.000 UL
+LT0
+6311 4739 M
+(allSmallExons/100) Rshow
+6395 4739 M
+399 0 V
+1.000 UL
+LT1
+6311 4599 M
+(good) Rshow
+6395 4599 M
+399 0 V
+1391 4872 M
+23 -272 V
+127 -927 V
+126 -774 V
+126 -512 V
+126 -453 V
+126 -284 V
+126 -239 V
+126 -150 V
+126 -102 V
+2549 992 L
+126 -82 V
+126 -81 V
+126 -50 V
+127 -44 V
+126 -18 V
+126 -28 V
+126 -38 V
+126 -26 V
+126 -18 V
+126 -14 V
+126 -13 V
+126 -12 V
+126 -27 V
+126 -10 V
+126 -16 V
+126 -7 V
+127 -18 V
+126 -4 V
+126 -3 V
+126 -5 V
+126 -9 V
+126 -3 V
+126 -2 V
+126 -10 V
+126 -6 V
+126 -1 V
+126 -1 V
+126 -2 V
+127 -1 V
+126 -2 V
+126 -1 V
+126 0 V
+126 -2 V
+1.000 UL
+LT2
+6311 4459 M
+(probably good) Rshow
+6395 4459 M
+399 0 V
+902 4872 M
+8 -182 V
+1036 3063 L
+126 -721 V
+126 -490 V
+126 -378 V
+127 -200 V
+126 -222 V
+1793 925 L
+1919 807 L
+2045 678 L
+126 -53 V
+126 -28 V
+126 -32 V
+126 -50 V
+126 -29 V
+126 -18 V
+126 -16 V
+127 -5 V
+126 -11 V
+126 -5 V
+126 -6 V
+126 -3 V
+126 -6 V
+126 -3 V
+126 -6 V
+126 -5 V
+126 -6 V
+126 -2 V
+126 -12 V
+126 -2 V
+127 -5 V
+126 -1 V
+126 -1 V
+126 -2 V
+126 0 V
+126 -2 V
+126 -1 V
+126 -9 V
+126 0 V
+126 -2 V
+126 0 V
+126 0 V
+127 0 V
+126 -1 V
+126 -2 V
+126 -1 V
+126 -1 V
+1.000 UL
+LT3
+6311 4319 M
+(junkExonsLeft) Rshow
+6395 4319 M
+399 0 V
+875 4872 M
+35 -283 V
+126 -745 V
+126 -306 V
+126 -169 V
+126 -132 V
+127 -122 V
+126 -89 V
+126 -90 V
+126 -59 V
+126 -41 V
+126 -47 V
+126 -120 V
+126 -31 V
+126 -33 V
+126 -25 V
+126 -24 V
+126 -20 V
+127 -24 V
+126 -18 V
+126 -20 V
+126 -21 V
+126 -17 V
+126 -19 V
+126 -14 V
+126 -16 V
+126 -19 V
+126 -20 V
+126 -16 V
+126 -16 V
+126 -19 V
+127 -11 V
+126 -18 V
+126 -13 V
+126 -13 V
+126 -12 V
+126 -13 V
+126 -14 V
+126 -14 V
+126 -7 V
+126 -12 V
+126 -10 V
+126 -8 V
+127 -10 V
+126 -8 V
+126 -10 V
+126 -13 V
+126 -13 V
+1.000 UL
+LT4
+6311 4179 M
+(junkExonsRight) Rshow
+6395 4179 M
+399 0 V
+822 4872 M
+88 -608 V
+126 -548 V
+126 -222 V
+126 -209 V
+126 -127 V
+127 -121 V
+126 -97 V
+126 -64 V
+126 -74 V
+126 -64 V
+126 -39 V
+126 -35 V
+126 -32 V
+126 -28 V
+126 -29 V
+126 -37 V
+126 -29 V
+127 -29 V
+126 -20 V
+126 -19 V
+126 -15 V
+126 -15 V
+126 -15 V
+126 -14 V
+126 -14 V
+126 -17 V
+126 -12 V
+126 -15 V
+126 -27 V
+126 -10 V
+127 -9 V
+126 -10 V
+126 -13 V
+126 -14 V
+126 -14 V
+126 -11 V
+126 -12 V
+126 -13 V
+126 -11 V
+126 -10 V
+126 -12 V
+126 -12 V
+127 -9 V
+126 -14 V
+126 -10 V
+126 -9 V
+126 -13 V
+1.000 UL
+LT5
+6311 4039 M
+(junkExonsBoth) Rshow
+6395 4039 M
+399 0 V
+658 293 M
+126 -3 V
+126 -1 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+127 0 V
+126 -1 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+127 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 -1 V
+127 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+127 0 V
+126 0 V
+126 0 V
+126 0 V
+126 0 V
+1.000 UL
+LT6
+6311 3899 M
+(intronOnGap) Rshow
+6395 3899 M
+399 0 V
+817 4872 M
+910 3771 L
+1036 2253 L
+126 -458 V
+126 -68 V
+126 -64 V
+127 -91 V
+126 -119 V
+126 -233 V
+126 -90 V
+126 -44 V
+126 -64 V
+126 -41 V
+126 -70 V
+126 -63 V
+126 -50 V
+126 -49 V
+126 -33 V
+127 -28 V
+126 -42 V
+126 -18 V
+126 -23 V
+126 -12 V
+126 -24 V
+126 -13 V
+126 -12 V
+126 -11 V
+126 -8 V
+126 -9 V
+126 -12 V
+126 -3 V
+127 -20 V
+126 -22 V
+126 -5 V
+126 -7 V
+126 -8 V
+126 -2 V
+126 -9 V
+126 -1 V
+126 -3 V
+126 -3 V
+126 -3 V
+126 -4 V
+127 -1 V
+126 -2 V
+126 -4 V
+126 -2 V
+126 -2 V
+stroke
+grestore
+end
+showpage
+%%Trailer
+%%DocumentFonts: Helvetica
+%%Pages: 2
diff --git a/sim4dbutils/cleanPolishes-experiments/intronstats.pl b/sim4dbutils/cleanPolishes-experiments/intronstats.pl
new file mode 100644
index 0000000..89aa18b
--- /dev/null
+++ b/sim4dbutils/cleanPolishes-experiments/intronstats.pl
@@ -0,0 +1,137 @@
+#!/usr/local/bin/perl
+#
+#  Reads polishes from STDIN with libBri::readPolish and reports on every intron
+#  whose gap (GENOMICstart of an exon minus GENOMICend of the previous exon) is
+#  larger than 499999.  Such an intron is tallied by position (only, first, last
+#  or interior intron of the polish) and, except for interior introns, by whether
+#  an exon next to it has cDNAend - cDNAstart below 50 ("sma") or not ("big").
+#  The raw polish is copied to one of four files in the current directory, and a
+#  line of running totals is printed to STDOUT for each such intron.
+
+$| = 1;
+
+use strict;
+
+use FindBin;
+use lib "/home/walenzbp/projects/scripts";
+use libBri;
+
+my $tot = 0;   #  polishes read (one per readPolish call)
+my $sma = 0;   #  big introns next to a short exon
+my $big = 0;   #  big first/last/only introns not next to a short exon
+
+my $smafirst = 0;
+my $smalast = 0;
+my $bigfirst = 0;
+my $biglast = 0;
+
+my $smaoneintronF = 0;   #  single intron, only the first exon is short
+my $smaoneintronL = 0;   #  single intron, only the last exon is short
+my $smaoneintronB = 0;   #  single intron, both exons are short
+my $bigoneintron = 0;
+
+my $interiorintron = 0;
+
+my $ff=0;   #  smafirst cases with matchOrientation "forward"
+my $fc=0;   #  smafirst cases with any other matchOrientation
+my $lf=0;   #  smalast cases with matchOrientation "forward"
+my $lc=0;   #  smalast cases with any other matchOrientation
+
+open(SMA, "> sma-exon-after-big-intron") or die "Failed to open 'sma-exon-after-big-intron' for writing: $!\n";
+open(BIG, "> big-exon-after-big-intron") or die "Failed to open 'big-exon-after-big-intron' for writing: $!\n";
+open(SMAO, "> sma-exon-after-big-oneintron") or die "Failed to open 'sma-exon-after-big-oneintron' for writing: $!\n";
+open(BIGO, "> big-exon-after-big-oneintron") or die "Failed to open 'big-exon-after-big-oneintron' for writing: $!\n";
+
+while (!eof(STDIN)) {
+  $tot++;
+
+  my %p = &libBri::readPolish(*STDIN);
+  my $exonsLen = scalar(@{$p{'exons'}});
+  my $firstintron = 1;
+
+  if ($exonsLen > 1) {
+    my @exons = @{$p{'exons'}};
+
+    my $lastC = shift @exons;
+
+    #  Walk adjacent exon pairs; $exonsLen > 1 guarantees at least one pair.
+    while (scalar(@exons) > 0) {
+      my $thisC = shift @exons;
+      my $gap = $thisC->{'GENOMICstart'} - $lastC->{'GENOMICend'};
+
+      if ($gap > 499999) {
+        if (($firstintron) && (scalar(@exons) == 0)) {
+          #  Exactly one intron: look at the exons on both sides of it.
+          if ((($lastC->{'cDNAend'} - $lastC->{'cDNAstart'}) < 50) &&
+              (($thisC->{'cDNAend'} - $thisC->{'cDNAstart'}) < 50)) {
+            $sma++;
+            $smaoneintronB++;
+            print SMAO $p{'raw'};
+          } elsif (($lastC->{'cDNAend'} - $lastC->{'cDNAstart'}) < 50) {
+            $sma++;
+            $smaoneintronF++;
+            print SMAO $p{'raw'};
+          } elsif (($thisC->{'cDNAend'} - $thisC->{'cDNAstart'}) < 50) {
+            $sma++;
+            $smaoneintronL++;
+            print SMAO $p{'raw'};
+          } else {
+            $big++;
+            $bigoneintron++;
+            print BIGO $p{'raw'};
+          }
+        } elsif ($firstintron) {
+          #  First intron (more follow): look at the exon before it.
+          if (($lastC->{'cDNAend'} - $lastC->{'cDNAstart'}) < 50) {
+            $sma++;
+            $smafirst++;
+            print SMA $p{'raw'};
+            if ($p{'matchOrientation'} eq "forward") {
+              $ff++;
+            } else {
+              $fc++;
+            }
+          } else {
+            $big++;
+            $bigfirst++;
+            print BIG $p{'raw'};
+          }
+        } elsif (scalar(@exons) == 0) {
+          #  Last intron: look at the exon after it.
+          if (($thisC->{'cDNAend'} - $thisC->{'cDNAstart'}) < 50) {
+            $sma++;
+            $smalast++;
+            print SMA $p{'raw'};
+            if ($p{'matchOrientation'} eq "forward") {
+              $lf++;
+            } else {
+              $lc++;
+            }
+          } else {
+            $big++;
+            $biglast++;
+            print BIG $p{'raw'};
+          }
+        } else {
+          #  Interior intron: counted, but the polish is not saved anywhere.
+          $interiorintron++;
+        }
+
+        print "int: $interiorintron sma: $sma(First:$smafirst,Last:$smalast,";
+        print "oneF:$smaoneintronF,oneL:$smaoneintronL,oneB:$smaoneintronB) -- big ";
+        print "$big(First:$bigfirst,Last:$biglast,One:$bigoneintron) -- tot $tot -- ";
+        print "ff=$ff,fc=$fc lf=$lf,lc=$lc\n";
+      }
+
+      $firstintron = 0;
+
+      $lastC = $thisC;
+    }
+  }
+
+}
+
+close(SMA) or die "Failed to close 'sma-exon-after-big-intron': $!\n";
+close(BIG) or die "Failed to close 'big-exon-after-big-intron': $!\n";
+close(SMAO) or die "Failed to close 'sma-exon-after-big-oneintron': $!\n";
+close(BIGO) or die "Failed to close 'big-exon-after-big-oneintron': $!\n";
diff --git a/sim4dbutils/cleanPolishes.C b/sim4dbutils/cleanPolishes.C
new file mode 100644
index 0000000..988178b
--- /dev/null
+++ b/sim4dbutils/cleanPolishes.C
@@ -0,0 +1,503 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio++.H"
+#include "sim4.H"
+
+//#define MIN_EXON_LENGTH 50
+//#define MIN_PERCENT_IDENTITY 88
+
+#define MIN_EXON_LENGTH 20
+#define MIN_PERCENT_IDENTITY 90
+
+
+//  Decide whether an alignment string looks like low-complexity sequence.
+//
+//  Every adjacent pair of characters is counted, with A, C, G and T (either
+//  case) as separate classes and every other character lumped into class 0.
+//  The string is reported as low complexity when the three most frequent
+//  pairs account for more than 75% of all pairs.
+//
+//  Only short strings are ever reported: a string with more than
+//  MIN_EXON_LENGTH pairs returns false without being scored.  A null, empty
+//  or one-character string has no pairs and also returns false.
+//
+bool
+lowComplexityExon(char *s) {
+  int     cnt[5][5] = {{0}};
+  int     map[256]  = {0};
+  int     i, j, l = 0;
+  int     a=0, b=0, c=0;
+  double  qual = 0.0;
+
+  //  s[0] must be checked before the loop below reads s[1]; on an empty
+  //  string that read would be past the terminator.
+  if ((s == 0L) || (s[0] == 0))
+    return(false);
+
+  map['A'] = map['a'] = 1;
+  map['C'] = map['c'] = 2;
+  map['G'] = map['g'] = 3;
+  map['T'] = map['t'] = 4;
+
+  //  Index map[] through unsigned char so that bytes >= 0x80 cannot turn
+  //  into a negative index where plain char is signed.
+  for (i=0, j=1, l=0; s[j]; i++, j++, l++)
+    cnt[map[(unsigned char)s[i]]][map[(unsigned char)s[j]]]++;
+
+  //  l is the number of pairs.  Zero pairs would make the ratio below 0/0.
+  if ((l == 0) || (l > MIN_EXON_LENGTH))
+    return(false);
+
+  //  Find the three largest pair counts, a >= b >= c.
+  for (i=0; i<5; i++) {
+    for (j=0; j<5; j++) {
+      if (a < cnt[i][j]) {
+        c = b;
+        b = a;
+        a = cnt[i][j];
+      } else if (b < cnt[i][j]) {
+        c = b;
+        b = cnt[i][j];
+      } else if (c < cnt[i][j]) {
+        c = cnt[i][j];
+      }
+    }
+  }
+
+  qual = (double)(a+b+c) / (double)(l);
+
+  return(qual > 0.75);
+}
+
+
+//  Drop the leading exons of a polish: s4p_deleteExon(0) is called
+//  intronSplit times, and not at all when intronSplit is zero or negative.
+//  (Presumably each call shifts the remaining exons down, so this removes
+//  exons 0 .. intronSplit-1 -- confirm against sim4polish.)
+//
+void
+trimExonsBefore(int intronSplit, sim4polish *p) {
+  int  remaining = intronSplit;
+
+  while (remaining > 0) {
+    p->s4p_deleteExon(0);
+    remaining--;
+  }
+}
+
+//  Drop the trailing exons of a polish: exons are deleted from the last one
+//  (index _numExons-1, read once before the loop) down to and including
+//  index intronSplit.
+//
+void
+trimExonsAfter(int intronSplit, sim4polish *p) {
+  int  victim = p->_numExons - 1;
+
+  while (victim >= intronSplit) {
+    p->s4p_deleteExon(victim);
+    victim--;
+  }
+}
+
+
+
+//  cleanPolishes: read polishes from stdin, write polishes to stdout.
+//
+//  Single-exon polishes, and polishes whose largest intron is below the
+//  threshold, are passed through unchanged.  For the rest, the exons on each
+//  side of the largest intron are examined.  A side is "junk" when none of
+//  its exons is at least MIN_EXON_LENGTH long, at least MIN_PERCENT_IDENTITY
+//  percent identical, and not low complexity.
+//
+//    both sides junk        -- the match is dropped (pieces kept with -savejunk)
+//    one side junk          -- that side is trimmed off (kept with -savejunk)
+//    neither, intron is gap -- the match is split in two at the intron
+//    neither, otherwise     -- the match is written unchanged
+//
+//  Exits with status 1 on a usage error, when stdin is a terminal, or when
+//  stdout is a terminal and output is enabled.
+//
+int
+main(int argc, char ** argv) {
+
+  //  Classification counters, reported by -verbose.
+  int  totMatches  = 0;
+  int  oneExon     = 0;
+  int  smaIntron   = 0;
+  int  junkFirst   = 0;
+  int  junkLast    = 0;
+  int  junkBoth    = 0;
+  int  splitOnGap  = 0;
+  int  goodQual    = 0;
+  int  probGood    = 0;
+
+  bool    filter       = true;    //  false (-quiet) disables all writes to stdout
+  bool    saveJunk     = false;
+  uint32  intronLimit  = 100000;
+
+  //  Before / after files
+  //
+  bool               beforeafter = false;
+#if 0
+  sim4polishWriter  *splGood       = 0L;
+  sim4polishWriter  *splProbGood   = 0L;
+#endif
+  sim4polishWriter  *splJunkLeft   = 0L;
+  sim4polishWriter  *splJunkRight  = 0L;
+  sim4polishWriter  *splJunkBoth   = 0L;
+  sim4polishWriter  *splIntronGap  = 0L;
+
+  //  Segregation files
+  //
+  bool               segregate = false;
+#if 0
+  sim4polishWriter  *filtOne       = 0L;
+  sim4polishWriter  *filtAllSmall  = 0L;
+#endif
+  sim4polishWriter  *filtGood      = 0L;
+  sim4polishWriter  *filtProbGood  = 0L;
+  sim4polishWriter  *filtJunkLeft  = 0L;
+  sim4polishWriter  *filtJunkRight = 0L;
+  sim4polishWriter  *filtJunkBoth  = 0L;
+  sim4polishWriter  *filtIntronGap = 0L;
+
+  sim4polishStyle  style = sim4polishStyleDefault;
+
+  bool  hasBeenWarned = false;
+
+  bool  beVerbose = false;
+
+  //  Options are recognized by a short prefix only (the strncmp length), so
+  //  for example anything starting with "-t" selects -threshold.
+  //
+  int arg = 1;
+  int err = 0;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-threshold", 2) == 0) {
+      //  The value must exist; a trailing -threshold would otherwise hand
+      //  argv[argc], a null pointer, to atoi().
+      if (arg + 1 < argc)
+        intronLimit = atoi(argv[++arg]);
+      else
+        err++;
+
+    } else if (strncmp(argv[arg], "-quiet", 2) == 0) {
+      fprintf(stderr, "QUIET MODE ENABLED -- non-modified matches not output!\n");
+      filter = false;
+
+    } else if (strncmp(argv[arg], "-beforeafter", 2) == 0) {
+      fprintf(stderr, "DEBUG MODE ENABLED -- many 'spl.*' files created!\n");
+      beforeafter = true;
+
+    } else if (strncmp(argv[arg], "-segregate", 3) == 0) {
+      fprintf(stderr, "SEGREGATION MODE ENABLED -- many 'filt.*' files created!\n");
+      segregate = true;
+
+    } else if (strncmp(argv[arg], "-gff3", 5) == 0) {
+      style = sim4polishGFF3;
+
+    } else if (strncmp(argv[arg], "-savejunk", 3) == 0) {
+      saveJunk = true;
+
+    } else if (strncmp(argv[arg], "-verbose", 2) == 0) {
+      beVerbose = true;
+
+    } else {
+      err++;
+    }
+
+    arg++;
+  }
+  if ((err) ||
+      (isatty(fileno(stdin))) ||
+      (isatty(fileno(stdout)) && filter)) {
+    fprintf(stderr, "usage: %s [-threshold t] [-quiet] [-beforeafter] [-segregate] [-gff3] [-savejunk] [-verbose]\n", argv[0]);
+    fprintf(stderr, " -threshold Introns bigger than this are candidates for trimming (default = 100000).\n");
+    fprintf(stderr, " -quiet Don't print unmodified matches\n");
+    fprintf(stderr, " -beforeafter Save (in separate files) the before/after of each modified match\n");
+    fprintf(stderr, " -segregate Save (in separate files) the after of each modified match\n");
+    fprintf(stderr, " -gff3 Write output in GFF3 format\n");
+    fprintf(stderr, " -savejunk Also print the trimmed pieces (as separate matches)\n");
+    fprintf(stderr, " -verbose Report the intron threshold and the classification counts\n");
+
+    if (isatty(fileno(stdin)))
+      fprintf(stderr, "error: I cannot read polishes from the terminal!\n");
+
+    if (isatty(fileno(stdout)) && filter)
+      fprintf(stderr, "error: Please redirect the polishes (stdout) to a file.\n");
+
+    exit(1);
+  }
+
+  if (beVerbose)
+    fprintf(stderr, "A big intron is one that is at least "uint32FMT"bp long.\n", intronLimit);
+
+  if (beforeafter) {
+#if 0
+    splGood      = new sim4polishWriter("spl.good", style);
+    splProbGood  = new sim4polishWriter("spl.probGood", style);
+#endif
+    splJunkLeft  = new sim4polishWriter("spl.junkLeft", style);
+    splJunkRight = new sim4polishWriter("spl.junkRight", style);
+    splJunkBoth  = new sim4polishWriter("spl.junkBoth", style);
+    splIntronGap = new sim4polishWriter("spl.intronGap", style);
+  }
+
+  if (segregate) {
+#if 0
+    filtOne       = new sim4polishWriter("filt.filtOne", style);
+    filtAllSmall  = new sim4polishWriter("filt.allSmall", style);
+#endif
+    filtGood      = new sim4polishWriter("filt.good", style);
+    filtProbGood  = new sim4polishWriter("filt.probGood", style);
+    filtJunkLeft  = new sim4polishWriter("filt.junkLeft", style);
+    filtJunkRight = new sim4polishWriter("filt.junkRight", style);
+    filtJunkBoth  = new sim4polishWriter("filt.junkBoth", style);
+    filtIntronGap = new sim4polishWriter("filt.intronGap", style);
+  }
+
+  //  "-" selects stdout for the writer and stdin for the reader.
+  sim4polishWriter  *W = new sim4polishWriter("-", style);
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish        *p = 0L;
+
+  if (R->getsim4polishStyle() != style)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+  while (R->nextAlignment(p)) {
+    uint32 exA;
+    uint32 exB;
+
+    if (p->_numExons == 1) {
+      oneExon++;
+      if (filter)
+        W->writeAlignment(p);
+#if 0
+      if (segregate)
+        filtOneExon->writeAlignment(p);
+#endif
+    } else {
+
+      //  Find the big intron.  We assume there is only one big intron.
+      //
+      //  intronSplit is the index of the first exon AFTER the biggest
+      //  intron; intronOri is the orientation recorded on the exon before it.
+      //
+      //  NOTE(review): dist is computed in unsigned arithmetic, so exons
+      //  that overlap on the genome would wrap to a huge value -- confirm
+      //  the reader never produces those.
+      //
+      uint32 biggestIntron = 0;
+      uint32 intronSplit   = 0;
+      uint32 intronOri     = 0;
+
+      for (exA=0, exB=1; exB < p->_numExons; exA++, exB++) {
+        uint32 dist = p->_exons[exB]._genFrom - p->_exons[exA]._genTo + 1;
+        if (dist > biggestIntron) {
+          biggestIntron = dist;
+          intronSplit   = exB;
+          intronOri     = p->_exons[exA]._intronOrientation;
+        }
+      }
+
+      if (intronOri == 0) {
+        fprintf(stderr, "didn't find the largest intron? (got zero)?\n");
+        exit(1);
+      }
+
+      if (intronOri == SIM4_INTRON_NONE) {
+        fprintf(stderr, "biggest intron isn't an intron? (got none)?\n");
+        exit(1);
+      }
+
+      if (biggestIntron < intronLimit) {
+        smaIntron++;
+        if (filter)
+          W->writeAlignment(p);
+#if 0
+        if (segregate)
+          filtAllSmall->writeAlignment(p);
+#endif
+      } else {
+
+        //  Declare the split obvious if all exons on either side are
+        //  below MIN_EXON_LENGTH, difficult otherwise.
+        //
+        //  A side survives as soon as one of its exons is long enough,
+        //  identical enough, and not low complexity.
+        //
+        bool  killFirst = true;
+        bool  killLast  = true;
+
+        for (uint32 i=0; i<intronSplit; i++)
+          if ((p->_exons[i]._estTo - p->_exons[i]._estFrom + 1 >= MIN_EXON_LENGTH) &&
+              (p->_exons[i]._percentIdentity >= MIN_PERCENT_IDENTITY) &&
+              (lowComplexityExon(p->_exons[i]._estAlignment) == false))
+            killFirst = false;
+
+        for (uint32 i=intronSplit; i<p->_numExons; i++)
+          if ((p->_exons[i]._estTo - p->_exons[i]._estFrom + 1 >= MIN_EXON_LENGTH) &&
+              (p->_exons[i]._percentIdentity >= MIN_PERCENT_IDENTITY) &&
+              (lowComplexityExon(p->_exons[i]._estAlignment) == false))
+            killLast = false;
+
+
+        //  Sometimes, all exons look crappy.  If they have a large
+        //  intron too, just kill the match.
+        //
+        //  Here and in the next two cases, 'a' is the piece before the
+        //  big intron and 'b' is the piece after it.
+        //
+        if ((killFirst == true) && (killLast == true)) {
+          junkBoth++;
+
+          if ((hasBeenWarned == false) &&
+              ((p->_exons[0]._estAlignment == 0L) || (p->_exons[0]._genAlignment == 0L))) {
+            hasBeenWarned = true;
+            fprintf(stderr, "cleanPolishes: Need alignments to recompute scores correctly!\n");
+          }
+
+          sim4polish *a = new sim4polish(p);
+          sim4polish *b = new sim4polish(p);
+          trimExonsAfter(intronSplit, a);
+          trimExonsBefore(intronSplit, b);
+
+          if (filter && saveJunk) {
+            W->writeAlignment(a);
+            W->writeAlignment(b);
+          }
+
+          if (beforeafter) {
+            //fprintf(splJunkBoth, "====================\n");
+            splJunkBoth->writeAlignment(p);
+            splJunkBoth->writeAlignment(a);
+            splJunkBoth->writeAlignment(b);
+          }
+
+          if (segregate) {
+            filtJunkBoth->writeAlignment(a);
+            filtJunkBoth->writeAlignment(b);
+          }
+
+          delete a;
+          delete b;
+        }
+
+        //  If the first half (before the big intron) is crappy, delete
+        //  those exons.  The junk piece 'a' is written only with -savejunk.
+        //
+        if ((killFirst == true) && (killLast == false)) {
+          junkFirst++;
+
+          sim4polish *a = new sim4polish(p);
+          sim4polish *b = new sim4polish(p);
+          trimExonsAfter(intronSplit, a);
+          trimExonsBefore(intronSplit, b);
+
+          if (filter) {
+            if (saveJunk)
+              W->writeAlignment(a);
+            W->writeAlignment(b);
+          }
+
+          if (beforeafter) {
+            //fprintf(splJunkLeft, "====================\n");
+            splJunkLeft->writeAlignment(p);
+            splJunkLeft->writeAlignment(a);
+            splJunkLeft->writeAlignment(b);
+          }
+
+          if (segregate) {
+            filtJunkLeft->writeAlignment(a);
+            filtJunkLeft->writeAlignment(b);
+          }
+
+          delete a;
+          delete b;
+        }
+
+        //  Likewise when the second half (after the big intron) is crappy.
+        //  The junk piece 'b' is written only with -savejunk.
+        //
+        if ((killFirst == false) && (killLast == true)) {
+          junkLast++;
+
+          sim4polish *a = new sim4polish(p);
+          sim4polish *b = new sim4polish(p);
+          trimExonsAfter(intronSplit, a);
+          trimExonsBefore(intronSplit, b);
+
+          if (filter) {
+            W->writeAlignment(a);
+            if (saveJunk)
+              W->writeAlignment(b);
+          }
+
+          if (beforeafter) {
+            //fprintf(splJunkRight, "====================\n");
+            splJunkRight->writeAlignment(p);
+            splJunkRight->writeAlignment(a);
+            splJunkRight->writeAlignment(b);
+          }
+
+          if (segregate) {
+            filtJunkRight->writeAlignment(a);
+            filtJunkRight->writeAlignment(b);
+          }
+
+          delete a;
+          delete b;
+        }
+
+        if ((killFirst == false) && (killLast == false)) {
+          if (intronOri == SIM4_INTRON_GAP) {
+            splitOnGap++;
+
+            //  Break the polish into two pieces, one before and one
+            //  after the large intron.  This is done by copying the
+            //  entire polish, then deleting one half from each.
+            //
+            //  Note that here 'a' is the piece AFTER the intron and 'b' the
+            //  piece before it, the reverse of the junk cases above.
+            //
+            //  XXX If we want to update the strand prediction of the
+            //  split pieces, we should
+            //
+            //    a) make sure that all the intron signals agree
+            //    b) make sure that the percent identities of each exon are > 90%
+            //
+            //  For now, we don't.
+
+            sim4polish *a = new sim4polish(p);
+            sim4polish *b = new sim4polish(p);
+            trimExonsBefore(intronSplit, a);
+            trimExonsAfter(intronSplit, b);
+
+            if (filter) {
+              W->writeAlignment(a);
+              W->writeAlignment(b);
+            }
+
+            if (beforeafter) {
+              //fprintf(splIntronGap, "====================\n");
+              splIntronGap->writeAlignment(p);
+              splIntronGap->writeAlignment(a);
+              splIntronGap->writeAlignment(b);
+            }
+
+            if (segregate) {
+              filtIntronGap->writeAlignment(a);
+              filtIntronGap->writeAlignment(b);
+            }
+
+            delete a;
+            delete b;
+          } else {
+
+            //  If there is a valid strand prediction and
+            //    a) all exons >= 90%
+            //    b) all exons >= 95%
+            //    c) all exons >= 95%, except first and last, which can be >= 90%
+            //  save the match as is.
+            //
+            //  Only rule (c) is implemented below.
+            //
+
+            bool  qualIsC = ((p->_exons[0]._percentIdentity >= 90) &&
+                             (p->_exons[p->_numExons-1]._percentIdentity >= 90));
+
+            for (exA=1; exA < p->_numExons-1; exA++)
+              if (p->_exons[exA]._percentIdentity < 95)
+                qualIsC = false;
+
+            //  If the match looks good, but just has a large intron, keep it.
+            //  Either way the match is written unchanged; the two branches
+            //  differ only in the counter and the segregation file.
+            //
+            if (qualIsC &&
+                ((p->_strandOrientation == SIM4_STRAND_POSITIVE) ||
+                 (p->_strandOrientation == SIM4_STRAND_NEGATIVE))) {
+              goodQual++;
+              if (filter)
+                W->writeAlignment(p);
+              if (segregate)
+                filtGood->writeAlignment(p);
+            } else {
+              probGood++;
+              if (filter)
+                W->writeAlignment(p);
+              if (segregate)
+                filtProbGood->writeAlignment(p);
+            }
+          }
+        }
+
+      }  //  Has a big intron
+    }  //  More than one exon
+
+    totMatches++;
+  }
+
+  delete R;
+  delete W;
+
+  if (beforeafter) {
+#if 0
+    delete splGood;
+    delete splProbGood;
+#endif
+    delete splJunkLeft;
+    delete splJunkRight;
+    delete splJunkBoth;
+    delete splIntronGap;
+  }
+
+  if (segregate) {
+#if 0
+    delete filtOne;
+    delete filtAllSmall;
+#endif
+    delete filtGood;
+    delete filtProbGood;
+    delete filtJunkLeft;
+    delete filtJunkRight;
+    delete filtJunkBoth;
+    delete filtIntronGap;
+  }
+
+  if (beVerbose) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, "oneExon: %7d\n", oneExon);
+    fprintf(stderr, "allSmallIntrons: %7d\n", smaIntron);
+    fprintf(stderr, "good: %7d\n", goodQual);
+    fprintf(stderr, "probably good: %7d\n", probGood);
+    fprintf(stderr, "junkExonsLeft: %7d\n", junkFirst);
+    fprintf(stderr, "junkExonsRight: %7d\n", junkLast);
+    fprintf(stderr, "junkExonsBoth: %7d\n", junkBoth);
+    fprintf(stderr, "intronOnGap: %7d\n", splitOnGap);
+    fprintf(stderr, "total: %7d\n", totMatches);
+  }
+
+  return(0);
+}
diff --git a/sim4dbutils/comparePolishes.C b/sim4dbutils/comparePolishes.C
new file mode 100644
index 0000000..a3e11c9
--- /dev/null
+++ b/sim4dbutils/comparePolishes.C
@@ -0,0 +1,524 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "sim4.H"
+#include "s4p_overlap.H"
+
+// Matches two sets of polishes to each other using a simple overlap
+// heuristic.
+//
+// Arguments (are horrible, whatcha gonna do about it?):
+//
+// -i min percent id (default 95)
+// -c min percent coverage (default 50)
+// -a polishes input file 1
+// -b polishes input file 2
+// -p prefix for the names of the output polish files (default "comparePolishes")
+// -gff3 write output as GFF3
+//
+// Output is on standard out, and is tab-delimited. It reports
+// stuff about the 'same' matches:
+//
+// ESTiid ESTlen overlap score A%id A%cov AgenLen #exons #cdnagaps B%id B%cov BgenLen #exons #cdnagaps
+
+// Try to analyze cDNA gaps.
+//
+// For cDNA gaps larger than GAP_MINIMUM, count it as a gap only if
+// the genomic gap is within GAP_DIFFERENCE of the cDNA gap.
+//
+// XXX This needs some tweaking!
+//
+#define GAP_MINIMUM 10
+#define GAP_DIFFERENCE 4
+
+
+//  Open a polish writer on the file named "<prefix>.<suffix>".
+//
+//  The name is built in a FILENAME_MAX byte buffer.  A name that does not
+//  fit is reported on stderr and the program exits with status 1, instead
+//  of writing past the end of the buffer.  The caller owns the returned
+//  writer and must delete it.
+//
+sim4polishWriter *
+openOutput(const char *prefix, const char *suffix, sim4polishStyle style) {
+  char  name[FILENAME_MAX];
+  int   len = snprintf(name, sizeof(name), "%s.%s", prefix, suffix);
+
+  //  snprintf returns the length the full name needs; a value of
+  //  sizeof(name) or more means the name was truncated.
+  if ((len < 0) || ((size_t)len >= sizeof(name))) {
+    fprintf(stderr, "ERROR: output file name '%s.%s' is too long.\n", prefix, suffix);
+    exit(1);
+  }
+
+  return(new sim4polishWriter(name, style));
+}
+
+
+
+//  comparePolishes: match two sets of polishes to each other, EST by EST.
+//
+//  For every EST iid, the polishes from each input are filtered by quality
+//  and an overlap value is computed for every (A, B) pair.  Matches are then
+//  classified and written to "<prefix>.<class>" files:
+//
+//    a-novel / b-novel -- a match overlapping nothing in the other set
+//    a-same  / b-same  -- a one-to-one overlap; also gets one tab-delimited
+//                         line of statistics on stdout
+//    a-multi / b-multi -- one match in one set overlapping several in the other
+//    hairy             -- several matches in both sets overlapping each other
+//
+//  Progress and the final counts are written to stderr.
+//
+int
+main(int argc, char **argv) {
+  uint32           minI   = 95;
+  uint32           minC   = 50;
+  const char      *prefix = "comparePolishes";
+  sim4polishFile  *Afile  = 0L;
+  sim4polishFile  *Bfile  = 0L;
+
+  //  goodOverlap  -- match in A maps uniquely to B and likewise.
+  //
+  //  novelInA     -- a match in A has no counterpart in B.
+  //  novelInB     -- similar for B.
+  //
+  //  multipleInA  -- a match in B maps to multiple things in A.
+  //  multipleInB  -- similar for A.
+  //
+  //  multipleInA requires that the matches in A map only to the single
+  //  match in B.
+  //
+  //  hairyOverlap -- multiple matches in both.
+  //
+  uint32   goodOverlap  = 0;  //  the number of lines in the output
+  uint32   novelInA     = 0;
+  uint32   novelInB     = 0;
+  uint32   multipleInA  = 0;
+  uint32   multipleInB  = 0;
+  uint32   hairyOverlap = 0;
+
+  sim4polishStyle  Astyle = sim4polishStyleDefault;
+  sim4polishStyle  Bstyle = sim4polishStyleDefault;
+
+  sim4polishStyle  style  = sim4polishStyleDefault;
+
+
+  //  An option that needs a value is only accepted when the value is
+  //  present; otherwise it is ignored, like any unknown option, instead of
+  //  reading argv[argc].
+  //
+  int arg=1;
+  while(arg < argc) {
+    bool  hasValue = (arg + 1 < argc);
+
+    if        ((strcmp(argv[arg], "-i") == 0) && hasValue) {
+      minI = atoi(argv[++arg]);
+    } else if ((strcmp(argv[arg], "-c") == 0) && hasValue) {
+      minC = atoi(argv[++arg]);
+    } else if ((strcmp(argv[arg], "-a") == 0) && hasValue) {
+      //  Ugly hack to obtain the style of the input files, but can be fixed later
+
+      sim4polishReader *AR = new sim4polishReader(argv[++arg]);
+      Astyle = AR->getsim4polishStyle();
+      delete AR;
+
+      Afile = new sim4polishFile(argv[arg], Astyle);
+    } else if ((strcmp(argv[arg], "-b") == 0) && hasValue) {
+      //  Ugly hack to obtain the style of the input files, but can be fixed later
+
+      sim4polishReader *BR = new sim4polishReader(argv[++arg]);
+      Bstyle = BR->getsim4polishStyle();
+      delete BR;
+
+      Bfile = new sim4polishFile(argv[arg], Bstyle);
+    } else if ((strcmp(argv[arg], "-p") == 0) && hasValue) {
+      prefix = argv[++arg];
+    } else if (strcmp(argv[arg], "-gff3") == 0) {
+      style = sim4polishGFF3;
+    }
+    arg++;
+  }
+
+  if ((Afile == 0L) || (Bfile == 0L)) {
+    fprintf(stderr, "usage: %s [-i percent-identity] [-c percent-coverage] -a input-set-a -b input-set-b [-p output-prefix] [-gff3]\n", argv[0]);
+    fprintf(stderr, "only -a and -b are mandatory, but you should give all anyway\n");
+    exit(1);
+  }
+
+  //  Open the output files
+  //
+  sim4polishWriter *fasame  = openOutput(prefix, "a-same", style);
+  sim4polishWriter *fbsame  = openOutput(prefix, "b-same", style);
+  sim4polishWriter *fanovel = openOutput(prefix, "a-novel", style);
+  sim4polishWriter *fbnovel = openOutput(prefix, "b-novel", style);
+  sim4polishWriter *famulti = openOutput(prefix, "a-multi", style);
+  sim4polishWriter *fbmulti = openOutput(prefix, "b-multi", style);
+  sim4polishWriter *fhairy  = openOutput(prefix, "hairy", style);
+
+  //  Force index builds
+  //
+  Afile->setPosition(0);
+  Bfile->setPosition(0);
+
+
+  //  Find the largest IID
+  //
+  uint32  largestIID = Afile->maxIID();
+  if (largestIID < Bfile->maxIID())
+    largestIID = Bfile->maxIID();
+
+
+  //  Iterate over all the ESTs.
+  //
+  //  NOTE(review): if maxIID() returns the largest IID rather than a count,
+  //  this loop never visits the last EST -- confirm against sim4polishFile.
+
+  for (uint32 iid=0; iid<largestIID; iid++) {
+    sim4polishList  *A  = Afile->getEST(iid);
+    sim4polishList  *B  = Bfile->getEST(iid);
+    sim4polishList  *Ta = 0L;
+    sim4polishList  *Tb = 0L;
+
+    //  Filter by quality.
+    A->filterByQuality(minI, minC);
+    B->filterByQuality(minI, minC);
+
+    //  fill out the overlap matrix
+    //
+    //  The matrix is one contiguous allocation hung off overlap[0], with a
+    //  row stride of the ORIGINAL B->length().  The lists only shrink below,
+    //  so the same storage and stride are reused after each rebuild.
+    //
+    //  At least one row pointer is always allocated: overlap[0] is written
+    //  here and deleted at the bottom of the loop even when A is empty, and
+    //  a zero-length pointer array has no element 0 to hold it.
+
+    uint32    overlapRows = (A->length() > 0) ? A->length() : 1;
+    olap_t  **overlap     = new olap_t* [overlapRows];
+    overlap[0] = new olap_t [A->length() * B->length()];
+    for (uint32 i=1; i<A->length(); i++)
+      overlap[i] = overlap[i-1] + B->length();
+
+    for (uint32 a=0; a<A->length(); a++)
+      for (uint32 b=0; b<B->length(); b++)
+        overlap[a][b] = findOverlap((*A)[a], (*B)[b]);
+
+
+
+    //  Find and remove those matches that are unique to either set.
+    //  Removing is a big pain, because we either have to know
+    //  something about the removal process, or we need to rebuild the
+    //  overlap matrix after each removal.  Instead, we build a new set.
+
+    bool *removeA = new bool [A->length()];
+    bool *removeB = new bool [B->length()];
+
+    for (uint32 a=0; a<A->length(); a++)
+      removeA[a] = false;
+
+    for (uint32 b=0; b<B->length(); b++)
+      removeB[b] = false;
+
+
+    for (uint32 a=0; a<A->length(); a++) {
+      uint32 ovl = 0;
+
+      for (uint32 b=0; b<B->length(); b++)
+        if (overlap[a][b])
+          ovl++;
+
+      if (ovl == 0) {
+        removeA[a] = true;
+        novelInA++;
+
+        if (fanovel)
+          fanovel->writeAlignment((*A)[a]);
+      }
+    }
+
+    for (uint32 b=0; b<B->length(); b++) {
+      uint32 ovl = 0;
+
+      for (uint32 a=0; a<A->length(); a++)
+        if (overlap[a][b])
+          ovl++;
+
+      if (ovl == 0) {
+        removeB[b] = true;
+        novelInB++;
+
+        if (fbnovel)
+          fbnovel->writeAlignment((*B)[b]);
+      }
+    }
+
+    //
+    //  Now find all those that are perfect matches.  Yeah, yeah, we
+    //  could ignore those that we already marked for removal.
+    //
+
+    for (uint32 a=0; a<A->length(); a++) {
+      uint32 Boverlaps = 0;
+      uint32 theBovl   = 0;
+
+      //  Count the number of things we overlap in B.
+      for (uint32 b=0; b<B->length(); b++) {
+        if (overlap[a][b]) {
+          Boverlaps++;
+          theBovl = b;
+        }
+      }
+
+      //  If exactly one overlap, we just need to check if the guy in B
+      //  also has one overlap with anybody in A.
+
+      if (Boverlaps == 1) {
+
+        //  Count the number of overlaps the guy in B has with A.  If
+        //  1, it's a goodOverlap, else it's a multipleInA.
+
+        uint32 b = theBovl;
+
+        uint32 Aoverlaps = 0;
+        for (uint32 x=0; x<A->length(); x++)
+          if (overlap[x][b])
+            Aoverlaps++;
+
+        if (Aoverlaps == 1) {
+          removeA[a] = true;
+          removeB[b] = true;
+          goodOverlap++;
+
+          //  ESTiid ESTlen overlap score A%id A%cov AgenLen #exons #cdnagaps B%id B%cov BgenLen #exons #cdnagaps
+
+          uint32 AgenLen = 0, BgenLen = 0;
+          uint32 Agaps   = 0, Bgaps   = 0;
+
+          for (uint32 x=0; x < (*A)[a]->_numExons; x++)
+            AgenLen += (*A)[a]->_exons[x]._genTo - (*A)[a]->_exons[x]._genFrom + 1;
+
+          for (uint32 x=0; x < (*B)[b]->_numExons; x++)
+            BgenLen += (*B)[b]->_exons[x]._genTo - (*B)[b]->_exons[x]._genFrom + 1;
+
+#ifdef GAP_MINIMUM
+          //  A cDNA gap is counted when it is larger than GAP_MINIMUM and the
+          //  genomic gap differs from it by less than GAP_DIFFERENCE.
+          for (uint32 x=1; x < (*A)[a]->_numExons; x++) {
+            int egap = (*A)[a]->_exons[x]._estFrom - (*A)[a]->_exons[x-1]._estTo;
+            int ggap = (*A)[a]->_exons[x]._genFrom - (*A)[a]->_exons[x-1]._genTo;
+            int dgap = 0;
+
+            if (egap > ggap)
+              dgap = egap - ggap;
+            else
+              dgap = ggap - egap;
+
+            if ((egap > GAP_MINIMUM) &&
+                (dgap < GAP_DIFFERENCE))
+              Agaps++;
+          }
+
+          for (uint32 x=1; x < (*B)[b]->_numExons; x++) {
+            int egap = (*B)[b]->_exons[x]._estFrom - (*B)[b]->_exons[x-1]._estTo;
+            int ggap = (*B)[b]->_exons[x]._genFrom - (*B)[b]->_exons[x-1]._genTo;
+            int dgap = 0;
+
+            if (egap > ggap)
+              dgap = egap - ggap;
+            else
+              dgap = ggap - egap;
+
+            if ((egap > GAP_MINIMUM) &&
+                (dgap < GAP_DIFFERENCE))
+              Bgaps++;
+          }
+#else
+          for (uint32 x=1; x < (*A)[a]->_numExons; x++)
+            if ( (*A)[a]->_exons[x]._estFrom - (*A)[a]->_exons[x-1]._estTo != 1 )
+              Agaps++;
+
+          for (uint32 x=1; x < (*B)[b]->_numExons; x++)
+            if ( (*B)[b]->_exons[x]._estFrom - (*B)[b]->_exons[x-1]._estTo != 1 )
+              Bgaps++;
+#endif
+
+          //  Overlap as a fraction of the shorter of the two genomic lengths.
+          double score = 0;
+          if (AgenLen > BgenLen)
+            score = (double)overlap[a][b] / (double)BgenLen;
+          else
+            score = (double)overlap[a][b] / (double)AgenLen;
+
+          fprintf(stdout, uint32FMT"\t"uint32FMT"\t"OLAPTFMT"\t%f\t%8.3f\t%8.3f\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%8.3f\t%8.3f\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\n",
+                  iid,
+                  (*A)[a]->_estLen,
+                  overlap[a][b],
+                  score,
+                  (*A)[a]->s4p_percentIdentityExact(),
+                  (*A)[a]->s4p_percentCoverageExact(),
+                  AgenLen, (*A)[a]->_numExons, Agaps,
+                  (*B)[b]->s4p_percentIdentityExact(),
+                  (*B)[b]->s4p_percentCoverageExact(),
+                  BgenLen, (*B)[b]->_numExons, Bgaps);
+
+          if (fasame)
+            fasame->writeAlignment((*A)[a]);
+          if (fbsame)
+            fbsame->writeAlignment((*B)[b]);
+        }
+      }
+    }
+
+    //
+    //  Rebuild
+    //
+
+    Ta = new sim4polishList;
+    Tb = new sim4polishList;
+
+    for (uint32 a=0; a<A->length(); a++)
+      if (removeA[a] == false)
+        Ta->push(new sim4polish((*A)[a]));
+
+    for (uint32 b=0; b<B->length(); b++)
+      if (removeB[b] == false)
+        Tb->push(new sim4polish((*B)[b]));
+
+    delete A;
+    delete B;
+    A = Ta;
+    B = Tb;
+    Ta = Tb = 0L;
+
+    //  Rebuild overlaps
+    //
+    for (uint32 a=0; a<A->length(); a++)
+      for (uint32 b=0; b<B->length(); b++)
+        overlap[a][b] = findOverlap((*A)[a], (*B)[b]);
+
+
+    //
+    //  And now all we're left with is a bunch of intersecting crud.
+    //
+
+
+    //  Grab the first match in A.  Find all the overlaps with things
+    //  in B.  For each of those, find the overlaps in A.  Repeat
+    //  until nothing changes.  Generate a report.  Remove all those
+    //  matches.  Do it all again until there are no more matches.
+
+    while (A->length()) {
+      for (uint32 a=0; a<A->length(); a++)
+        removeA[a] = false;
+
+      for (uint32 b=0; b<B->length(); b++)
+        removeB[b] = false;
+
+      removeA[0] = true;
+
+      bool keepGoing = true;
+
+      while (keepGoing) {
+        keepGoing = false;
+
+        //  For all of A, if we have something marked for removal, see if we
+        //  overlap with anything in B.  If that b is not marked for removal,
+        //  mark it, and keep going.
+        //
+        for (uint32 a=0; a<A->length(); a++) {
+          if (removeA[a]) {
+            for (uint32 b=0; b<B->length(); b++) {
+              if ((overlap[a][b]) && (removeB[b] == false)) {
+                removeB[b] = true;
+                keepGoing = true;
+              }
+            }
+          }
+        }
+
+        //  Same thing, but for B.
+        //
+        for (uint32 b=0; b<B->length(); b++) {
+          if (removeB[b]) {
+            for (uint32 a=0; a<A->length(); a++) {
+              if ((overlap[a][b]) && (removeA[a] == false)) {
+                removeA[a] = true;
+                keepGoing = true;
+              }
+            }
+          }
+        }
+      }
+
+      //  Found a component.  Output it.
+
+      uint32 inA = 0;
+      uint32 inB = 0;
+
+      for (uint32 a=0; a<A->length(); a++)
+        if (removeA[a])
+          inA++;
+      for (uint32 b=0; b<B->length(); b++)
+        if (removeB[b])
+          inB++;
+
+      if        ((inA > 1) && (inB > 1)) {
+        hairyOverlap++;
+
+        //fprintf(fhairy, "EST="uint32FMT" "uint32FMT" "uint32FMT"\n", (*A)[0]->_estID, inA, inB);
+        for (uint32 a=0; a<A->length(); a++)
+          if (removeA[a])
+            fhairy->writeAlignment((*A)[a]);
+        for (uint32 b=0; b<B->length(); b++)
+          if (removeB[b])
+            fhairy->writeAlignment((*B)[b]);
+      } else if ((inA == 1) && (inB > 1)) {
+        multipleInB++;
+
+        //fprintf(fbmulti, "EST="uint32FMT" "uint32FMT" "uint32FMT"\n", (*A)[0]->_estID, inA, inB);
+        for (uint32 a=0; a<A->length(); a++)
+          if (removeA[a])
+            fbmulti->writeAlignment((*A)[a]);
+        for (uint32 b=0; b<B->length(); b++)
+          if (removeB[b])
+            fbmulti->writeAlignment((*B)[b]);
+      } else if ((inA > 1) && (inB == 1)) {
+        multipleInA++;
+
+        //fprintf(famulti, "EST="uint32FMT" "uint32FMT" "uint32FMT"\n", (*A)[0]->_estID, inA, inB);
+        for (uint32 a=0; a<A->length(); a++)
+          if (removeA[a])
+            famulti->writeAlignment((*A)[a]);
+        for (uint32 b=0; b<B->length(); b++)
+          if (removeB[b])
+            famulti->writeAlignment((*B)[b]);
+      } else {
+        //  One-to-one and unmatched components were removed above, so this
+        //  is not expected to happen.
+        fprintf(stderr, "ERROR! inA="uint32FMT" inB="uint32FMT"\n", inA, inB);
+      }
+
+      //
+      //  Rebuild
+      //
+
+      Ta = new sim4polishList;
+      Tb = new sim4polishList;
+
+      for (uint32 a=0; a<A->length(); a++)
+        if (removeA[a] == false)
+          Ta->push(new sim4polish((*A)[a]));
+
+      for (uint32 b=0; b<B->length(); b++)
+        if (removeB[b] == false)
+          Tb->push(new sim4polish((*B)[b]));
+
+      delete A;
+      delete B;
+      A = Ta;
+      B = Tb;
+      Ta = Tb = 0L;
+
+      //  Rebuild overlaps
+      //
+      for (uint32 a=0; a<A->length(); a++)
+        for (uint32 b=0; b<B->length(); b++)
+          overlap[a][b] = findOverlap((*A)[a], (*B)[b]);
+    }
+
+    if ((iid % 100) == 0) {
+      fprintf(stderr, "IID:"uint32FMTW(8)" good:"uint32FMTW(4)" Anovel:"uint32FMTW(4)" Amulti:"uint32FMTW(4)" Bnovel:"uint32FMTW(4)" Bmulti:"uint32FMTW(4)" hairy:"uint32FMTW(4)"\r",
+              iid,
+              goodOverlap, novelInA, multipleInA, novelInB, multipleInB, hairyOverlap);
+      fflush(stderr);
+    }
+
+    delete [] overlap[0];
+    delete [] overlap;
+
+    delete [] removeA;
+    delete [] removeB;
+
+    delete A;
+    delete B;
+  }
+
+  delete fasame;
+  delete fbsame;
+  delete fanovel;
+  delete fbnovel;
+  delete famulti;
+  delete fbmulti;
+  delete fhairy;
+
+  delete Afile;
+  delete Bfile;
+
+  fprintf(stderr, "\ngood:"uint32FMTW(4)" Anovel:"uint32FMTW(4)" Amulti:"uint32FMTW(4)" Bnovel:"uint32FMTW(4)" Bmulti:"uint32FMTW(4)" hairy:"uint32FMTW(4)"\n",
+          goodOverlap, novelInA, multipleInA, novelInB, multipleInB, hairyOverlap);
+
+  exit(0);
+}
diff --git a/sim4dbutils/convertPolishes.C b/sim4dbutils/convertPolishes.C
new file mode 100644
index 0000000..b395e7f
--- /dev/null
+++ b/sim4dbutils/convertPolishes.C
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+//  convertPolishes: copy polishes from stdin to stdout, switching format.
+//  S4DB input is written as GFF3 and GFF3 input is written as S4DB; any
+//  other input style is an error.  No command line arguments are accepted.
+//
+int
+main(int argc, char ** argv) {
+
+  if (argc != 1) {
+    fprintf(stderr, "Polishes format converter (S4DB to GFF3, or GFF3 to S4DB).\nUsage: %s < input_file > output_file\n", argv[0]);
+    exit(1);
+  }
+
+  //  "-" selects stdin for the reader and stdout for the writer.
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish        *p = 0L;
+
+  sim4polishStyle    in_style = R->getsim4polishStyle();
+  sim4polishStyle    out_style;
+
+  if (in_style == sim4polishS4DB) {
+    out_style = sim4polishGFF3;
+  } else if (in_style == sim4polishGFF3) {
+    out_style = sim4polishS4DB;
+  } else {
+    fprintf(stderr, "ERROR: Unrecognized or unsupported polishes format. Aborting.\n");
+    exit(1);
+  }
+
+  sim4polishWriter  *GOOD = new sim4polishWriter("-", out_style);
+
+  while (R->nextAlignment(p))
+    GOOD->writeAlignment(p);
+
+  delete R;
+
+  delete GOOD;
+
+  return(0);
+}
diff --git a/sim4dbutils/convertToAtac.C b/sim4dbutils/convertToAtac.C
new file mode 100644
index 0000000..92afb59
--- /dev/null
+++ b/sim4dbutils/convertToAtac.C
@@ -0,0 +1,334 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// Writes polishes from stdin as atac-format matches. Splits polishes on any indel to generate gapless
+// atac matches (type 'u').
+//
+// Does no cleanup.
+
+
+//  Squeeze out, in place, every column of the alignment pair in which both
+//  strings hold a gap ('-').  Scanning stops at the end of the shorter
+//  string, and both strings are terminated at the same compacted length.
+//
+void
+indelRedo(char *a, char *b) {
+  char  *srcA = a,  *srcB = b;   //  next column to examine
+  char  *dstA = a,  *dstB = b;   //  where the next kept column goes
+
+  for (; *srcA && *srcB; srcA++, srcB++) {
+    if ((*srcA == '-') && (*srcB == '-'))
+      continue;
+
+    *dstA++ = *srcA;
+    *dstB++ = *srcB;
+  }
+
+  *dstA = 0;
+  *dstB = 0;
+}
+
+
+//  Upper bound for the scans in indelFixAlignment(): one less than the
+//  length of the shorter string, or zero when either string is empty.
+//
+static
+uint32
+indelScanLimit(const char *a, const char *b) {
+  size_t  alen = strlen(a);
+  size_t  blen = strlen(b);
+  size_t  slen = (alen < blen) ? alen : blen;
+
+  return((slen > 0) ? (uint32)(slen - 1) : 0);
+}
+
+
+//  Rewrite, in place, pairs of opposing gaps that sit two or three columns
+//  apart in the alignment (a, b) as runs of mismatches.  The bases between
+//  the two gaps are shifted one column to the left and upper-cased through
+//  toUpper[], leaving a column with a gap in both strings, which indelRedo()
+//  then removes.
+//
+//  Returns the number of gap pairs rewritten.  Returns zero without touching
+//  anything when either alignment is missing.  Empty strings are handled too:
+//  the scan bound no longer wraps around, and it never runs past the end of
+//  the shorter string.
+//
+uint32
+indelFixAlignment(char *a, char *b) {
+  bool    redo  = false;
+  uint32  len   = 0;
+  uint32  fixed = 0;
+
+  if ((a == 0L) || (b == 0L))
+    return(0);
+
+  len = indelScanLimit(a, b);
+
+  //fprintf(stdout, "fixIndel\n");
+  //fprintf(stdout, "%s\n%s\n", a, b);
+
+  for (uint32 i=2; i<len; i++) {
+
+    //  -Ac
+    //  cA-  two gaps -> two mismatches
+    if ((a[i-2] == '-') && (b[i] == '-')) {
+      a[i-2] = toUpper[a[i-1]];  a[i-1] = toUpper[a[i]];    a[i] = '-';
+      b[i-2] = toUpper[b[i-2]];  b[i-1] = toUpper[b[i-1]];  b[i] = '-';
+      fixed++;
+      redo = true;
+    }
+
+    //  The same, with the roles of a and b swapped.
+    if ((a[i] == '-') && (b[i-2] == '-')) {
+      a[i-2] = toUpper[a[i-2]];  a[i-1] = toUpper[a[i-1]];  a[i] = '-';
+      b[i-2] = toUpper[b[i-1]];  b[i-1] = toUpper[b[i]];    b[i] = '-';
+      fixed++;
+      redo = true;
+    }
+  }
+
+  if (redo) {
+    //fprintf(stdout, "%s\n%s\n", a, b);
+    //fprintf(stdout, "Fixed "uint32FMT" 1 base wide indel\n", fixed);
+    indelRedo(a, b);
+  }
+
+  //  indelRedo() may have shortened the strings; recompute the bound.
+  redo = false;
+  len  = indelScanLimit(a, b);
+
+  for (uint32 i=3; i<len; i++) {
+
+    //  cAt-
+    //  -Agg  two gaps, one mismatch -> three mismatches
+    //        we also would do two gaps -> three mismatches
+    if ((a[i] == '-') && (b[i-3] == '-')) {
+      a[i-3] = toUpper[a[i-3]];  a[i-2] = toUpper[a[i-2]];  a[i-1] = toUpper[a[i-1]];  a[i] = '-';
+      b[i-3] = toUpper[b[i-2]];  b[i-2] = toUpper[b[i-1]];  b[i-1] = toUpper[b[i]];    b[i] = '-';
+      fixed++;
+      redo = true;
+    }
+
+    //  The same, with the roles of a and b swapped.
+    if ((a[i-3] == '-') && (b[i] == '-')) {
+      a[i-3] = toUpper[a[i-2]];  a[i-2] = toUpper[a[i-1]];  a[i-1] = toUpper[a[i]];    a[i] = '-';
+      b[i-3] = toUpper[b[i-3]];  b[i-2] = toUpper[b[i-2]];  b[i-1] = toUpper[b[i-1]];  b[i] = '-';
+      fixed++;
+      redo = true;
+    }
+  }
+
+  if (redo) {
+    //fprintf(stdout, "%s\n%s\n", a, b);
+    //fprintf(stdout, "Fixed "uint32FMT" 2 base wide indel\n", fixed);
+    indelRedo(a, b);
+  }
+
+  return(fixed);
+}
+
+
+
+int
+main(int argc, char **argv) {
+ char *nickname1 = 0L, *asmfile1 = 0L;
+ char *nickname2 = 0L, *asmfile2 = 0L;
+ bool flip = false;
+
+ int arg = 1;
+ while (arg < argc) {
+ if (strncmp(argv[arg], "-1", 2) == 0) {
+ nickname1 = argv[++arg];
+ asmfile1 = argv[++arg];
+ } else if (strncmp(argv[arg], "-2", 2) == 0) {
+ nickname2 = argv[++arg];
+ asmfile2 = argv[++arg];
+ } else if (strncmp(argv[arg], "-f", 2) == 0) {
+ flip = true;
+ } else {
+ fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+ }
+ arg++;
+ }
+
+ if ((nickname1 == 0L) || (nickname2 == 0L)) {
+ fprintf(stderr, "usage: %s [-f] -1 nickname1 asmfile1 -2 nickname2 asmfile2 < matches.sim4db > matches.atac\n", argv[0]);
+ exit(1);
+ }
+
+ if (flip == false) {
+ fprintf(stdout, "!format atac 1.0\n");
+ fprintf(stdout, "/assemblyFile1=%s\n", asmfile1);
+ fprintf(stdout, "/assemblyFile2=%s\n", asmfile2);
+ fprintf(stdout, "/assemblyId1=%s\n", nickname1);
+ fprintf(stdout, "/assemblyId2=%s\n", nickname2);
+ } else {
+ fprintf(stdout, "!format atac 1.0\n");
+ fprintf(stdout, "/assemblyFile1=%s\n", asmfile2);
+ fprintf(stdout, "/assemblyFile2=%s\n", asmfile1);
+ fprintf(stdout, "/assemblyId1=%s\n", nickname2);
+ fprintf(stdout, "/assemblyId2=%s\n", nickname1);
+ }
+
+ uint32 dupRecordIID = 0;
+ uint32 dupParentIID = 0;
+
+ uint32 totalFixed = 0;
+
+ sim4polishReader *R = new sim4polishReader("-");
+ sim4polish *p = 0L;
+ while (R->nextAlignment(p)) {
+
+ // Parse the defline to find the genomic region our 'est'
+ // (unfortunate sim4db term) is from. Search for our
+ // information in the defline
+ //
+ // extracted from iid (\d+) pos (\d+) (\d+)
+
+ splitToWords W(p->_estDefLine);
+
+ uint32 i=0;
+ while ((i < W.numWords()) && (strcmp(W[i], "iid") != 0))
+ i++;
+ if ((i == 0) || (i == W.numWords()))
+ fprintf(stderr, "Failed to match est defline '%s'\n", p->_estDefLine), exit(1);
+
+ uint32 qSeqIID = strtouint32(W[i+1], 0L);
+ uint32 qSeqBeg = strtouint32(W[i+3], 0L);
+ uint32 qSeqEnd = strtouint32(W[i+4], 0L); // Not used
+
+
+ W.split(p->_genDefLine);
+
+ i=0;
+ while ((i<W.numWords()) && (strcmp(W[i], "iid") != 0))
+ i++;
+ if ((i == 0) || (i == W.numWords()))
+ fprintf(stderr, "Failed to match gen defline '%s'\n", p->_genDefLine), exit(1);
+
+ uint32 gSeqIID = strtouint32(W[i+1], 0L);
+ uint32 gSeqBeg = strtouint32(W[i+3], 0L);
+ //uint32 gSeqEnd = strtouint32(W[i+4], 0L); // Not used
+
+ bool fwd = (p->_matchOrientation == SIM4_MATCH_FORWARD);
+
+
+ // Fix the coords
+ //
+ if (fwd) {
+ // Forward is easy! Just add.
+
+ for (uint32 exon=0; exon<p->_numExons; exon++) {
+ sim4polishExon *e = p->_exons + exon;
+
+ e->_estFrom += qSeqBeg;
+ e->_estTo += qSeqBeg;
+ e->_genFrom += gSeqBeg;
+ e->_genTo += gSeqBeg;
+ }
+ } else {
+ // Reverse is not easy. Need to reverse complement the query positions.
+
+ for (uint32 exon=0; exon<p->_numExons; exon++) {
+ sim4polishExon *e = p->_exons + exon;
+
+ // First, reverse the query relative to our extracted piece
+ //
+ uint32 f = (qSeqEnd - qSeqBeg) - e->_estTo + 2; // Extra +1 to offset -1 when we set qBeg
+ uint32 t = (qSeqEnd - qSeqBeg) - e->_estFrom + 2;
+
+ // Now we can just offset stuff.
+ e->_estFrom = qSeqBeg + t; // Really the end!
+ e->_estTo = qSeqBeg + f; // Really the begin!
+ e->_genFrom += gSeqBeg;
+ e->_genTo += gSeqBeg;
+ }
+ }
+
+
+
+ for (uint32 exon=0; exon<p->_numExons; exon++) {
+ sim4polishExon *e = p->_exons + exon;
+
+ // Parse the alignment to find ungapped blocks
+
+ uint32 aPos = 0;
+
+ uint32 qBeg = e->_estFrom - 1;
+ uint32 gBeg = e->_genFrom - 1;
+
+ uint32 mLen = 0;
+
+ totalFixed += indelFixAlignment(e->_estAlignment, e->_genAlignment);
+
+ // Skip mismatches/gaps at the start of this sequence
+ //
+ while ((e->_estAlignment[aPos] == '-') ||
+ (e->_genAlignment[aPos] == '-') ||
+ (e->_estAlignment[aPos] != e->_genAlignment[aPos])) {
+ if (e->_estAlignment[aPos] != '-')
+ if (fwd) qBeg++;
+ else qBeg--;
+ if (e->_genAlignment[aPos] != '-')
+ gBeg++;
+ //fprintf(stderr, "SKIP BEGIN %c %c\n", e->_estAlignment[aPos], e->_genAlignment[aPos]);
+ aPos++;
+ }
+
+
+ bool notDone = true; // There should be a way to get rid of this stupid variable....
+ while (notDone) {
+ notDone = ((e->_estAlignment[aPos] != 0) &&
+ (e->_genAlignment[aPos] != 0));
+
+ // If we find the end of a gapless block, emit a match
+
+ if ((e->_estAlignment[aPos] == '-') || (e->_estAlignment[aPos] == 0) ||
+ (e->_genAlignment[aPos] == '-') || (e->_genAlignment[aPos] == 0)) {
+
+ // Trim off any mismatches at the end of this block.
+ //
+ uint32 mismatch = 0;
+ while ((aPos > mismatch) &&
+ (e->_estAlignment[aPos - mismatch - 1] != e->_genAlignment[aPos - mismatch - 1])) {
+ //fprintf(stderr, "SKIP MIDDLE %c %c\n", e->_estAlignment[aPos-mismatch], e->_genAlignment[aPos-mismatch]);
+ mismatch++;
+ }
+
+ // If there is an indel at the start (which probably
+ // shouldn't happen anyway!), or possibly at the end,
+ // then our length is zero, and we should not emit
+ // anything.
+ //
+ if (mLen > mismatch) {
+ mLen -= mismatch;
+
+ if (flip == false) {
+ fprintf(stdout, "M u dupr"uint32FMT" dupp"uint32FMT" %s:"uint32FMT" "uint32FMT" "uint32FMT" 1 %s:"uint32FMT" "uint32FMT" "uint32FMT" %s\n",
+ dupRecordIID,
+ dupParentIID,
+ nickname1, qSeqIID, (fwd) ? qBeg : qBeg - mLen, mLen,
+ nickname2, gSeqIID, gBeg, mLen,
+ (fwd) ? "1" : "-1");
+ } else {
+ fprintf(stdout, "M u dupr"uint32FMT" dupp"uint32FMT" %s:"uint32FMT" "uint32FMT" "uint32FMT" 1 %s:"uint32FMT" "uint32FMT" "uint32FMT" %s\n",
+ dupRecordIID,
+ dupParentIID,
+ nickname2, gSeqIID, gBeg, mLen,
+ nickname1, qSeqIID, (fwd) ? qBeg : qBeg - mLen, mLen,
+ (fwd) ? "1" : "-1");
+ }
+ dupRecordIID++;
+
+ mLen += mismatch;
+
+ // Adjust our begin and end positions to the end of this record
+ if (fwd) qBeg += mLen;
+ else qBeg -= mLen;
+ gBeg += mLen;
+
+ mLen = 0;
+ }
+
+ // Skip whatever caused us to emit a gapless block, also skip any mismatches here
+ //
+ while ((e->_estAlignment[aPos] == '-') ||
+ (e->_genAlignment[aPos] == '-') ||
+ (e->_estAlignment[aPos] != e->_genAlignment[aPos])) {
+ if (e->_estAlignment[aPos] != '-')
+ if (fwd) qBeg++;
+ else qBeg--;
+ if (e->_genAlignment[aPos] != '-')
+ gBeg++;
+ //fprintf(stderr, "SKIP END %c %c\n", e->_estAlignment[aPos], e->_genAlignment[aPos]);
+ aPos++;
+ }
+ } else {
+ // Not the end of a gapless block, extend this match by one
+ mLen++;
+ aPos++;
+ }
+
+ } // over all positions in the alignemnt
+ } // over all exons
+
+ dupParentIID++;
+ }
+
+ fprintf(stderr, "Fixed "uint32FMT" indel/mismatches.\n", totalFixed);
+
+ return(0);
+}
+
diff --git a/sim4dbutils/convertToExtent.C b/sim4dbutils/convertToExtent.C
new file mode 100644
index 0000000..79a4ff2
--- /dev/null
+++ b/sim4dbutils/convertToExtent.C
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// Writes polishes from stdin as a one-line-per-match format, space-based!
+
+bool extendedFormat = false;
+
+void
+output(sim4polish *p,
+       char       *Ep,
+       char       *Gp,
+       uint32      a,
+       uint32      b,
+       bool        isExon) {
+
+  //  Writes one tab-separated line to stdout describing exons a through b
+  //  of polish p.  Ep and Gp are the names printed for the query and the
+  //  genomic sequence.  The global extendedFormat selects the column set.
+
+  //  Query span: begin comes from exon a (shifted down by one), end from exon b.
+  uint32 qBeg = p->_exons[a]._estFrom - 1;
+  uint32 qEnd = p->_exons[b]._estTo;
+
+  //  For complement matches both coordinates are subtracted from the query length.
+  if (p->_matchOrientation == SIM4_MATCH_COMPLEMENT) {
+    qBeg = p->_estLen - qBeg;
+    qEnd = p->_estLen - qEnd;
+  }
+
+  double identity;
+  double coverage;
+
+  if (isExon) {
+    //  A single exon: use that exon's own identity, coverage is reported as zero.
+    identity = p->_exons[a]._percentIdentity;
+    coverage = 0.0;
+  } else if (p->_exons[a]._estAlignment) {
+    //  Whole polish with alignment strings present: ask for the exact values.
+    identity = p->s4p_percentIdentityExact();
+    coverage = p->s4p_percentCoverageExact();
+  } else {
+    //  Whole polish without alignments: fall back to the stored values.
+    identity = p->_percentIdentity;
+    coverage = p->_querySeqIdentity;
+  }
+
+  if (!extendedFormat) {
+    fprintf(stdout, "%s\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%s\t"uint32FMT"\t"uint32FMT"\t%6.3f\t%6.3f\n",
+            Ep, p->_estLen, a, qBeg, qEnd,
+            Gp, p->_exons[a]._genFrom - 1, p->_exons[b]._genTo,
+            identity, coverage);
+  } else {
+    //  Extended format also carries the index of each sequence.
+    fprintf(stdout, "%s\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%s\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%6.3f\t%6.3f\n",
+            Ep, p->_estID,
+            p->_estLen, a, qBeg, qEnd,
+            Gp, p->_genID,
+            p->_exons[a]._genFrom - 1, p->_exons[b]._genTo,
+            identity, coverage);
+  }
+}
+
+
+//  Splits 'defline' with W and copies its first word into dst, dropping
+//  a leading '>'.  At most dstLen-1 characters are copied and dst is
+//  always terminated, so an over-long sequence name is truncated instead
+//  of overrunning dst.  An empty defline yields an empty string.
+//  Returns dst.
+//
+static
+char *
+firstWordOfDefLine(splitToWords &W, char *defline, char *dst, size_t dstLen) {
+  W.split(defline);
+
+  const char *word = (W.numWords() > 0) ? W[0] : "";
+
+  if (word[0] == '>')
+    word++;
+
+  snprintf(dst, dstLen, "%s", word);
+
+  return(dst);
+}
+
+
+//  Reads polishes from stdin and writes one line per polish (or, with
+//  -exons, one line per exon) to stdout, preceded by a header line
+//  naming the columns.
+//
+int
+main(int argc, char **argv) {
+  bool  beVerbose     = false;   //  Set by -v; nothing below reads it.
+  bool  wholeEDefLine = false;
+  bool  wholeGDefLine = false;
+  bool  doExons       = false;
+
+  int arg = 1;
+  int err = 0;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-v") == 0) {
+      beVerbose = true;
+    } else if (strcmp(argv[arg], "-fullquery") == 0) {
+      wholeEDefLine = true;
+    } else if (strcmp(argv[arg], "-fullgenomic") == 0) {
+      wholeGDefLine = true;
+    } else if (strcmp(argv[arg], "-exons") == 0) {
+      doExons = true;
+    } else if (strcmp(argv[arg], "-extended") == 0) {
+      extendedFormat = true;
+    } else {
+      fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+      err++;
+    }
+    arg++;
+  }
+  if (isatty(fileno(stdin)) || (err)) {
+    fprintf(stderr, "usage: %s [options] < IN > OUT\n", argv[0]);
+    fprintf(stderr, " -v be chatty\n");
+    fprintf(stderr, " -fullquery output the whole query def line\n");
+    fprintf(stderr, " -fullgenomic output the whole genomic def line\n");
+    fprintf(stderr, " -exons include exons\n");
+    fprintf(stderr, " -extended include the IDX of each sequence\n");
+    exit(1);
+  }
+
+  if (extendedFormat)
+    fprintf(stdout, "cDNAid\tcDNAidx\tcDNAlen\texonNum\tbegin\tend\tgenomicid\tgenomicidx\tbegin\tend\tidentity\tcoverage\n");
+  else
+    fprintf(stdout, "cDNAid\tcDNAlen\texonNum\tbegin\tend\tgenomicid\tbegin\tend\tidentity\tcoverage\n");
+
+  //  Scratch space for the shortened sequence names.
+  char          E[1024];
+  char          G[1024];
+  splitToWords  W;
+
+  sim4polishReader *R = new sim4polishReader("-");
+  sim4polish       *p = 0L;
+
+  while (R->nextAlignment(p)) {
+
+    //  Either print the whole defline, or just its first word.
+    char *Ep = (wholeEDefLine) ? p->_estDefLine : firstWordOfDefLine(W, p->_estDefLine, E, sizeof(E));
+    char *Gp = (wholeGDefLine) ? p->_genDefLine : firstWordOfDefLine(W, p->_genDefLine, G, sizeof(G));
+
+    if (doExons == false) {
+      //  One line spanning the first through the last exon.
+      output(p, Ep, Gp, 0, p->_numExons-1, false);
+    } else {
+      for (uint32 i=0; i<p->_numExons; i++)
+        output(p, Ep, Gp, i, i, true);
+    }
+  }
+
+  delete R;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/coveragehack.C b/sim4dbutils/coveragehack.C
new file mode 100644
index 0000000..4910546
--- /dev/null
+++ b/sim4dbutils/coveragehack.C
@@ -0,0 +1,224 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "util++.H"
+#include "bio++.H"
+#include "sim4.H"
+
+// g++ -o coveragehack coveragehack.C -I../libutil -I../libbio -I../libsim4 -L../libutil -L../libbio -L../libsim4 -lsim4 -lbio -lutil
+
+// Flag that tells which side of the alignment our contaminated assembly is on.
+// 1 (R) -- if atac the contaminant is on the left, the assembly is on the right
+//
+uint32 orientation = 1;
+
+//
+// WARNING! This is stale code. It does not compile. The fasta interface has changed.
+//
+
+//  Reads the ATAC mapping in 'path' and, for every line whose first
+//  character is 'M' and third character is 'u', adds the interval
+//  (begin, length) to coverage[index].  The index, begin and length are
+//  taken from words 8, 9, 10 of the line, or from words 4, 5, 6 when the
+//  global 'orientation' is 2.  The index word has the form "tag:index".
+//
+//  Lines that are too short, or whose index word has no ':', are skipped.
+//  Exits if the file cannot be opened.
+//
+//  NOTE(review): index is not range-checked here; the caller must supply
+//  a coverage[] array large enough for every index in the file.
+//
+void
+readATAC(intervalList **coverage, char *path) {
+  char          line[1024] = {0};
+  splitToWords  S(line);
+
+  //  Test the returned stream, not errno: a successful call is allowed
+  //  to leave errno non-zero.
+  FILE *F = fopen(path, "r");
+  if (F == 0L)
+    fprintf(stderr, "Failed to open '%s': %s\n", path, strerror(errno)), exit(1);
+
+  //  The word holding "tag:index"; the next two words are begin and length.
+  const uint32 col = (orientation == 2) ? 4 : 8;
+
+  //  Loop on fgets() itself.  Testing feof() before reading processes the
+  //  final line a second time when the last read hits end-of-file.
+  while (fgets(line, 1024, F) != 0L) {
+    if ((line[0] != 'M') || (line[1] == 0) || (line[2] != 'u'))
+      continue;
+
+    S.split(line);
+
+    if (S.numWords() < col + 3)
+      continue;
+
+    //  Each index word is searched from its own start; the two assembly
+    //  tags need not have the same length.
+    char *colon = strchr(S[col], ':');
+    if (colon == 0L)
+      continue;
+
+    uint32 idx = atoi(colon + 1);
+    uint32 beg = atoi(S[col + 1]);
+    uint32 len = atoi(S[col + 2]);
+
+    if (coverage[idx] == 0L)
+      coverage[idx] = new intervalList();
+
+    coverage[idx]->add(beg, len);
+  }
+
+  fclose(F);
+}
+
+
+//  Reads sim4db polishes from 'path' and adds the extent of the first
+//  exon of each to coverage[].
+//
+//    which == 1 -- index by the genomic sequence, using the genomic
+//                  extent; only polishes with at least 94% identity and
+//                  80% query coverage are used.
+//    which == 2 -- index by the query sequence, using the query extent;
+//                  for non-forward matches the begin is measured from
+//                  the other end of the query.
+//
+//  Exits if the file cannot be opened.
+//
+//  NOTE(review): the index is not range-checked here; the caller must
+//  supply a coverage[] array large enough for every index in the file.
+//
+void
+readSIM4(intervalList **coverage, int which, char *path) {
+
+  //  Test the returned stream, not errno: a successful call is allowed
+  //  to leave errno non-zero.
+  FILE *F = fopen(path, "r");
+  if (F == 0L)
+    fprintf(stderr, "Failed to open '%s': %s\n", path, strerror(errno)), exit(1);
+
+  while (!feof(F)) {
+    sim4polish *p = new sim4polish(F);
+
+    //  Only exon zero is used below; skip a record that has no exons
+    //  (presumably what a read at end-of-file produces -- TODO confirm).
+    if (p->_numExons > 0) {
+      switch (which) {
+        case 1:
+          //  The query are contaminant reads, the genomic is the assembly
+          if ((p->_percentIdentity >= 94) && (p->_querySeqIdentity >= 80)) {
+            uint32 idx = p->_genID;
+
+            if (coverage[idx] == 0L)
+              coverage[idx] = new intervalList();
+
+            coverage[idx]->add(p->_exons[0]._genFrom,
+                               p->_exons[0]._genTo - p->_exons[0]._genFrom + 1);
+          }
+          break;
+
+        case 2:
+          //  The query are assembly scaffolds, the genomic is the
+          //  contaminant assembly (one or a few contigs)
+          {
+            uint32 idx = p->_estID;
+
+            if (coverage[idx] == 0L)
+              coverage[idx] = new intervalList();
+
+            if (p->_matchOrientation == SIM4_MATCH_FORWARD) {
+              coverage[idx]->add(p->_exons[0]._estFrom,
+                                 p->_exons[0]._estTo - p->_exons[0]._estFrom + 1);
+            } else {
+              coverage[idx]->add(p->_estLen - p->_exons[0]._estTo + 1,
+                                 p->_exons[0]._estTo - p->_exons[0]._estFrom + 1);
+            }
+          }
+          break;
+
+        default:
+          //  Any other mode records nothing.
+          break;
+      }
+    }
+
+    delete p;
+  }
+
+  fclose(F);
+}
+
+
+#define MAXSCAFFOLD 200000
+
+//  Loads coverage intervals from ATAC and/or sim4db mappings, then, for
+//  every sequence that has coverage, compares the covered length against
+//  the non-N length of the sequence and reports those covered by more
+//  than minCov percent.
+//
+//  NOTE(review): the file header says this code is stale and does not
+//  compile against the current fasta interface; the FastAWrapper /
+//  seqCache usage below is left as it was.
+//
+int
+main(int argc, char **argv) {
+  intervalList **coverage = new intervalList* [MAXSCAFFOLD];
+  FastAWrapper  *W        = 0L;
+  uint32         minCov   = 80;
+
+  //  Set by -g, but nothing below reads it.
+  bool           includeGapsAsContamination = true;
+
+  for (uint32 i=0; i<MAXSCAFFOLD; i++)
+    coverage[i] = 0L;
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-a") == 0) {
+      readATAC(coverage, argv[++arg]);
+    } else if (strcmp(argv[arg], "-s") == 0) {
+      //  SNAPPER of query fragments onto the contaminant.
+      readSIM4(coverage, 1, argv[++arg]);
+    } else if (strcmp(argv[arg], "-S") == 0) {
+      //  SNAPPER of query contaminant onto scaffolds
+      readSIM4(coverage, 2, argv[++arg]);
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      W = new seqCache(argv[++arg]);
+    } else if (strcmp(argv[arg], "-c") == 0) {
+      minCov = strtouint32(argv[++arg], 0L);
+    } else if (strcmp(argv[arg], "-r") == 0) {
+      orientation = 1;
+    } else if (strcmp(argv[arg], "-l") == 0) {
+      orientation = 2;
+    } else if (strcmp(argv[arg], "-g") == 0) {
+      includeGapsAsContamination = false;
+    } else {
+      fprintf(stderr, "%s: unknown arg %s\n", argv[0], argv[arg]);
+    }
+    arg++;
+  }
+
+  if (W == 0L) {
+    fprintf(stderr, "usage: %s [-a atacmapping] [-s sim4db] [-g] -f seq.fasta\n", argv[0]);
+    fprintf(stderr, " -g don't count gaps in scaffolds as contamination.\n");
+    exit(1);
+  }
+
+  uint32  sumOfLengths = 0;
+  uint32  sequences    = 0;
+
+  for (uint32 i=0; i<MAXSCAFFOLD; i++) {
+    if (coverage[i] == 0L)
+      continue;
+
+    W->find(i);
+    FastASequenceInCore *S = W->getSequence();
+
+    intervalList gaps;
+
+    //  Compute how much of the scaffold is gap (runs of N or n).
+    //  gapBeg == seqLen means "not currently inside a gap".
+
+    uint32  seqLen = W->sequenceLength(i);
+    uint32  gapBeg = seqLen;
+    char   *seq    = S->sequence();
+
+    for (uint32 pos=0; pos<seqLen; pos++) {
+      if ((seq[pos] == 'N') || (seq[pos] == 'n')) {
+        if (gapBeg > pos)
+          gapBeg = pos;
+      } else if (gapBeg < pos) {
+        gaps.add(gapBeg, pos - gapBeg);
+        gapBeg = seqLen;
+      }
+    }
+
+    //  A gap that runs to the end of the sequence is never closed by a
+    //  following base, so it must be added here.
+    if (gapBeg < seqLen)
+      gaps.add(gapBeg, seqLen - gapBeg);
+
+    gaps.merge();
+    coverage[i]->merge();
+
+    uint32 coveredLength = coverage[i]->sumOfLengths();
+    uint32 gapLength     = gaps.sumOfLengths();
+    uint32 totalLength   = seqLen - gapLength;
+
+    //  Compare in 64 bits; 100 * length overflows 32 bits for long sequences.
+    if ((uint64)100 * coveredLength > (uint64)minCov * totalLength) {
+      sumOfLengths += coveredLength;
+      sequences++;
+
+      double cov = 100.0 * coveredLength / (double)totalLength;
+
+      fprintf(stderr, "sequence ["uint32FMT"] %s covered "uint32FMT" out of "uint32FMT" (%7.3f)\n",
+              i,
+              S->header(),
+              coveredLength,
+              totalLength,
+              cov);
+    }
+
+    //  Dump a special scaffold
+    if (i == 4796) {
+      for (uint32 z=0; z<coverage[i]->numberOfIntervals(); z++) {
+        fprintf(stderr, "interval[%3d] %6d - %6d\n", z, coverage[i]->lo(z), coverage[i]->hi(z));
+      }
+    }
+
+    //  Release the sequence whether or not it was reported.
+    delete S;
+  }
+
+  fprintf(stderr, "Found "uint32FMT" bases in "uint32FMT" scaffolds.\n", sumOfLengths, sequences);
+
+  return(0);
+}
+
diff --git a/sim4dbutils/depthOfPolishes.C b/sim4dbutils/depthOfPolishes.C
new file mode 100644
index 0000000..961c1f1
--- /dev/null
+++ b/sim4dbutils/depthOfPolishes.C
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// ./genomics/sim4dbutils/depthOfPolishes -v < runA.1.ms12.filtered.sim4db > depth-out
+// plot [112000:113000][] "depth-out" using 2 with lines
+
+//  Number of positions of the window [i-h, i+h] that fall inside a
+//  sequence of length len; i must be less than len.
+//
+static
+uint32
+windowSize(uint32 i, uint32 h, uint32 len) {
+  uint32 lo = (i > h)       ? i - h : 0;
+  uint32 hi = (i + h < len) ? i + h : len - 1;
+
+  return(hi - lo + 1);
+}
+
+
+//  Reads polishes from stdin, keeps those on genomic sequence seqIdx
+//  (-s), and writes, for every genomic position, the depth of polishes
+//  covering it followed by the average depth over centred windows of
+//  3, 5, 11, 51, 101, 201, 501, 1001 and 2001 positions.
+//
+int
+main(int argc, char **argv) {
+  uint32  genomeLength = 0;
+  uint32  seqIdx       = 0;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-l", 2) == 0) {
+      genomeLength = strtouint32(argv[++arg], 0L);
+
+    } else if (strncmp(argv[arg], "-s", 2) == 0) {
+      seqIdx = strtouint32(argv[++arg], 0L);
+
+    } else {
+      fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  intervalList<uint64>  IL;
+
+  sim4polishReader *R = new sim4polishReader("-");
+  sim4polish       *p = 0L;
+
+  while (R->nextAlignment(p)) {
+
+    //  Skip other sequences before touching the exons, and skip
+    //  polishes with no exons at all.
+    if ((p->_genID != seqIdx) || (p->_numExons == 0))
+      continue;
+
+    uint32 beg = p->_exons[0]._genFrom - 1;
+    uint32 end = p->_exons[p->_numExons-1]._genTo;
+
+    //  Grow the genome to hold every polish seen.
+    if (end > genomeLength)
+      genomeLength = end;
+
+    IL.add(beg, end-beg);
+  }
+
+  intervalList<uint64>  ID(IL);
+
+  //  The extra 1000 here is so we can be lazy in the
+  //  output section when computing averages.
+  //
+  uint32 *DD = new uint32 [genomeLength + 1000];
+  for (uint32 i=0; i<genomeLength + 1000; i++)
+    DD[i] = 0;
+
+  for (uint32 i=0; i<ID.numberOfIntervals(); i++) {
+    uint32 l = ID.lo(i);
+    uint32 h = ID.hi(i);
+    uint32 d = ID.count(i);
+
+    while (l < h) {
+      DD[l] = d;
+      l++;
+    }
+  }
+
+  //  Running sums over centred windows; each aveN holds the sum of DD[]
+  //  over [i-N/2, i+N/2], clipped at zero.
+
+  uint32 ave3    = 0;
+  uint32 ave5    = 0;
+  uint32 ave11   = 0;
+  uint32 ave51   = 0;
+  uint32 ave101  = 0;
+  uint32 ave201  = 0;
+  uint32 ave501  = 0;
+  uint32 ave1001 = 0;
+  uint32 ave2001 = 0;
+
+  //  Preload each sum with the positions left of centre for i == 0;
+  //  the loop below adds the new right edge before printing.
+  ave3 += DD[0];
+  ave5 += DD[0] + DD[1];
+
+  for (uint32 i=0; i<5;    i++)  ave11   += DD[i];
+  for (uint32 i=0; i<25;   i++)  ave51   += DD[i];
+  for (uint32 i=0; i<50;   i++)  ave101  += DD[i];
+  for (uint32 i=0; i<100;  i++)  ave201  += DD[i];
+  for (uint32 i=0; i<250;  i++)  ave501  += DD[i];
+  for (uint32 i=0; i<500;  i++)  ave1001 += DD[i];
+  for (uint32 i=0; i<1000; i++)  ave2001 += DD[i];
+
+  for (uint32 i=0; i<genomeLength; i++) {
+
+    //  Slide each window one to the right: add the new right edge and
+    //  drop the position that just fell off the left, DD[i - h - 1].
+    ave3    += DD[i+1]    - ((i > 1)    ? DD[i-2]    : 0);
+    ave5    += DD[i+2]    - ((i > 2)    ? DD[i-3]    : 0);
+    ave11   += DD[i+5]    - ((i > 5)    ? DD[i-6]    : 0);
+    ave51   += DD[i+25]   - ((i > 25)   ? DD[i-26]   : 0);
+    ave101  += DD[i+50]   - ((i > 50)   ? DD[i-51]   : 0);
+    ave201  += DD[i+100]  - ((i > 100)  ? DD[i-101]  : 0);
+    ave501  += DD[i+250]  - ((i > 250)  ? DD[i-251]  : 0);
+    ave1001 += DD[i+500]  - ((i > 500)  ? DD[i-501]  : 0);
+    ave2001 += DD[i+1000] - ((i > 1000) ? DD[i-1001] : 0);
+
+    //  Positions past the end of the genome are zero padding in DD[], so
+    //  they add nothing to the sums; divide by the in-genome count only.
+    fprintf(stdout, uint32FMT"\t"uint32FMT"\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+            i,
+            DD[i],
+            ave3    / (double)windowSize(i,    1, genomeLength),
+            ave5    / (double)windowSize(i,    2, genomeLength),
+            ave11   / (double)windowSize(i,    5, genomeLength),
+            ave51   / (double)windowSize(i,   25, genomeLength),
+            ave101  / (double)windowSize(i,   50, genomeLength),
+            ave201  / (double)windowSize(i,  100, genomeLength),
+            ave501  / (double)windowSize(i,  250, genomeLength),
+            ave1001 / (double)windowSize(i,  500, genomeLength),
+            ave2001 / (double)windowSize(i, 1000, genomeLength));
+  }
+
+  delete [] DD;
+  delete    R;
+
+  return(0);
+}
diff --git a/sim4dbutils/detectChimera.C b/sim4dbutils/detectChimera.C
new file mode 100644
index 0000000..74d131a
--- /dev/null
+++ b/sim4dbutils/detectChimera.C
@@ -0,0 +1,172 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// Looks for query sequences that are chimeric. It is assumed that
+// your queries have been mapped to a target reference genome, such
+// that little pieces will be mapped. The heuristic used is simple:
+// the mapping intervals are merged together, and if two or more
+// blocks remain that do not overlap, the query is chimeric. Each
+// interval is shrunk at both ends (-o, default 5bp) before merging.
+
+#define QUERY_LENGTH 2048
+
+//  Reads polishes from stdin, grouped by query, and reports queries
+//  whose alignments form more than one non-overlapping block on the
+//  query while together covering at least 90% of it.  For each such
+//  query the defline, a bar the length of the query, and one line per
+//  alignment (query extent, drawn extent, genomic extent) are printed.
+//
+//  A group ends when the query index changes or the input ends.
+//
+int
+main(int argc, char **argv) {
+  bool    beVerbose      = false;   //  Set by -v; nothing below reads it.
+  uint32  chimeraOverlap = 5;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-v", 2) == 0) {
+      beVerbose = true;
+    } else if (strncmp(argv[arg], "-o", 2) == 0) {
+      chimeraOverlap = strtouint32(argv[++arg], 0L);
+    } else {
+      fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  intervalList<uint64>  IL;        //  Extents shrunk by chimeraOverlap at each end.
+  intervalList<uint64>  ILfull;    //  Full extents.
+  uint32                ILid = 0;
+  char                  lastdefline[1024] = { 0 };
+
+  //  Extents of the alignments in the current group, for display.  Only
+  //  the first maxPts are stored; numPts counts all of them.
+  uint32   numPts = 0;
+  uint32   maxPts = 1024;
+  uint32  *begPt  = new uint32 [maxPts];
+  uint32  *endPt  = new uint32 [maxPts];
+
+  uint32  *genBeg = new uint32 [maxPts];
+  uint32  *genEnd = new uint32 [maxPts];
+
+  uint32   queryLength = 0;
+
+  //  Strings of QUERY_LENGTH identical characters; a terminator is
+  //  poked in (and later removed) to print a run of the wanted length.
+  char spaces[QUERY_LENGTH+1];
+  char lines[QUERY_LENGTH+1];
+  char equals[QUERY_LENGTH+1];
+
+  for (uint32 i=0; i<QUERY_LENGTH; i++) {
+    spaces[i] = ' ';
+    lines[i]  = '-';
+    equals[i] = '=';
+  }
+  spaces[QUERY_LENGTH] = 0;
+  lines[QUERY_LENGTH]  = 0;
+  equals[QUERY_LENGTH] = 0;
+
+  sim4polishReader *R = new sim4polishReader("-");
+  sim4polish       *p = 0L;
+
+  for (bool more = R->nextAlignment(p); ; more = R->nextAlignment(p)) {
+
+    //  Evaluate the finished group when the query changes, and also at
+    //  end of input -- otherwise the last query would never be examined.
+    if ((lastdefline[0] != 0) &&
+        ((more == false) || (p->_estID != ILid))) {
+
+      IL.merge();
+      ILfull.merge();
+
+      if ((IL.numberOfIntervals() > 1) &&
+          (ILfull.sumOfLengths() >= 0.9 * queryLength)) {
+        fprintf(stdout, "%s\n", lastdefline);
+
+        //  The bar cannot be longer than the buffer.  The final slot is
+        //  the permanent terminator and must not be turned back into '='.
+        uint32 barLen = (queryLength < QUERY_LENGTH) ? queryLength : QUERY_LENGTH;
+
+        equals[barLen] = 0;
+        fprintf(stdout, " %s\n", equals);
+        if (barLen < QUERY_LENGTH)
+          equals[barLen] = '=';
+
+        //  Only the stored alignments can be sorted and shown.
+        uint32 shown = (numPts < maxPts) ? numPts : maxPts;
+
+        //  Bubble sort the positions, by query begin then query end.
+        //
+        for (uint32 a=0; a<shown; a++) {
+          for (uint32 b=a+1; b<shown; b++) {
+            if ((begPt[a] > begPt[b]) ||
+                ((begPt[a] == begPt[b]) && (endPt[a] > endPt[b]))) {
+              uint32 x = begPt[a];
+              uint32 y = endPt[a];
+              begPt[a] = begPt[b];
+              endPt[a] = endPt[b];
+              begPt[b] = x;
+              endPt[b] = y;
+
+              x = genBeg[a];
+              y = genEnd[a];
+              genBeg[a] = genBeg[b];
+              genEnd[a] = genEnd[b];
+              genBeg[b] = x;
+              genEnd[b] = y;
+            }
+          }
+        }
+
+        for (uint32 i=0; i<shown; i++) {
+          if (begPt[i] >= QUERY_LENGTH) {
+            fprintf(stdout, "WARNING: Next line (begin) truncated to %d positions!\n", QUERY_LENGTH);
+            begPt[i] = QUERY_LENGTH-1;
+          }
+          if (endPt[i] >= QUERY_LENGTH) {
+            fprintf(stdout, "WARNING: Next line (end) truncated to %d positions!\n", QUERY_LENGTH);
+            endPt[i] = QUERY_LENGTH-1;
+          }
+
+          //  Guard the subtraction; an end before the begin draws nothing.
+          uint32 drawn = (endPt[i] > begPt[i]) ? endPt[i] - begPt[i] : 0;
+
+          spaces[begPt[i]] = 0;
+          lines[drawn]     = 0;
+          fprintf(stdout, uint32FMTW(3)"-"uint32FMTW(3)" %s%s ("uint32FMT","uint32FMT")\n",
+                  begPt[i], endPt[i], spaces, lines, genBeg[i], genEnd[i]);
+          spaces[begPt[i]] = ' ';
+          lines[drawn]     = '-';
+        }
+
+        fprintf(stdout, "\n\n");
+      }  //  end of chimera detected
+
+      IL.clear();
+      ILfull.clear();
+      numPts = 0;
+    }
+
+    if (more == false)
+      break;
+
+    //  Add this alignment to the current group.  The defline copy is
+    //  bounded; a longer defline is truncated.
+
+    snprintf(lastdefline, sizeof(lastdefline), "%s", p->_estDefLine);
+    ILid        = p->_estID;
+    queryLength = p->_estLen;
+
+    uint32 beg = p->_exons[0]._estFrom - 1;
+    uint32 end = p->_exons[p->_numExons-1]._estTo;
+
+    if (numPts == maxPts) {
+      fprintf(stdout, "Wow! The next guy is a deep mapping! I'm only showing the\n");
+      fprintf(stdout, "first "uint32FMT" alignments.\n", maxPts);
+    } else if (numPts < maxPts) {
+      begPt[numPts]  = beg;
+      endPt[numPts]  = end;
+      genBeg[numPts] = p->_exons[0]._genFrom - 1;
+      genEnd[numPts] = p->_exons[p->_numExons-1]._genTo;
+    }
+    numPts++;
+
+    //  Alignments too short to survive the shrinking are ignored entirely.
+    if (end - beg > 2 * chimeraOverlap) {
+      IL.add(beg + chimeraOverlap, end - beg - 2 * chimeraOverlap);
+      ILfull.add(beg, end - beg);
+    }
+  }
+
+  delete [] begPt;
+  delete [] endPt;
+  delete [] genBeg;
+  delete [] genEnd;
+  delete    R;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/doc.txt b/sim4dbutils/doc.txt
new file mode 100644
index 0000000..bf4bdbc
--- /dev/null
+++ b/sim4dbutils/doc.txt
@@ -0,0 +1,82 @@
+sim4db tools as of July 17, 2006
+
+filterPolishes
+headPolishes
+mappedCoverage
+mergePolishes
+sortPolishes
+pickBestPolish
+pickUniquePolish
+pickUniquePolish-nhgri
+cleanPolishes
+fixPolishesIID
+plotIntronSize
+plotCoverageVsIdentity
+parseSNP
+comparePolishes
+convertToAtac
+trimSequencesBasedOnMatches
+uniqPolishes
+summarizePolishes
+removeDuplicate
+vennPolishes
+realignPolishes
+removeRedundant
+reportAlignmentDifferences
+
+----------------------------------------
+
+filterPolishes
+
+Filters polishes by percent identity, percent coverage, length of
+alignment (number of matches), number of exons, query or genomic IID.
+Can segregate polishes, placing polishes for each genomic IID into a
+separate file.
+
+Also, can remove deflines or alignments, and can "normalize" the
+genomic coordinates by adding in the match offset.
+
+----------------------------------------
+
+headPolishes
+
+Like the UNIX head command, returns the first N polishes in a file.
+
+----------------------------------------
+
+mappedCoverage
+
+Returns the percentage of each query that is covered by an alignment.
+Also can mask out those regions with N's.
+
+----------------------------------------
+
+mergePolishes
+
+Merges multiple sets of polishes, mapped to the same genomic
+sequences, into one file, updating the query IID.
+
+----------------------------------------
+
+sortPolishes
+
+
+
+pickBestPolish
+pickUniquePolish
+pickUniquePolish-nhgri
+cleanPolishes
+fixPolishesIID
+plotIntronSize
+plotCoverageVsIdentity
+parseSNP
+comparePolishes
+convertToAtac
+trimSequencesBasedOnMatches
+uniqPolishes
+summarizePolishes
+removeDuplicate
+vennPolishes
+realignPolishes
+removeRedundant
+reportAlignmentDifferences
diff --git a/sim4dbutils/filterPolishes.C b/sim4dbutils/filterPolishes.C
new file mode 100644
index 0000000..7cab758
--- /dev/null
+++ b/sim4dbutils/filterPolishes.C
@@ -0,0 +1,297 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+//  Reads polishes from stdin and sorts each into one of three bins:
+//
+//    junk -- strand orientation is intractable or failed, and -j was given
+//    good -- passes every threshold and selection given on the command line
+//    crap -- everything else
+//
+//  Good polishes go to the -o file (or stdout), or, with -segregate, to
+//  one file per genomic index.  Discarded polishes go to the -d file (or
+//  stdout).  Either stream can be silenced.
+//
+//  NOTE(review): the usage text says intractable/aborted polishes are
+//  silently discarded without -j, but the code below sends them through
+//  the normal filter in that case -- confirm which is intended.
+//
+int
+main(int argc, char ** argv) {
+  uint32              minC            = 0;
+  uint32              minI            = 0;
+  uint32              minL            = 0;
+  uint32              cdna            = ~uint32ZERO;   //  All bits set means "any".
+  uint32              geno            = ~uint32ZERO;
+  uint32              minExons        = 0;
+  uint32              maxExons        = ~uint32ZERO;
+  uint32              beVerbose       = 0;
+  int                 GOODsilent      = 0;
+  sim4polishWriter   *GOOD            = 0L;
+  int                 CRAPsilent      = 0;
+  sim4polishWriter   *CRAP            = 0L;
+  sim4polishWriter   *JUNK            = 0L;
+  uint64              pmod            = 1;             //  Next count at which to report progress.
+  uint64              good            = 0;
+  uint64              crap            = 0;
+  uint64              junk            = 0;
+  int                 doSelfFilter    = 0;
+  int                 doSegregation   = 0;
+  uint32              doSegregationLo = 0;
+  uint32              doSegregationHi = 0;
+  char               *filePrefixGOOD  = 0L;
+  char               *filePrefixCRAP  = 0L;
+  char               *filePrefixJUNK  = 0L;
+  sim4polishWriter  **SEGREGATE       = 0L;
+  bool                noDefLines      = false;
+  bool                noAlignments    = false;
+  bool                doGFF3          = false;
+  sim4polishStyle     style           = sim4polishStyleDefault;
+
+  //  We limit scaffolds to be below the number of open files per
+  //  process.
+  //
+  uint32 maxScaffold = sysconf(_SC_OPEN_MAX);
+
+  //  Options are matched on a prefix only (the third strncmp argument).
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-verbose", 2) == 0) {
+      beVerbose = 1;
+
+    } else if (strncmp(argv[arg], "-c", 2) == 0) {
+      minC = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-i", 2) == 0) {
+      minI = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-l", 2) == 0) {
+      minL = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-minexons", 3) == 0) {
+      minExons = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-maxexons", 3) == 0) {
+      maxExons = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-o", 2) == 0) {
+      filePrefixGOOD = argv[++arg];
+      GOODsilent = 0;
+
+    } else if (strncmp(argv[arg], "-O", 2) == 0) {
+      GOODsilent = 1;
+
+    } else if (strncmp(argv[arg], "-d", 2) == 0) {
+      filePrefixCRAP = argv[++arg];
+      CRAPsilent = 0;
+
+    } else if (strncmp(argv[arg], "-q", 2) == 0) {
+      CRAPsilent = 1;
+
+    } else if (strncmp(argv[arg], "-D", 2) == 0) {
+      CRAPsilent = 1;
+
+    } else if (strncmp(argv[arg], "-j", 2) == 0) {
+      filePrefixJUNK = argv[++arg];
+
+    } else if (strncmp(argv[arg], "-C", 2) == 0) {
+      cdna = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-G", 2) == 0) {
+      geno = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-selfhits", 4) == 0) {
+      doSelfFilter = 1;
+
+    } else if (strncmp(argv[arg], "-segregate", 4) == 0) {
+      doSegregation   = 1;
+      doSegregationLo = atoi(argv[++arg]);
+      doSegregationHi = atoi(argv[++arg]);
+      if (doSegregationHi - doSegregationLo + 1 > maxScaffold)
+        fprintf(stderr, "error: -segregate range too big; must be less than %u.\n", maxScaffold), exit(1);
+      SEGREGATE = new sim4polishWriter * [maxScaffold];
+      memset(SEGREGATE, 0, sizeof(sim4polishWriter *) * maxScaffold);
+
+    } else if (strncmp(argv[arg], "-nodeflines", 4) == 0) {
+      noDefLines = true;
+
+    } else if (strncmp(argv[arg], "-noalignments", 4) == 0) {
+      noAlignments = true;
+
+    } else if (strncmp(argv[arg], "-gff3", 4) == 0) {
+      doGFF3 = true;
+      style  = sim4polishGFF3;
+
+    } else {
+      fprintf(stderr, "UNKNOWN option '%s'\n", argv[arg]);
+      exit(1);
+    }
+
+    arg++;
+  }
+
+  if (isatty(fileno(stdin))) {
+    fprintf(stderr, "usage: %s [-c c] [-i i] [-o o]\n", argv[0]);
+    fprintf(stderr, " -verbose Report progress\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -c c Discard polishes below c%% composite (default: 0).\n");
+    fprintf(stderr, " -i i Discard polishes below i%% identity (default: 0).\n");
+    fprintf(stderr, " -l l Discard polishes below l identities (default: 0).\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -minexons e Discard polishes below e exons (default: 0).\n");
+    fprintf(stderr, " -maxexons e Discard polishes above e exons (default: infinity).\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -C c Discard polishes that are not from cDNA idx 'c'\n");
+    fprintf(stderr, " -G g Discard polishes that are not from genomic idx 'g'\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -o o Write saved polishes to the 'o' file (default == stdout).\n");
+    fprintf(stderr, " -O Don't write saved polishes.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -d o Write discarded polishes to the 'o' file (default == stdout).\n");
+    fprintf(stderr, " -D Don't write discarded polishes.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -j o Write intractable and aborted polishes to the 'o' file. By\n");
+    fprintf(stderr, " default these are silently discarded.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -selfhits Filter out alignments to ourself -- if you did an all-to-all\n");
+    fprintf(stderr, " mapping of a set onto itself. Deflines needed!\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -segregate a b Segregate polishes by genomic idx, for idx's between a and b inclusive.\n");
+    fprintf(stderr, " b-a must be less than %u.\n", maxScaffold);
+    fprintf(stderr, " Must be used with -o.\n");
+    fprintf(stderr, " Will create numerous files 'o.%%04d'.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -nodeflines Strip out deflines.\n");
+    fprintf(stderr, " -noalignments Strip out alignments.\n");
+    fprintf(stderr, " -gff3 Write output in GFF3 format.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " All conditions must be met.\n");
+    exit(1);
+  }
+
+  if ((CRAPsilent == 0) && (GOODsilent == 0) && (filePrefixGOOD == 0L) && (filePrefixCRAP == 0L)) {
+    fprintf(stderr, "error: filter has no effect; saved and discarded polishes\n");
+    fprintf(stderr, " both printed to the same place!\n");
+    fprintf(stderr, " (try using one of -o, -O, -d, -D)\n");
+    exit(1);
+  }
+
+  if (doSegregation && (filePrefixGOOD == 0L)) {
+    fprintf(stderr, "error: you must specify a file prefix when segregating (-segregate requires -o)\n");
+    exit(1);
+  }
+
+  if (noDefLines && doGFF3)
+    fprintf(stderr, "warning: No deflines option inactive with GFF3.\n");
+
+  if (beVerbose) {
+    fprintf(stderr, "Filtering at "uint32FMT"%% coverage and "uint32FMT"%% identity and "uint32FMT"bp.\n", minC, minI, minL);
+
+    //  Both selections, then each one alone.
+    if ((cdna != ~uint32ZERO) && (geno != ~uint32ZERO))
+      fprintf(stderr, "Filtering for cDNA idx "uint32FMT" and genomic idx "uint32FMT"\n", cdna, geno);
+    else if (cdna != ~uint32ZERO)
+      fprintf(stderr, "Filtering for cDNA idx "uint32FMT".\n", cdna);
+    else if (geno != ~uint32ZERO)
+      fprintf(stderr, "Filtering for genomic idx "uint32FMT".\n", geno);
+  }
+
+  //  Prepare input files
+
+  sim4polishReader *R = new sim4polishReader("-");
+  sim4polish       *p = 0L;
+
+  if (R->getsim4polishStyle() != style)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+  //  Prepare output files
+  //
+  //  NOTE(review): the stdout writers here, and the per-index writers
+  //  below, are always created as sim4polishS4DB, even when -gff3 was
+  //  given -- confirm that is intended.
+
+  if (filePrefixGOOD != 0L)
+    GOOD = new sim4polishWriter(filePrefixGOOD, style);
+  if (filePrefixCRAP != 0L)
+    CRAP = new sim4polishWriter(filePrefixCRAP, style);
+  if (filePrefixJUNK != 0L)
+    JUNK = new sim4polishWriter(filePrefixJUNK, style);
+
+  if ((CRAPsilent == 0) && (CRAP == 0L))
+    CRAP = new sim4polishWriter("-", sim4polishS4DB);
+
+  if ((GOODsilent == 0) && (GOOD == 0L))
+    GOOD = new sim4polishWriter("-", sim4polishS4DB);
+
+
+  //  Start processing
+
+  while (R->nextAlignment(p)) {
+
+    if (noDefLines && (doGFF3 == false))
+      p->s4p_removeDefLines();
+    if (noAlignments)
+      p->s4p_removeAlignments();
+
+    if (JUNK && ((p->_strandOrientation == SIM4_STRAND_INTRACTABLE) ||
+                 (p->_strandOrientation == SIM4_STRAND_FAILED))) {
+      junk++;
+      JUNK->writeAlignment(p);
+    } else {
+      if ((p->_percentIdentity  >= minI) &&
+          (p->_querySeqIdentity >= minC) &&
+          (p->_numCovered       >= minL) &&
+          ((cdna == ~uint32ZERO) || (cdna == p->_estID)) &&
+          ((geno == ~uint32ZERO) || (geno == p->_genID)) &&
+          (minExons <= p->_numExons) &&
+          (p->_numExons <= maxExons) &&
+          ((doSelfFilter == 0) || (strcmp(p->_estDefLine, p->_genDefLine) != 0))) {
+        good++;
+        if (doSegregation) {
+          //  Good polishes outside the requested index range are counted
+          //  but not written anywhere.
+          if ((doSegregationLo <= p->_genID) &&
+              (p->_genID <= doSegregationHi)) {
+            //  Open each per-index file the first time it is needed.
+            if (SEGREGATE[p->_genID - doSegregationLo] == 0L) {
+              char filename[1024];
+              snprintf(filename, sizeof(filename), "%s.%04d", filePrefixGOOD, (int)p->_genID);
+              SEGREGATE[p->_genID - doSegregationLo] = new sim4polishWriter(filename, sim4polishS4DB);
+            }
+            SEGREGATE[p->_genID - doSegregationLo]->writeAlignment(p);
+          }
+        } else {
+          if (!GOODsilent)
+            GOOD->writeAlignment(p);
+        }
+      } else {
+        crap++;
+        if (!CRAPsilent)
+          CRAP->writeAlignment(p);
+      }
+    }
+
+    //  Report progress at irregular intervals; good+crap is non-zero here
+    //  because pmod starts at one.
+    if ((beVerbose) && ((good+crap) == pmod)) {
+      pmod += 8888 + (random() % 1000);
+      if (junk > 0)
+        fprintf(stderr, " Filter: %6.2f%% ("uint64FMT" matches processed) ("uint64FMT" failed/intractable)\r",
+                100.0 * good / (good+crap),
+                good+crap,
+                junk);
+      else
+        fprintf(stderr, " Filter: %6.2f%% ("uint64FMT" matches processed)\r",
+                100.0 * good / (good+crap),
+                good+crap);
+      fflush(stderr);
+    }
+  }
+
+
+  if (beVerbose) {
+    //  Nothing may have been filtered at all; avoid dividing by zero.
+    double percentGood = ((good + crap) > 0) ? (100.0 * good / (good+crap)) : 0.0;
+
+    if (junk > 0)
+      fprintf(stderr, " Filter: %6.2f%% ("uint64FMT" matches processed) ("uint64FMT" failed/intractable)\n",
+              percentGood,
+              good+crap,
+              junk);
+    else
+      fprintf(stderr, " Filter: %6.2f%% ("uint64FMT" matches processed)\n",
+              percentGood,
+              good+crap);
+  }
+
+  delete R;
+
+  if (doSegregation) {
+    for (uint32 i=0; i<maxScaffold; i++)
+      if (SEGREGATE[i])
+        delete SEGREGATE[i];
+    delete [] SEGREGATE;
+  }
+
+  //  Release every writer, including the one for discarded polishes.
+  delete GOOD;
+  delete CRAP;
+  delete JUNK;
+
+  return(0);
+}
diff --git a/sim4dbutils/fixPolishesIID.C b/sim4dbutils/fixPolishesIID.C
new file mode 100644
index 0000000..cefe0c1
--- /dev/null
+++ b/sim4dbutils/fixPolishesIID.C
@@ -0,0 +1,128 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+#include <map>
+#include <string>
+
+using namespace std;
+
+// Updates the IID's in a set of polishes. If a file of deflines (or
+// fasta file) is supplied, the IIDs will match those, otherwise,
+// they remain the same.
+
+ // Reads every sequence from the file named 'n' and stores, in 'd', the IID
+ // reported for each sequence, keyed by the sequence header.  A header that
+ // occurs more than once keeps the IID of its last occurrence.  Does nothing
+ // if 'n' is null.
+ //
+ void
+ addToDict(map<string, uint64> &d, char *n) {
+
+   if (n != 0L) {
+     seqCache  *cache = new seqCache(n);
+     seqInCore *seq   = 0L;
+
+     // Each sequence returned by the cache is ours to release.
+     for (seq = cache->getSequenceInCore(); seq != 0L; seq = cache->getSequenceInCore()) {
+       string key = seq->header();
+
+       d[key] = seq->getIID();
+
+       delete seq;
+     }
+
+     delete cache;
+   }
+ }
+
+
+
+ // Rewrites the polishes read from stdin to stdout, replacing the cDNA and/or
+ // genomic sequence index of each polish with the index that sequence has in
+ // the fasta file given with -c and/or -g.  Sequences are matched by defline.
+ // A polish whose defline is not present in a supplied fasta file is dropped.
+ //
+ // Returns 0 on success; exits with status 1 on a usage error.
+ //
+ int
+ main(int argc, char **argv) {
+   char *cDeflines = 0L;
+   char *gDeflines = 0L;
+   int   err       = 0;
+
+   sim4polishStyle style = sim4polishStyleDefault;
+
+   int arg=1;
+   while (arg < argc) {
+     if        (strcmp(argv[arg], "-c") == 0) {
+       // An option that takes a value must not be the last word.
+       if (arg + 1 < argc)
+         cDeflines = argv[++arg];
+       else
+         err++;
+
+     } else if (strcmp(argv[arg], "-g") == 0) {
+       if (arg + 1 < argc)
+         gDeflines = argv[++arg];
+       else
+         err++;
+
+     } else if (strcmp(argv[arg], "-gff3") == 0) {
+       style = sim4polishGFF3;
+
+     } else {
+       fprintf(stderr, "Unknown arg: %s\n", argv[arg]);
+     }
+     arg++;
+   }
+   if ((err) || (isatty(fileno(stdin)))) {
+     fprintf(stderr, "usage: %s [-c c.fasta] [-g g.fasta] [-gff3] < polishes > polishes\n", argv[0]);
+     fprintf(stderr, " -c c.fasta Read cDNA deflines from c.fasta\n");
+     fprintf(stderr, " -g g.fasta Read genomic deflines from g.fasta\n");
+     fprintf(stderr, " -gff3 Write output as GFF3\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " Rewrites the input polishes, updating the sequence index to match\n");
+     fprintf(stderr, " that of the associated fasta file. One or both of -c and -g may be used.\n");
+     fprintf(stderr, " Polishes that refer to a sequence not present in the input fasta file are\n");
+     fprintf(stderr, " not output.\n");
+     if (err)
+       fprintf(stderr, "\nerror: -c and -g each need a file name.\n");
+     exit(1);
+   }
+
+   // We parse args, then build the dictionaries, so we can do
+   // any quick error detection first.
+
+   map<string, uint64> g;
+   map<string, uint64> c;
+
+   if (gDeflines) {
+     fprintf(stderr, "Reading genomic deflines from '%s'\n", gDeflines);
+     addToDict(g, gDeflines);
+   }
+
+   if (cDeflines) {
+     fprintf(stderr, "Reading cDNA deflines from '%s'\n", cDeflines);
+     addToDict(c, cDeflines);
+   }
+
+   // Read all the matches, changing IIDs.  A match whose defline is
+   // not in the corresponding dictionary is skipped, not output.
+
+   sim4polishWriter *W = new sim4polishWriter("-", style);
+   sim4polishReader *R = new sim4polishReader("-");
+   sim4polish       *p = 0L;
+
+   if (R->getsim4polishStyle() != style)
+     fprintf(stderr, "warning: input format and output format differ.\n");
+
+   fprintf(stderr, "Filtering polishes.\n");
+
+   while (R->nextAlignment(p)) {
+
+     if (cDeflines != 0L) {
+       string cd = p->_estDefLine;
+       map<string, uint64>::iterator ci = c.find(cd);
+
+       if (ci == c.end())
+         // EST defline not in the input sequences, don't output.
+         continue;
+
+       p->_estID = ci->second;
+     }
+
+     if (gDeflines != 0L) {
+       string gd = p->_genDefLine;
+       map<string, uint64>::iterator gi = g.find(gd);
+
+       if (gi == g.end())
+         // Genomic defline not in the input sequences, don't output.
+         continue;
+
+       p->_genID = gi->second;
+     }
+
+     W->writeAlignment(p);
+   }
+
+   delete R;
+   delete W;
+
+   return(0);
+ }
diff --git a/sim4dbutils/headPolishes.C b/sim4dbutils/headPolishes.C
new file mode 100644
index 0000000..38fc1d1
--- /dev/null
+++ b/sim4dbutils/headPolishes.C
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// Writes n polishes from stdin to stdout, default 1.
+
+ // Copies the first n polishes of the input to stdout (n defaults to 1).
+ //
+ //   -h        show usage
+ //   -n #      number of polishes to write
+ //   -#        same, e.g. "-5"
+ //   -gff3     write GFF3 output
+ //   file      read polishes from 'file' instead of stdin; if several files
+ //             are named, only the last one is read
+ //
+ // Returns 0 on success; exits with status 1 on a usage error.
+ //
+ int
+ main(int argc, char **argv) {
+   uint32 numToPrint = 1;
+   sim4polishReader *R = 0L;
+   sim4polishWriter *W = 0L;
+
+   sim4polishStyle style = sim4polishStyleDefault;
+
+   int arg = 1;
+   int err = 0;
+   while (arg < argc) {
+     if (strncmp(argv[arg], "-h", 2) == 0) {
+       err++;
+
+     } else if (strncmp(argv[arg], "-n", 2) == 0) {
+       // -n needs a value; without this check atoi() would be handed the
+       // null pointer that terminates argv.
+       if (arg + 1 < argc)
+         numToPrint = atoi(argv[++arg]);
+       else
+         err++;
+
+     } else if (strcmp(argv[arg], "-gff3") == 0) {
+       style = sim4polishGFF3;
+
+     } else if (strncmp(argv[arg], "-", 1) == 0) {
+       numToPrint = atoi(argv[arg] + 1);
+
+     } else {
+       // Release a reader opened for an earlier file name.
+       delete R;
+       R = new sim4polishReader(argv[arg]);
+     }
+
+     arg++;
+   }
+   if ((err) || ((R == 0L) && (isatty(fileno(stdin))))) {
+     fprintf(stderr, "usage: %s [-h] [-# | -n #] [-gff3] [polishes-file]\n", argv[0]);
+     exit(1);
+   }
+
+   if (R == 0L)
+     R = new sim4polishReader("-");
+
+   W = new sim4polishWriter("-", style);
+
+   if (R->getsim4polishStyle() != style)
+     fprintf(stderr, "warning: Input format and output format differ.\n");
+
+   sim4polish *p = 0L;
+
+   // The count is tested before reading, so no extra polish is consumed.
+   while ((numToPrint--) && (R->nextAlignment(p)))
+     W->writeAlignment(p);
+
+   delete W;
+   delete R;
+
+   return(0);
+ }
+
diff --git a/sim4dbutils/mappedCoverage.C b/sim4dbutils/mappedCoverage.C
new file mode 100644
index 0000000..8cbd420
--- /dev/null
+++ b/sim4dbutils/mappedCoverage.C
@@ -0,0 +1,250 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+// Reports the amount of sequence covered by ALL matches for that
+// sequence. Example: if sequence iid 4 has two matches, one
+// covering the first 30% and the second covering the last 30%, this
+// will report that sequence iid 4 is covered 60%.
+//
+ // Reads matches from stdin; the options are described by the usage message in main().
+
+ // Reports, for each query sequence, how much of it is covered by the union
+ // of all its matches and, with -mask, writes the sequences with every
+ // covered base replaced by 'N'.
+ //
+ // Matches are read from stdin: sim4db polishes by default, 'iid begin end'
+ // lines with -raw, or blast tabular lines with -blast.  With -mask the masked
+ // sequences go to stdout and statistics are written only if -cov is given;
+ // without -mask the statistics go to stdout (or to the -cov file).
+ //
+ // Each statistics line holds: iid, sequence length, fraction covered, number
+ // of regions before merging, sum of the region lengths before merging.
+ //
+ // Returns 0 on success; exits with status 1 on usage or input errors.
+ //
+ int
+ main(int argc, char **argv) {
+   uint32 covMax = 0;
+   intervalList<uint64> **cov = 0L;
+   uint32 *len = 0L;
+
+   uint32 lastIID = 0;
+
+   bool isRaw = false;
+   bool isBlast = false;
+
+   char *fastaname = 0L;
+   char *covname = 0L;
+
+   seqCache *F = 0L;
+
+   FILE *C = stdout;
+
+   int arg=1;
+   int err=0;
+   while (arg < argc) {
+     if (strcmp(argv[arg], "-mask") == 0) {
+       // Options taking a value must not be the last word.
+       if (++arg < argc)
+         fastaname = argv[arg];
+       else
+         err++;
+
+     } else if (strcmp(argv[arg], "-cov") == 0) {
+       if (++arg < argc)
+         covname = argv[arg];
+       else
+         err++;
+
+     } else if (strcmp(argv[arg], "-raw") == 0) {
+       isRaw = true;
+
+     } else if (strcmp(argv[arg], "-blast") == 0) {
+       isBlast = true;
+
+     } else {
+       fprintf(stderr, "unknown arg: '%s'\n", argv[arg]);
+       err++;
+     }
+     arg++;
+   }
+   if ((err) || (isatty(fileno(stdin)))) {
+     fprintf(stderr, "usage: %s [-mask in.fasta] [-cov dat] [-raw | -blast] < sim4db-results\n", argv[0]);
+     fprintf(stderr, " -mask Read sequences from in.fasta, replace with 'N'\n");
+     fprintf(stderr, " any base with an alignment, write to stdout\n");
+     fprintf(stderr, " -cov Write coverage statistics to 'dat' instead of stdout\n");
+     fprintf(stderr, " -raw If present, assume the 'sim4db-results' are\n");
+     fprintf(stderr, " a space-separated list of 'iid begin end', one per line\n");
+     fprintf(stderr, " -blast Same idea as raw, expects 'UID.IID' for query id,\n");
+     fprintf(stderr, " blast format (-m) 9.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, "Output on stdout is the masked sequence if -mask is specified,\n");
+     fprintf(stderr, "otherwise, it is the coverage statistics.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, "-mask is almost a required option - we need it to get the length\n");
+     fprintf(stderr, "of sequences with no mapping (100%% uncovered) and to get the\n");
+     fprintf(stderr, "number of sequences.\n");
+     fprintf(stderr, "\n");
+     if (isatty(fileno(stdin)))
+       fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+
+     // Nothing useful can be done after a usage error.
+     exit(1);
+   }
+
+   if (fastaname) {
+     C = 0L;
+     F = new seqCache(fastaname);
+   }
+
+   if (covname) {
+     errno = 0;
+     C = fopen(covname, "w");
+     if (C == 0L) {
+       fprintf(stderr, "Failed to open '%s' for write: %s\n", covname, strerror(errno));
+       exit(1);
+     }
+   }
+
+   // Without a fasta file the number of sequences is unknown; guess.
+   covMax = 1024 * 1024;
+   if (F)
+     covMax = F->getNumberOfSequences();
+   cov = new intervalList<uint64> * [covMax];
+   len = new uint32 [covMax];
+
+   fprintf(stderr, "Found " uint32FMT " sequences in the input file.\n", covMax);
+
+   for (uint32 i=0; i<covMax; i++) {
+     cov[i] = 0L;
+     len[i] = 0;
+   }
+
+   if (isRaw || isBlast) {
+     char inLine[1024];
+     splitToWords S;
+
+     // Loop on the result of fgets(); testing feof() before reading would
+     // process the last line twice.
+     //
+     // NOTE(review): a line with fewer fields than expected is not detected
+     // here -- confirm what splitToWords returns for a missing word.
+     //
+     while (fgets(inLine, 1024, stdin) != 0L) {
+       S.split(inLine);
+
+       uint32 iid=0, beg=0, end=0;
+
+       if (isRaw) {
+         iid = strtouint32(S[0], 0L);
+         beg = strtouint32(S[1], 0L) - 1; // Convert to space-based
+         end = strtouint32(S[2], 0L);
+       }
+       if (isBlast) {
+         char *iii = S[0];
+         while ((*iii != '.') && (*iii))
+           iii++;
+
+         // No '.', or nothing after it: there is no IID to parse.  Skip
+         // the line instead of reading past the end of the word.
+         if ((iii[0] == 0) || (iii[1] == 0)) {
+           fprintf(stderr, "UID.IID error: '%s'\n", S[0]);
+           continue;
+         }
+         iii++;
+
+         iid = strtouint32(iii, 0L);
+         beg = strtouint32(S[6], 0L) - 1; // Convert to space-based
+         end = strtouint32(S[7], 0L);
+       }
+
+       if (iid >= covMax) {
+         fprintf(stderr, "ERROR: Found iid " uint32FMT ", but only allocated " uint32FMT " places!\n",
+                 iid, covMax);
+         exit(1);
+       }
+       if (cov[iid] == 0L) {
+         cov[iid] = new intervalList<uint64>;
+         len[iid] = 0;
+       }
+       if (iid >= lastIID) {
+         lastIID = iid + 1;
+       }
+       cov[iid]->add(beg, end-beg);
+     }
+
+   } else {
+     sim4polishReader *R = new sim4polishReader("-");
+     sim4polish *p = 0L;
+
+     while (R->nextAlignment(p)) {
+       if (p->_estID > covMax)
+         fprintf(stderr, "DIE! You have more sequences in your polishes than in your source!\n"), exit(1);
+
+       if (p->_estID >= covMax) {
+         fprintf(stderr, "ERROR: Found iid " uint32FMT ", but only allocated " uint32FMT " places!\n",
+                 p->_estID, covMax);
+         exit(1);
+       }
+       if (cov[p->_estID] == 0L) {
+         cov[p->_estID] = new intervalList<uint64>;
+         len[p->_estID] = p->_estLen;
+       }
+       if (p->_estID >= lastIID) {
+         lastIID = p->_estID + 1;
+       }
+
+       for (uint32 e=0; e<p->_numExons; e++) {
+         p->_exons[e]._estFrom--; // Convert to space-based
+
+         // Complement matches are flipped back onto the forward sequence.
+         if (p->_matchOrientation == SIM4_MATCH_FORWARD)
+           cov[p->_estID]->add(p->_exons[e]._estFrom,
+                               p->_exons[e]._estTo - p->_exons[e]._estFrom);
+         else
+           cov[p->_estID]->add(p->_estLen - p->_exons[e]._estTo,
+                               p->_exons[e]._estTo - p->_exons[e]._estFrom);
+       }
+     }
+
+     delete R;
+   }
+
+
+   // Scan the list of intervalLists, compute the amount covered, print.
+   //
+   for (uint32 iid=0; iid<lastIID; iid++) {
+
+     // Argh! If there are no intervals, we need to report the whole
+     // sequence is uncovered!
+
+     uint32 numRegions = 0;
+     uint32 sumLengths = 0;
+     uint32 l, h;
+
+     // Save the number of regions and the sum of their lengths,
+     // then merge regions
+     //
+     if (cov[iid]) {
+       numRegions = cov[iid]->numberOfIntervals();
+       sumLengths = cov[iid]->sumOfLengths();
+       cov[iid]->merge();
+     }
+
+     if (F) {
+       seqInCore *S = F->getSequenceInCore(iid);
+
+       if (len[iid] == 0)
+         len[iid] = S->sequenceLength();
+
+       assert(len[iid] == S->sequenceLength());
+
+       char *seq = new char [len[iid] + 1];
+       strcpy(seq, S->sequence());
+
+       for (uint32 p=0; p<len[iid]; p++)
+         seq[p] = toUpper[seq[p]];
+
+       if (cov[iid]) {
+         for (uint32 c=0; c<cov[iid]->numberOfIntervals(); c++) {
+           l = cov[iid]->lo(c);
+           h = cov[iid]->hi(c);
+
+           if (h > len[iid]) {
+             fprintf(stderr, "ERROR: range " uint32FMT "-" uint32FMT " out of bounds (seqLen = " uint32FMT ")\n",
+                     l, h, len[iid]);
+             assert(h <= len[iid]);
+           }
+
+           // Covered bases are replaced by 'N'.
+           for (uint32 p=l; p<h; p++)
+             seq[p] = 'N';
+         }
+       }
+
+       fprintf(stdout, "%s\n%s\n", S->header(), seq);
+
+       delete [] seq;
+       delete S;
+     }
+
+     if (C) {
+       double percentCovered = 0.00;
+
+       // The length is unknown (zero) for -raw/-blast input without -mask;
+       // report 0 rather than dividing by zero.
+       if ((cov[iid]) && (len[iid] > 0))
+         percentCovered = cov[iid]->sumOfLengths() / (double)len[iid];
+
+       fprintf(C, uint32FMT "\t" uint32FMT "\t%5.3f\t" uint32FMT "\t" uint32FMT "\n",
+               iid,
+               len[iid],
+               percentCovered,
+               numRegions,
+               sumLengths);
+     }
+   }
+
+   // Release everything we allocated or opened.
+   for (uint32 i=0; i<covMax; i++)
+     delete cov[i];
+
+   delete [] cov;
+   delete [] len;
+   delete F;
+
+   if ((C) && (C != stdout))
+     fclose(C);
+
+   return(0);
+ }
diff --git a/sim4dbutils/mergePolishes.C b/sim4dbutils/mergePolishes.C
new file mode 100644
index 0000000..4f2ab98
--- /dev/null
+++ b/sim4dbutils/mergePolishes.C
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "bio++.H"
+//#include "fasta.H"
+#include "sim4.H"
+
+ // usage: mergePolishes -m match1 cdna1 -m match2 cdna2 -m ... -o match cdna [-gff3]
+ //
+ // Merges the results from two or more ESTmapper runs. The runs MUST be on
+ // the same genomic sequence using DIFFERENT cDNA inputs.
+
+ // Reads the next polish of input 'idx' into polishes[idx].  When one was
+ // read, its cDNA index is shifted by numSeqs[idx], the offset of that
+ // input's sequences in the merged sequence file.
+ //
+ static
+ void
+ loadNext(uint32 idx, sim4polish **polishes, sim4polishReader **inMatch, uint32 *numSeqs) {
+   sim4polishReader *reader = inMatch[idx];
+
+   if (!reader->nextAlignment(polishes[idx]))
+     return;
+
+   polishes[idx]->_estID += numSeqs[idx];
+ }
+
+ // Merges several polish files, each with its own cDNA sequence file, into
+ // one polish file and one sequence file.
+ //
+ //   -m match cdna   an input polish file and the sequences it refers to
+ //   -o match cdna   the merged polish file and merged sequence file
+ //   -gff3           write GFF3 output
+ //
+ // The input sequence files are concatenated in command line order, and the
+ // cDNA index of every polish is shifted by the number of sequences in the
+ // inputs before it.  The polishes are combined by repeatedly writing the
+ // lowest pending polish according to s4p_genIDcompare().
+ //
+ // Returns 0 on success; exits with status 1 on usage or file errors.
+ //
+ int
+ main(int argc, char **argv) {
+   char **inMatchName = new char * [argc];
+   char **inSeqName = new char * [argc];
+   char *otMatchName = 0L;
+   char *otSeqName = 0L;
+
+   sim4polishReader **inMatch = new sim4polishReader * [argc];
+   sim4polish **polishes = new sim4polish * [argc];
+   sim4polishWriter *otMatch = 0L;
+   uint32 *numSeqs = new uint32 [argc];
+
+   uint32 numIn = 0;
+
+   sim4polishStyle style = sim4polishStyleDefault;
+
+   // The slots of polishes[] are handed to nextAlignment() and tested
+   // against null below, so they must start out null.
+   for (int i=0; i<argc; i++) {
+     inMatchName[i] = 0L;
+     inSeqName[i] = 0L;
+     inMatch[i] = 0L;
+     polishes[i] = 0L;
+     numSeqs[i] = 0;
+   }
+
+   // Every branch advances 'arg'; -m and -o need two values after them.
+   int arg = 1;
+   int err = 0;
+   while (arg < argc) {
+     if ((strcmp(argv[arg], "-m") == 0) && (arg + 2 < argc)) {
+       inMatchName[numIn] = argv[arg + 1];
+       inSeqName[numIn] = argv[arg + 2];
+
+       inMatch[numIn] = new sim4polishReader(inMatchName[numIn]);
+       numIn++;
+       arg += 3;
+
+     } else if ((strcmp(argv[arg], "-o") == 0) && (arg + 2 < argc)) {
+       otMatchName = argv[arg + 1];
+       otSeqName = argv[arg + 2];
+       arg += 3;
+
+     } else if (strcmp(argv[arg], "-gff3") == 0) {
+       style = sim4polishGFF3;
+       arg++;
+
+     } else {
+       fprintf(stderr, "unknown or incomplete option: %s\n", argv[arg]);
+       err++;
+       arg++;
+     }
+   }
+
+   if ((err) || (numIn < 1) || (otMatchName == 0L) || (otSeqName == 0L)) {
+     fprintf(stderr, "usage: %s -o match cdna -m match1 cdna1 -m match2 cdna2 -m ... [-gff3]\n", argv[0]);
+     exit(1);
+   }
+
+   otMatch = new sim4polishWriter(otMatchName, style);
+
+   for (uint32 i=0; i<numIn; i++)
+     if (inMatch[i]->getsim4polishStyle() != style) {
+       fprintf(stderr, "warning: input format and output format may differ.\n");
+       break;
+     }
+
+   // Merge the input sequences into the output sequence. We also count the number of sequences
+   // here, so we don't need random-access of the input.
+   //
+   fprintf(stderr, "Merging sequences.\n");
+
+   errno = 0;
+   FILE *O = fopen(otSeqName, "w");
+   if (O == 0L) {
+     fprintf(stderr, "Failed to open '%s' for write: %s\n", otSeqName, strerror(errno));
+     exit(1);
+   }
+
+   for (uint32 i=0; i<numIn; i++) {
+     seqCache *I = new seqCache(inSeqName[i]);
+     seqInCore *B = I->getSequenceInCore();
+
+     numSeqs[i] = 0;
+
+     while (B) {
+       fprintf(O, ">%s\n%s\n", B->header(), B->sequence());
+       numSeqs[i]++;
+
+       delete B;
+       B = I->getSequenceInCore();
+     }
+
+     delete I;
+   }
+   fclose(O);
+
+
+   // Make numSeqs[] be the offset needed to convert a polish in each inMatch[] file into a polish
+   // in the merged file.
+   //
+   uint32 o = 0;
+   uint32 s = 0;
+   for (uint32 i=0; i<numIn; i++) {
+     o = numSeqs[i];
+     numSeqs[i] = s;
+     s += o;
+   }
+
+   // Load the initial polishes
+   //
+   for (uint32 i=0; i<numIn; i++)
+     loadNext(i, polishes, inMatch, numSeqs);
+
+   // Merge, until no more input is left. Each round we scan the list of loaded polishes[] and
+   // remember the lowest, which is then output and a new polish is loaded in its place.
+   //
+   for (;;) {
+
+     // Find the first input that still has a polish; test the bound
+     // before touching the slot.
+     uint32 first = 0;
+     while ((first < numIn) && (polishes[first] == 0L))
+       first++;
+
+     if (first == numIn)
+       break;
+
+     for (uint32 i=first+1; i<numIn; i++)
+       if ((polishes[i]) &&
+           (s4p_genIDcompare(polishes + first, polishes + i) > 0))
+         first = i;
+
+     otMatch->writeAlignment(polishes[first]);
+
+     loadNext(first, polishes, inMatch, numSeqs);
+   }
+
+   for (uint32 i=0; i<numIn; i++)
+     delete inMatch[i];
+
+   delete [] inMatch;
+   delete [] polishes;
+   delete [] numSeqs;
+   delete [] inMatchName;
+   delete [] inSeqName;
+
+   delete otMatch;
+
+   return(0);
+ }
+
+
+
diff --git a/sim4dbutils/parseSNP.C b/sim4dbutils/parseSNP.C
new file mode 100644
index 0000000..f1c61b6
--- /dev/null
+++ b/sim4dbutils/parseSNP.C
@@ -0,0 +1,592 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+// Writes things with mappings that don't contain the snp itself to a
+// failure file. Otherwise, if the mapping is above the threshold, a
+// line describing the snp is output.
+
+ // Optional debugging outputs, created by the -D option; null when unused.
+ // Each receives the matches of SNPs falling into one category.
+ sim4polishWriter *multiMultiFile = 0L; // multiple hits, at least one is multiple exon
+ sim4polishWriter *multiSingleFile = 0L; // multiple hits, all are single exon
+ sim4polishWriter *singleMultiFile = 0L; // single hit, and it has more than one exon
+ sim4polishWriter *singleSingleFile = 0L; // single hit, single exon
+
+ // Counts, per category, of SNPs that were located in a match.  The first
+ // letter is s/m for single/multiple hits, the second for single/multiple exons.
+ int smpass = 0;
+ int sspass = 0;
+ int mmpass = 0;
+ int mspass = 0;
+
+ // Counts, per category, of SNPs that were not located in any match.
+ int smfail = 0;
+ int ssfail = 0;
+ int mmfail = 0;
+ int msfail = 0;
+
+ // NOTE(review): not referenced by the code visible in this file section.
+ int failedsnps = 0;
+ int failedmatches = 0;
+
+ // Output for located SNPs (-O) and for matches not containing the SNP (-F).
+ FILE *validSNPMapFile = 0L;
+ sim4polishWriter *failedSNPMapFile = 0L;
+
+ // Character ending the id field of a SNP defline (-d); 0 means whitespace.
+ char fieldDelimiter = 0;
+ // Defline tags (-s, -p).  The usage text says sizeTag is not implemented.
+ const char *sizeTag = "/size=";
+ const char *posTag = "/pos=";
+ // Added to the position parsed from the defline (-o).
+ int positionOffset = 0;
+
+ // Format of the lines written to validSNPMapFile: 1 or 2 (-format).
+ int outputFormat = 1;
+
+ // Returns a newly allocated copy of the id field of a SNP defline.
+ //
+ // The first character of the defline is skipped.  The id then runs up to,
+ // but not including, the first fieldDelimiter character or, when
+ // fieldDelimiter is 0, the first whitespace character.  The caller
+ // releases the result with delete [].
+ //
+ // (A disabled variant, for a set of SNPs with non-standard deflines,
+ // returned the field between the first '|' and the next '_'.)
+ //
+ static
+ char *
+ findSNPid(char *defline) {
+   int end = 1;
+
+   // Find the end of the id field; the scan starts after the first character.
+   while (defline[end] != 0) {
+     if (fieldDelimiter == 0) {
+       if (isspace(defline[end]))
+         break;
+     } else {
+       if (defline[end] == fieldDelimiter)
+         break;
+     }
+     end++;
+   }
+
+   char *id = new char [end + 1];
+
+   // Copy defline[1] .. defline[end-1].
+   for (int k=1; k<end; k++)
+     id[k-1] = defline[k];
+
+   id[end-1] = 0;
+
+   return(id);
+ }
+
+ // Returns a newly allocated copy of the first word of a genomic defline:
+ // the characters after the first one, up to but not including the first
+ // whitespace.  The caller releases the result with delete [].
+ //
+ static
+ char *
+ findGENid(char *defline) {
+   int stop = 1;
+
+   while ((defline[stop] != 0) && (!isspace(defline[stop])))
+     stop++;
+
+   char *name = new char [stop + 1];
+   char *out = name;
+
+   // Copy defline[1] .. defline[stop-1], then terminate.
+   for (char *in = defline + 1; in < defline + stop; in++)
+     *out++ = *in;
+
+   *out = 0;
+
+   return(name);
+ }
+
+
+
+ // Returns the SNP position recorded in a defline, plus positionOffset.
+ //
+ // The position is the number starting at the first digit found at or after
+ // the first occurrence of posTag or, if posTag does not occur, of the
+ // standard tags 'allelePos=' and '/pos=' (tried in that order).  Prints a
+ // message and exits with status 1 if no tag or no digit is found.
+ //
+ static
+ int
+ findPosition(char *defline) {
+   const char *tags[3] = { posTag, "allelePos=", "/pos=" };
+   char *hit = 0L;
+
+   // The user's tag first, then the standard ones.
+   for (int t=0; (t < 3) && (hit == 0L); t++)
+     hit = strstr(defline, tags[t]);
+
+   if (hit == 0L) {
+     fprintf(stderr, "posTag '%s' (also looked for 'allelePos=' and '/pos=') not found in defline '%s'!\n", posTag, defline);
+     exit(1);
+   }
+
+   // Skip ahead to the first digit.
+   for (; *hit != 0; hit++)
+     if (isdigit(*hit))
+       return(atoi(hit) + positionOffset);
+
+   fprintf(stderr, "Found posTag '%s' in defline '%s', but didn't find any numbers!\n", posTag, defline);
+   exit(1);
+
+   return(0); // Not reached.
+ }
+
+
+
+
+ // Locates the SNP described by the cDNA defline of polish 'p' within the
+ // match and, if 'F' is not null, writes one line describing it to 'F' in
+ // the format selected by outputFormat.
+ //
+ // Returns 1 if the SNP position falls inside an exon of the match (even
+ // when F is null and nothing is printed), 0 otherwise.
+ //
+ // When a line is printed, the alignment scores of 'p' are recomputed with
+ // the SNP base treated as a match.
+ //
+ static
+ int
+ printSNP(FILE *F, sim4polish *p) {
+ uint32 pos = findPosition(p->_estDefLine);
+ uint32 exonWithSNP = ~uint32ZERO;
+ uint32 i = 0;
+ uint32 seqOffset = 0;
+
+ // If the match is complement, then the alignment is printed using
+ // the reverse complemented SNP sequence, and so we need to find
+ // the offset at the end of the sequence (not always the same as
+ // the offset at the start of the sequence).
+ //
+ // XXX: Previous version had this as "p->_estLen - pos + siz", which
+ // seems wrong. This version does what appears to be reverse
+ // complement - size. I don't understand if this is a "size" or
+ // just a "1" thing.
+ //
+ seqOffset = pos;
+ if (p->_matchOrientation == SIM4_MATCH_COMPLEMENT)
+ seqOffset = p->_estLen - pos - 1;
+
+ // Find the exon with the SNP.  The loop does not stop at the first
+ // hit, so the last exon containing the offset is the one used.
+ //
+ for (i=0; i<p->_numExons; i++)
+ if (((p->_exons[i]._estFrom-1) <= seqOffset) && (seqOffset <= (p->_exons[i]._estTo-1)))
+ exonWithSNP = i;
+
+ // No exon contains the SNP.
+ if (exonWithSNP == ~uint32ZERO)
+ return(0);
+
+ // If we are printing to a file, continue to find the location, otherwise,
+ // just return.
+ //
+ if (F) {
+ char *SNPid = findSNPid(p->_estDefLine);
+ char *GENid = findGENid(p->_genDefLine);
+
+ char SNPbase = 0;
+ char GENbase = 0;
+
+ // Now, we examine the alignment strings to decide exactly
+ // where the SNP is located in the genomic.
+ //
+ // bpToExamine - the number of bases we need to skip in the
+ // alignment (counted in the snp), +1 because we are currently at
+ // the bp before the alignment (so we need to skip one more space).
+ //
+ // XXX: these used to be int!
+ //
+ uint32 bpToExamine = seqOffset - (p->_exons[exonWithSNP]._estFrom - 1) + 1;
+ uint32 examinePos = 0;
+ uint32 genPosition = p->_exons[exonWithSNP]._genFrom - 1;
+
+ // Recent runs of dbSNP showed that we are off by one (too many if forward, too few if complement). This is a hack to fix it.
+ //
+ // NOTE(review): for a forward match whose SNP is the first base of
+ // the exon this leaves bpToExamine at 0, the loop below does not
+ // run, and 'examinePos-1' wraps around -- confirm whether that
+ // input can occur.
+ //
+ if (p->_matchOrientation == SIM4_MATCH_COMPLEMENT)
+ bpToExamine++;
+ else
+ bpToExamine--;
+
+ // Walk the two alignment strings column by column.
+ //
+ while (bpToExamine > 0) {
+
+ // If the SNP alignment eats up a base pair, decrement
+ // the number of bp left to examine.
+ //
+ if (p->_exons[exonWithSNP]._estAlignment[examinePos] != '-')
+ bpToExamine--;
+
+ // If the genomic alignment is not a gap, increment the
+ // position.
+ //
+ if (p->_exons[exonWithSNP]._genAlignment[examinePos] != '-')
+ genPosition++;
+
+ examinePos++;
+ }
+
+ // Adjust the quality values, treating the SNP as a match always.
+ // The two bases are saved, overwritten with the same letter while
+ // the scores are recomputed, then restored.
+ //
+ SNPbase = p->_exons[exonWithSNP]._estAlignment[examinePos-1];
+ GENbase = p->_exons[exonWithSNP]._genAlignment[examinePos-1];
+
+ p->_exons[exonWithSNP]._estAlignment[examinePos-1] = 'A';
+ p->_exons[exonWithSNP]._genAlignment[examinePos-1] = 'A';
+
+ p->s4p_updateAlignmentScores();
+
+ p->_exons[exonWithSNP]._estAlignment[examinePos-1] = SNPbase;
+ p->_exons[exonWithSNP]._genAlignment[examinePos-1] = GENbase;
+
+
+ if (outputFormat == 1) {
+ fprintf(F, "%s %s "uint32FMT" %c/%c %s global["uint32FMT" "uint32FMT"] exon["uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT"]\n",
+ SNPid,
+ GENid,
+ genPosition,
+ SNPbase,
+ GENbase,
+ (p->_matchOrientation == SIM4_MATCH_FORWARD) ? "forward" : "complement",
+ p->_percentIdentity,
+ p->_querySeqIdentity,
+ p->_numExons,
+ exonWithSNP,
+ p->_exons[exonWithSNP]._percentIdentity,
+ (uint32)floor(100.0 * (double)p->_exons[exonWithSNP]._numMatches / (double)p->_estLen));
+ } else if (outputFormat == 2) {
+
+ // The format is all on one line, data fields separated by tab.
+ // No spaces -- "sa=C" instead of "sa = C"
+ //
+ // SNPid
+ // GENid
+ // genomic position of SNP
+ // sa=c -- snp allele
+ // ga=c -- genome allele
+ // mo={f|r} -- mapping orientation
+ // pi=n -- percent identity
+ // pc=n -- percent coverage
+ // nb=n -- number of alignment blocks
+ // bl=n -- alignment block with the snp
+ // bp=n -- position of the snp in the alignment block
+ // bi=n -- percent identity of the block
+ // bc=n -- percent coverage of the block
+ //
+ // The first three items are mandatory, are always in that
+ // order, and are always the first three. The others are
+ // optional, and can occur in any order. There might be more
+ // present than listed here.
+ //
+ // The order and content should be consistent for any given
+ // version of the software.
+ //
+ // NOTE(review): the literals "a" and "b" are printed here in place
+ // of SNPid and GENid -- confirm whether that is intended.
+ //
+ fprintf(F, "%s %s "uint32FMT" sa=%c ga=%c mo=%c pi="uint32FMT" pc="uint32FMT" nb="uint32FMT" bl="uint32FMT" bp="uint32FMT" bi="uint32FMT" bc="uint32FMT"\n",
+ "a", //SNPid,
+ "b", //GENid,
+ genPosition,
+ p->_exons[exonWithSNP]._estAlignment[examinePos-1], // sa
+ p->_exons[exonWithSNP]._genAlignment[examinePos-1], // ga
+ (p->_matchOrientation == SIM4_MATCH_FORWARD) ? 'f' : 'r', // mo
+ p->_percentIdentity, // pi
+ p->_querySeqIdentity, // pc
+ p->_numExons, // nb
+ exonWithSNP, // bl
+ examinePos, // bp
+ p->_exons[exonWithSNP]._percentIdentity, // bi
+ (uint32)floor(100.0 * (double)p->_exons[exonWithSNP]._numMatches / (double)p->_estLen)); // bc
+ } else {
+ }
+
+ delete [] SNPid;
+ delete [] GENid;
+ }
+
+ return(1);
+ }
+
+
+
+ // Processes all the matches, p[0..pNum-1], of one SNP, then deletes them.
+ //
+ // The SNP is classified by whether it has one match or several, and by
+ // whether any match has more than one exon.  The matches are first copied
+ // to the debugging file of that class, if it is open.  Then printSNP() is
+ // run on every match; a match it rejects is copied to failedSNPMapFile, if
+ // open.  Finally one of the global pass/fail counters is incremented.
+ //
+ static
+ void
+ parseSNP(sim4polish **p, int pNum) {
+   int numMulti = 0;
+   int pass = 0;
+   int fail = 0;
+   int k;
+
+   // Count the number of matches that have more than one exon
+   //
+   for (k=0; k<pNum; k++)
+     if (p[k]->_numExons > 1)
+       numMulti++;
+
+   const bool multiHit = (pNum != 1);
+   const bool multiExon = (numMulti != 0);
+
+   // Debugging copy.  This is written before printSNP() runs, because
+   // printSNP() may update the scores of a match.
+   //
+   sim4polishWriter *debugOut = 0L;
+
+   if (multiHit)
+     debugOut = (multiExon) ? multiMultiFile : multiSingleFile;
+   else
+     debugOut = (multiExon) ? singleMultiFile : singleSingleFile;
+
+   if (debugOut)
+     for (k=0; k<pNum; k++)
+       debugOut->writeAlignment(p[k]);
+
+   // Try to place the SNP in each match.
+   //
+   for (k=0; k<pNum; k++) {
+     if (printSNP(validSNPMapFile, p[k])) {
+       pass++;
+     } else {
+       fail++;
+       if (failedSNPMapFile)
+         failedSNPMapFile->writeAlignment(p[k]);
+     }
+   }
+
+   // Exactly one usable match counts as a single hit, even if there were
+   // several matches.  A failure is counted only when no match was usable.
+   //
+   if (pass == 1) {
+     if (multiExon)  smpass++;
+     else            sspass++;
+
+   } else if (pass > 1) {
+     if (multiExon)  mmpass++;
+     else            mspass++;
+
+   } else if (fail) {
+     if (multiHit) {
+       if (multiExon)  mmfail++;
+       else            msfail++;
+     } else {
+       if (multiExon)  smfail++;
+       else            ssfail++;
+     }
+   }
+
+   for (k=0; k<pNum; k++)
+     delete p[k];
+ }
+
+
+ // Reads polishes of SNP sequences from stdin, groups them by cDNA (SNP)
+ // index, and hands each group to parseSNP().  Polishes below the -i / -c
+ // thresholds are discarded.  The input must be sorted by cDNA index.
+ // A summary of the pass/fail counters is written to stdout at the end.
+ //
+ // Returns 0 on success; exits with status 1 on usage or input errors.
+ //
+ int
+ main(int argc, char **argv) {
+   int pNum = 0;
+   int pAlloc = 8388608;
+   uint32 estID = 0;
+
+   uint32 percentID = 0;
+   uint32 percentCO = 0;
+
+   validSNPMapFile = 0L;
+   failedSNPMapFile = 0L;
+
+   int arg = 1;
+   int err = 0;
+   while (arg < argc) {
+     if (strncmp(argv[arg], "-h", 2) == 0) {
+       err++;
+
+     } else if (arg + 1 >= argc) {
+       // Every other option takes a value, so the last word is either an
+       // unknown option or an option without its value.
+       fprintf(stderr, "unknown option, or option without a value: %s\n", argv[arg]);
+       err++;
+
+     } else if (strncmp(argv[arg], "-i", 2) == 0) {
+       percentID = atoi(argv[++arg]);
+
+     } else if (strncmp(argv[arg], "-c", 2) == 0) {
+       percentCO = atoi(argv[++arg]);
+
+     } else if (strncmp(argv[arg], "-F", 2) == 0) {
+       failedSNPMapFile = new sim4polishWriter(argv[++arg], sim4polishS4DB);
+
+     } else if (strncmp(argv[arg], "-O", 2) == 0) {
+       errno = 0;
+       validSNPMapFile = fopen(argv[++arg], "w");
+       if (validSNPMapFile == 0L) {
+         fprintf(stderr, "Couldn't open '%s' for writing: %s\n", argv[arg], strerror(errno));
+         exit(1);
+       }
+
+     } else if (strncmp(argv[arg], "-D", 2) == 0) {
+       char name[FILENAME_MAX];
+
+       arg++;
+
+       // snprintf() keeps an over-long prefix from overrunning 'name'.
+       snprintf(name, FILENAME_MAX, "%s-multi-multi", argv[arg]);
+       multiMultiFile = new sim4polishWriter(name, sim4polishS4DB);
+
+       snprintf(name, FILENAME_MAX, "%s-multi-single", argv[arg]);
+       multiSingleFile = new sim4polishWriter(name, sim4polishS4DB);
+
+       snprintf(name, FILENAME_MAX, "%s-single-multi", argv[arg]);
+       singleMultiFile = new sim4polishWriter(name, sim4polishS4DB);
+
+       snprintf(name, FILENAME_MAX, "%s-single-single", argv[arg]);
+       singleSingleFile = new sim4polishWriter(name, sim4polishS4DB);
+
+     } else if (strncmp(argv[arg], "-d", 2) == 0) {
+       fieldDelimiter = argv[++arg][0];
+
+     } else if (strncmp(argv[arg], "-p", 2) == 0) {
+       posTag = argv[++arg];
+
+     } else if (strncmp(argv[arg], "-s", 2) == 0) {
+       sizeTag = argv[++arg];
+
+     } else if (strncmp(argv[arg], "-o", 2) == 0) {
+       positionOffset = atoi(argv[++arg]);
+
+     } else if (strncmp(argv[arg], "-format", 2) == 0) {
+       outputFormat = atoi(argv[++arg]);
+
+     } else {
+       fprintf(stderr, "unknown option: %s\n", argv[arg]);
+       err++;
+     }
+
+     arg++;
+   }
+
+   if (err) {
+     fprintf(stderr, "usage: %s [options]\n", argv[0]);
+     fprintf(stderr, " -i min-identity filter matches on percent identity\n");
+     fprintf(stderr, " -c min-coverage filter matches on percent coverage\n");
+     fprintf(stderr, " -F failed save matches that do not contain the SNP\n");
+     fprintf(stderr, " to the file 'failed'\n");
+     fprintf(stderr, " -O output save the parsed SNPs to the file\n");
+     fprintf(stderr, " 'output'\n");
+     fprintf(stderr, " -D prefix report debugging stuff into files\n");
+     fprintf(stderr, " prefixed with 'prefix'\n");
+     fprintf(stderr, " -d delimiter Use the single character delimiter as\n");
+     fprintf(stderr, " the end of the defline ID field. The\n");
+     fprintf(stderr, " default is to split on any whitespace.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " -s sizeTag Use this tag as the size of the snp.\n");
+     fprintf(stderr, " '/size=' is tried by default.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " -p posTag Use this tag as the position of the snp.\n");
+     fprintf(stderr, " 'allelePos=' and '/pos=' are tried by\n");
+     fprintf(stderr, " default, and if posTag is not found.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " TAGS: The number immediately after the first\n");
+     fprintf(stderr, " occurance of the tag will be used.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " -o offset An additive offset to the SNP position.\n");
+     fprintf(stderr, " The default is 0.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " -format n 1 - use the original (default) format\n");
+     fprintf(stderr, " 2 - use an extended format, includes the\n");
+     fprintf(stderr, " position in the alignment string\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " -h Show this help.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, "\n" );
+     fprintf(stderr, " only -O is required. Input is read from stdin.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " NOTE! Sizes and sizeTag is NOT IMPLEMENTED!\n");
+     fprintf(stderr, " All SNPs are of size == 1\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " If you parse base-based SNPs, the result is returned base-based.\n");
+     fprintf(stderr, " You should use an ofset of 0.\n");
+     fprintf(stderr, "\n");
+     fprintf(stderr, " If you parse space-based SNPs, the result is returned base-based.\n");
+     fprintf(stderr, " You should use an offset of 1.\n");
+     fprintf(stderr, "\n");
+
+     // Do not go on to read stdin after a usage error or -h.
+     exit(1);
+   }
+
+   if ((outputFormat != 1) && (outputFormat != 2)) {
+     fprintf(stderr, "Invalid output format. Must be 1 or 2.\n");
+     exit(1);
+   }
+
+
+   // Read polishes, parsing when we see a change in the estID.
+   // Really, we could parse one by one, but it's nice to know if the
+   // thing mapped more than once.
+   //
+   // We could also extend this to discard matches that look
+   // suspicious -- or maybe pick the single best match for each.
+
+   sim4polishReader *R = new sim4polishReader("-");
+   sim4polish **p = new sim4polish * [pAlloc];
+   sim4polish *q = 0L;
+
+   while (R->nextAlignment(q)) {
+     if (q->_estID < estID) {
+       fprintf(stderr, "ERROR: Polishes not sorted by SNP idx! this=" uint32FMT ", looking for " uint32FMT "\n",
+               q->_estID, estID);
+       exit(1);
+     }
+
+     if ((q->_estID != estID) && (pNum > 0)) {
+       parseSNP(p, pNum);
+       pNum = 0;
+     }
+
+     // Grow the list.  The saved pointers are copied from the old array
+     // into the new one before the old array is released.
+     if (pNum >= pAlloc) {
+       sim4polish **P = new sim4polish * [pAlloc * 2];
+       memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+       delete [] p;
+       p = P;
+       pAlloc *= 2;
+     }
+
+     estID = q->_estID;
+
+     if ((q->_percentIdentity >= percentID) &&
+         (q->_querySeqIdentity >= percentCO)) {
+       p[pNum++] = q;
+     } else {
+       delete q;
+     }
+
+     q = 0L; // Otherwise we delete the one we just saved!
+   }
+
+   if (pNum > 0)
+     parseSNP(p, pNum);
+
+   fprintf(stdout, "SNPs with:\n");
+   fprintf(stdout, " single hit, single exon: %6d\n", sspass);
+   fprintf(stdout, " single hit, multiple exons: %6d\n", smpass);
+   fprintf(stdout, " multiple hits, single exon: %6d\n", mspass);
+   fprintf(stdout, " multiple hits, multiple exons: %6d\n", mmpass);
+   fprintf(stdout, "SNPs that failed:\n");
+   fprintf(stdout, " single hit, single exon: %6d\n", ssfail);
+   fprintf(stdout, " single hit, multiple exons: %6d\n", smfail);
+   fprintf(stdout, " multiple hits, single exon: %6d\n", msfail);
+   fprintf(stdout, " multiple hits, multiple exons: %6d\n", mmfail);
+
+   delete R;
+   delete [] p;
+
+   // -O is optional in practice; fclose() must not be given a null stream.
+   if (validSNPMapFile)
+     fclose(validSNPMapFile);
+   delete failedSNPMapFile;
+
+   delete multiMultiFile;
+   delete multiSingleFile;
+   delete singleMultiFile;
+   delete singleSingleFile;
+
+   return(0);
+ }
+
diff --git a/sim4dbutils/pickBestPair.C b/sim4dbutils/pickBestPair.C
new file mode 100644
index 0000000..cccb24c
--- /dev/null
+++ b/sim4dbutils/pickBestPair.C
@@ -0,0 +1,599 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+#include <vector>
+#include <map>
+#include <string>
+using namespace std;
+
+#define SEQNAME_MAX 64
+
+ // One alignment of a read ("seq") to a reference ("ref"), in the common
+ // form that every input reader in this file fills in.
+ //
+ // Names are stored in fixed-size buffers; the readers truncate longer names
+ // to SEQNAME_MAX-1 characters before copying them in.
+ //
+ class mapResult {
+ public:
+ // Index and name of the aligned read. main() treats records from the
+ // '1' and '2' lists that share a seqIdx as mates of one pair.
+ uint32 seqIdx;
+ char seqName[SEQNAME_MAX];
+
+ // Index and name of the reference, and the begin/end of the alignment on it.
+ // NOTE(review): coordinate base differs by reader -- readMRsim4db() subtracts
+ // one from the begin, the other readers store the file's value unchanged.
+ uint32 refIdx;
+ char refName[SEQNAME_MAX];
+ uint32 refBgn;
+ uint32 refEnd;
+
+ // True when the reader decided the read aligned in the forward orientation.
+ bool forward;
+ };
+
+ // Pairing information stored per read name in nameToIndex: the index of the
+ // clone (mate pair) the read belongs to, and a flag telling whether the read
+ // is the first mate.  Both values are packed into a single 32-bit word.
+ //
+ class readData {
+ public:
+   // Default record: placeholder clone index 999999999, not a first mate.
+   readData() : cloneIndex(999999999), isFirstMate(0) {
+   }
+
+   // Record for clone 'index'; 'first' is non-zero for the first mate.
+   readData(uint32 index, uint32 first) : cloneIndex(index), isFirstMate(first) {
+   }
+
+   uint32  cloneIndex  : 31;
+   uint32  isFirstMate : 1;
+ };
+
+map<string,readData> nameToIndex;
+uint32 nameToIndexIndex = 0;
+
+
+ // Reads the next alignment from an 'extent' format file into 'mr'.
+ //
+ // If the stream is at offset zero, the first line is treated as a header and
+ // skipped.  Each data line is split into words, and these columns are used:
+ //     0    sequence name            1    sequence index
+ //     4,5  orientation (forward when column 4 < column 5)
+ //     6    reference name           7    reference index
+ //     8,9  reference begin and end
+ //
+ // Names of SEQNAME_MAX or more characters are truncated to fit mapResult.
+ //
+ // Returns true if 'mr' was filled, false when no further line can be read.
+ //
+ bool
+ readMR(FILE *in, mapResult &mr) {
+   static char          line[1024];
+   static splitToWords  W;
+
+   // Skip header.
+   if (ftell(in) == 0)
+     fgets(line, 1024, in);
+
+   // Test the result of fgets() instead of calling feof() afterwards.
+   // feof() is also true after successfully reading a final line that has no
+   // trailing newline (that alignment used to be dropped), and it stays false
+   // on a read error (the stale buffer used to be processed forever).
+   if (fgets(line, 1024, in) == NULL)
+     return(false);
+
+   chomp(line);
+   W.split(line);
+
+   // Truncate over-long names in place so the strcpy() calls below cannot
+   // overflow the fixed-size buffers.
+   if (strlen(W[0]) >= SEQNAME_MAX)
+     W[0][SEQNAME_MAX-1] = 0;
+
+   if (strlen(W[6]) >= SEQNAME_MAX)
+     W[6][SEQNAME_MAX-1] = 0;
+
+   assert(strlen(W[0]) < SEQNAME_MAX);
+   assert(strlen(W[6]) < SEQNAME_MAX);
+
+   mr.seqIdx  = W(1);
+   mr.refIdx  = W(7);
+
+   mr.refBgn  = W(8);
+   mr.refEnd  = W(9);
+
+   mr.forward = (W(4) < W(5)) ? true : false;
+
+   strcpy(mr.seqName, W[0]);
+   strcpy(mr.refName, W[6]);
+
+   return(true);
+ }
+
+
+ // Converts a sim4db polish into a mapResult, storing it in 'mr' and
+ // returning a reference to 'mr' so the call can be used as an expression.
+ //
+ // The est (query) and genomic deflines of 'p' are shortened IN PLACE to at
+ // most SEQNAME_MAX-1 characters.  The reference interval is taken from the
+ // first exon only, with the begin coordinate decremented by one.
+ //
+ mapResult &
+ readMRsim4db(sim4polish *p, mapResult &mr) {
+   char  *qryName = p->_estDefLine;
+   char  *genName = p->_genDefLine;
+
+   // Clip the deflines so they fit the fixed-size name buffers.
+   if (strlen(qryName) >= SEQNAME_MAX)
+     qryName[SEQNAME_MAX-1] = 0;
+   if (strlen(genName) >= SEQNAME_MAX)
+     genName[SEQNAME_MAX-1] = 0;
+
+   assert(strlen(qryName) < SEQNAME_MAX);
+   assert(strlen(genName) < SEQNAME_MAX);
+
+   strcpy(mr.seqName, qryName);
+   strcpy(mr.refName, genName);
+
+   mr.seqIdx  = p->_estID;
+   mr.refIdx  = p->_genID;
+
+   mr.refBgn  = p->_exons[0]._genFrom - 1;
+   mr.refEnd  = p->_exons[0]._genTo;
+
+   mr.forward = (p->_matchOrientation == SIM4_MATCH_FORWARD);
+
+   return(mr);
+ }
+
+
+ // Reads the next alignment from a 'coords' format file into 'mr'.
+ //
+ // If the stream is at offset zero, four header lines are skipped.  Columns
+ // used from each data line:
+ //     0,1  reference begin and end
+ //     2,3  orientation (forward when column 2 < column 3)
+ //     9    reference name           10   read name
+ //
+ // Coords files carry no indices, so indices come from the global nameToIndex
+ // map.  An unknown reference name is assigned the next free index.  An
+ // unknown read name only produces a warning; the read then receives the
+ // default readData clone index.
+ //
+ // Returns true if 'mr' was filled, false when no further line can be read.
+ //
+ bool
+ readMRcoords(FILE *in, mapResult &mr) {
+   static char          line[1024];
+   static splitToWords  W;
+
+   // Skip header.
+   if (ftell(in) == 0) {
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+   }
+
+   // Test fgets() itself rather than feof(): feof() is also true after a
+   // final line without a trailing newline (which used to be dropped), and
+   // stays false on a read error (which used to loop on a stale buffer).
+   if (fgets(line, 1024, in) == NULL)
+     return(false);
+
+   chomp(line);
+   W.split(line);
+
+   // Since we don't have indexes in coords files, we must assign them based on
+   // object names.
+
+   // But we use "same index" to infer pairing. This won't work.
+
+   // The lookup keys are built from the full names, before truncation.
+   string  refNam(W[9]);
+   string  seqNam(W[10]);
+
+   if (nameToIndex.find(refNam) == nameToIndex.end()) {
+     nameToIndex[refNam] = readData(nameToIndexIndex++, false);
+   }
+
+   // Report the read name (column 10): that is the name that was looked up.
+   if (nameToIndex.find(seqNam) == nameToIndex.end()) {
+     fprintf(stderr, "1 failed to find mate index for read '%s'\n", W[10]);
+   }
+
+   uint32  seqIdx = nameToIndex[seqNam].cloneIndex;
+   uint32  refIdx = nameToIndex[refNam].cloneIndex;
+
+   // Truncate over-long names in place so strcpy() below cannot overflow.
+   if (strlen(W[9]) >= SEQNAME_MAX)
+     W[9][SEQNAME_MAX-1] = 0;
+
+   if (strlen(W[10]) >= SEQNAME_MAX)
+     W[10][SEQNAME_MAX-1] = 0;
+
+   assert(strlen(W[9])  < SEQNAME_MAX);
+   assert(strlen(W[10]) < SEQNAME_MAX);
+
+   mr.seqIdx  = seqIdx;
+   mr.refIdx  = refIdx;
+
+   mr.refBgn  = W(0);
+   mr.refEnd  = W(1);
+
+   mr.forward = (W(2) < W(3)) ? true : false;
+
+   strcpy(mr.seqName, W[10]);
+   strcpy(mr.refName, W[9]);
+
+   return(true);
+ }
+
+
+
+ // Reads the next alignment from a 'coords' format file into 'mr', and
+ // reports in 'is1' whether the read is the first mate of its pair.
+ //
+ // If the stream is at offset zero, four header lines are skipped.  Columns
+ // used from each data line:
+ //     0,1  reference begin and end
+ //     2,3  orientation (forward when column 2 < column 3)
+ //     9    reference name           10   read name
+ //
+ // Indices come from the global nameToIndex map.  An unknown reference name
+ // is assigned the next free index.  An unknown read name is fatal: the
+ // first twelve words of the line are dumped to stderr and the program exits.
+ //
+ // Returns true if 'mr' was filled, false when no further line can be read.
+ //
+ bool
+ readMRcoords(FILE *in, mapResult &mr, bool &is1) {
+   static char          line[1024];
+   static splitToWords  W;
+
+   // Skip header.
+   if (ftell(in) == 0) {
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+     fgets(line, 1024, in);
+   }
+
+   // Test fgets() itself rather than feof(): feof() is also true after a
+   // final line without a trailing newline (which used to be dropped), and
+   // stays false on a read error (which used to loop on a stale buffer).
+   if (fgets(line, 1024, in) == NULL)
+     return(false);
+
+   chomp(line);
+   W.split(line);
+
+   // Since we don't have indexes in coords files, we must assign them based on
+   // object names.
+
+   // But we use "same index" to infer pairing. This won't work.
+
+   // The lookup keys are built from the full names, before truncation.
+   string  refNam(W[9]);
+   string  seqNam(W[10]);
+
+   if (nameToIndex.find(refNam) == nameToIndex.end()) {
+     nameToIndex[refNam] = readData(nameToIndexIndex++, false);
+   }
+
+   if (nameToIndex.find(seqNam) == nameToIndex.end()) {
+     fprintf(stderr, "2 failed to find mate index for read '%s'\n", W[10]);
+     for (uint32 i=0; i<12; i++)
+       fprintf(stderr, "%2d -- '%s'\n", i, W[i]);
+     exit(1);
+   }
+
+   uint32  seqIdx = nameToIndex[seqNam].cloneIndex;
+   uint32  refIdx = nameToIndex[refNam].cloneIndex;
+
+   is1 = nameToIndex[seqNam].isFirstMate;
+
+   // Truncate over-long names in place so strcpy() below cannot overflow.
+   if (strlen(W[9]) >= SEQNAME_MAX)
+     W[9][SEQNAME_MAX-1] = 0;
+
+   if (strlen(W[10]) >= SEQNAME_MAX)
+     W[10][SEQNAME_MAX-1] = 0;
+
+   assert(strlen(W[9])  < SEQNAME_MAX);
+   assert(strlen(W[10]) < SEQNAME_MAX);
+
+   mr.seqIdx  = seqIdx;
+   mr.refIdx  = refIdx;
+
+   mr.refBgn  = W(0);
+   mr.refEnd  = W(1);
+
+   mr.forward = (W(2) < W(3)) ? true : false;
+
+   strcpy(mr.seqName, W[10]);
+   strcpy(mr.refName, W[9]);
+
+   return(true);
+ }
+
+
+
+
+
+
+ // Opens 'path' with the given fopen() mode.  On failure the reason is
+ // reported on stderr and the program exits, so callers never see NULL.
+ //
+ static
+ FILE *
+ pickBestPairOpen(const char *path, const char *mode) {
+   errno = 0;
+   FILE *F = fopen(path, mode);
+   if (F == NULL)
+     fprintf(stderr, "Failed to open '%s': %s\n", path, strerror(errno)), exit(1);
+   return(F);
+ }
+
+
+ // Pairs up alignments of mated reads.
+ //
+ // Alignments of the '1' reads and of the '2' reads are loaded from any mix
+ // of extent (-1extent/-2extent), sim4db (-1sim4db/-2sim4db) and coords
+ // (-1coords/-2coords) files; -coords files hold both mates and are split
+ // using the pairing loaded with -matemap.  Both lists are then walked in
+ // step by seqIdx (the walk assumes each list is ordered by seqIdx), and
+ // every combination of a '1' and a '2' alignment of the same seqIdx that
+ // hits the same reference is classified by orientation (N, I, O or A) and
+ // by distance.
+ //
+ // Output files, named from the -o prefix:
+ //     prefix.pairLog      one line per accepted pair
+ //     prefix.duplicates   pairs where both mates have more than one alignment
+ //     prefix.stats        counts per orientation
+ //
+ // With -insert, only pairs of the given orientation and with a distance in
+ // [min, max] are accepted; without it every pair is accepted.
+ //
+ int
+ main(int argc, char **argv) {
+   vector<char *>  in1extent, in1sim4db, in1coords, incoords;
+   vector<char *>  in2extent, in2sim4db, in2coords;
+   vector<char *>  mateMaps;
+   char           *out     = NULL;
+   char            orient  = 0;
+   uint32          distMin = 0;
+   uint32          distMax = uint32MAX;
+
+   double          minIdent    = 0;
+   double          minLength   = 0;
+   double          minCoverage = 0;
+
+   bool            allowDups = false;
+
+   // File-list options consume every following word that does not begin
+   // with '-'.
+   int arg = 1;
+   int err = 0;
+   while (arg < argc) {
+     if        (strcmp(argv[arg], "-1extent") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in1extent.push_back(argv[++arg]);
+     else if (strcmp(argv[arg], "-2extent") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in2extent.push_back(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-1sim4db") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in1sim4db.push_back(argv[++arg]);
+     else if (strcmp(argv[arg], "-2sim4db") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in2sim4db.push_back(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-1coords") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in1coords.push_back(argv[++arg]);
+     else if (strcmp(argv[arg], "-2coords") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         in2coords.push_back(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-coords") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         incoords.push_back(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-matemap") == 0)
+       while ((arg+1 < argc) && (argv[arg+1][0] != '-'))
+         mateMaps.push_back(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-insert") == 0) {
+       orient  = argv[++arg][0];
+       distMin = atoi(argv[++arg]);
+       distMax = atoi(argv[++arg]);
+     }
+
+     else if (strcmp(argv[arg], "-minident") == 0)
+       minIdent = atoi(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-minlength") == 0)
+       minLength = atoi(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-mincoverage") == 0)
+       minCoverage = atoi(argv[++arg]);
+
+     else if (strcmp(argv[arg], "-allowduplicates") == 0)
+       allowDups = true;
+
+     else if (strcmp(argv[arg], "-o") == 0)
+       out = argv[++arg];
+
+     else
+       err++;
+
+     arg++;
+   }
+   if (out == NULL) {
+     fprintf(stderr, "usage: %s -1 in1.extent -2 in2.extent -o prefix\n", argv[0]);
+     exit(1);
+   }
+
+   vector<mapResult>   mr1;
+   vector<mapResult>   mr2;
+
+   mapResult           mr;
+
+   // Load mate map if needed.  Each line names the two reads of one pair;
+   // both get the same clone index, and the first is flagged as first mate.
+
+   if (mateMaps.size() > 0) {
+     for (uint32 mm=0; mm<mateMaps.size(); mm++) {
+       uint32  numLoaded = 0;
+
+       fprintf(stderr, "Loading mate pairings from '%s'.\n", mateMaps[mm]);
+
+       errno = 0;
+       FILE *IN = fopen(mateMaps[mm], "r");
+       if (IN == NULL)
+         fprintf(stderr, "Failed to open mate map '%s': %s\n", mateMaps[mm], strerror(errno)), exit(1);
+
+       char  LL[10240];
+
+       // Loop on the result of fgets() so that a final line without a
+       // trailing newline is still loaded and a read error ends the loop.
+       while (fgets(LL, 10240, IN) != NULL) {
+         chomp(LL);
+         splitToWords  W(LL);
+
+         nameToIndex[string(W[0])] = readData(nameToIndexIndex,   true);
+         nameToIndex[string(W[1])] = readData(nameToIndexIndex++, false);
+
+         numLoaded++;
+       }
+
+       fclose(IN);
+
+       fprintf(stderr, "Loaded %u mate pairings from '%s', total %u.\n", numLoaded, mateMaps[mm], nameToIndexIndex);
+     }
+   }
+
+   // Load alignments
+
+   for (uint32 ii=0; ii<in1extent.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in1extent[ii]);
+     FILE *IN = pickBestPairOpen(in1extent[ii], "r");
+     while (readMR(IN, mr) == true)
+       mr1.push_back(mr);
+     fclose(IN);
+   }
+
+   for (uint32 ii=0; ii<in1sim4db.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in1sim4db[ii]);
+     sim4polishReader *IN = new sim4polishReader(in1sim4db[ii]);
+     sim4polish       *p  = NULL;
+     while (IN->nextAlignment(p)) {
+       mr1.push_back(readMRsim4db(p, mr));
+     }
+     delete IN;
+   }
+
+   for (uint32 ii=0; ii<in1coords.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in1coords[ii]);
+     FILE *IN = pickBestPairOpen(in1coords[ii], "r");
+     while (readMRcoords(IN, mr) == true)
+       mr1.push_back(mr);
+     fclose(IN);
+   }
+
+
+
+   for (uint32 ii=0; ii<in2extent.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in2extent[ii]);
+     FILE *IN = pickBestPairOpen(in2extent[ii], "r");
+     while (readMR(IN, mr) == true)
+       mr2.push_back(mr);
+     fclose(IN);
+   }
+
+   for (uint32 ii=0; ii<in2sim4db.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in2sim4db[ii]);
+     sim4polishReader *IN = new sim4polishReader(in2sim4db[ii]);
+     sim4polish       *p  = NULL;
+     while (IN->nextAlignment(p)) {
+       mr2.push_back(readMRsim4db(p, mr));
+     }
+     delete IN;
+   }
+
+   // These are alignments of the '2' reads, so they belong in mr2.  (They
+   // used to be appended to mr1, leaving nothing to pair them against.)
+   for (uint32 ii=0; ii<in2coords.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", in2coords[ii]);
+     FILE *IN = pickBestPairOpen(in2coords[ii], "r");
+     while (readMRcoords(IN, mr) == true)
+       mr2.push_back(mr);
+     fclose(IN);
+   }
+
+
+
+   for (uint32 ii=0; ii<incoords.size(); ii++) {
+     fprintf(stderr, "Loading alignments from '%s'\n", incoords[ii]);
+     FILE *IN = pickBestPairOpen(incoords[ii], "r");
+     bool  is1;
+
+     while (readMRcoords(IN, mr, is1) == true) {
+       if (is1)
+         mr1.push_back(mr);
+       else
+         mr2.push_back(mr);
+     }
+
+     fclose(IN);
+   }
+
+   fprintf(stderr, "Loaded %lu '1' alignments.\n", mr1.size());
+   fprintf(stderr, "Loaded %lu '2' alignments.\n", mr2.size());
+
+   char   name[10240];
+
+   sprintf(name, "%s.pairLog", out);
+   FILE *LOG = pickBestPairOpen(name, "w");
+
+   sprintf(name, "%s.duplicates", out);
+   FILE *DUP = pickBestPairOpen(name, "w");
+
+   sprintf(name, "%s.stats", out);
+   FILE *STA = pickBestPairOpen(name, "w");
+
+   uint32  mr1bgn = 0;
+   uint32  mr1end = 0;
+   uint32  mr1END = mr1.size();
+
+   uint32  mr2bgn = 0;
+   uint32  mr2end = 0;
+   uint32  mr2END = mr2.size();
+
+   map<char,uint32>  totalPairs;
+   map<char,uint32>  sizedPairs;
+
+   while ((mr1bgn < mr1END) && (mr2bgn < mr2END)) {
+
+     // Step whichever list is behind.  Each index is re-validated before the
+     // element it names is read, because the step just taken may have moved
+     // it to the end of its list.
+
+     if (mr1[mr1bgn].seqIdx < mr2[mr2bgn].seqIdx)
+       mr1bgn++;
+
+     if ((mr1bgn < mr1END) && (mr2[mr2bgn].seqIdx < mr1[mr1bgn].seqIdx))
+       mr2bgn++;
+
+     if ((mr1bgn >= mr1END) ||
+         (mr2bgn >= mr2END) ||
+         (mr1[mr1bgn].seqIdx != mr2[mr2bgn].seqIdx))
+       //  SequenceA 1 3 5 7 8
+       //  SequenceB 2 4 6 8
+       //  1st pass, A increases to 3, B increases to 4
+       //  2nd pass, A increases to 5, B increases to 6
+       //  3rd pass, A increases to 7, B increases to 8
+       //  4th pass, A increases to 8, B doesn't change.
+       continue;
+
+     assert(mr1[mr1bgn].seqIdx == mr2[mr2bgn].seqIdx);
+
+     // Extend each group over all consecutive alignments with this seqIdx,
+     // stopping at the end of the list.
+
+     mr1end = mr1bgn + 1;
+     mr2end = mr2bgn + 1;
+
+     while ((mr1end < mr1END) && (mr1[mr1bgn].seqIdx == mr1[mr1end].seqIdx))
+       mr1end++;
+
+     while ((mr2end < mr2END) && (mr2[mr2bgn].seqIdx == mr2[mr2end].seqIdx))
+       mr2end++;
+
+     // Group of reads from mr1bgn-mr1end and mr2bgn-mr2end need to be compared.
+
+     if ((mr1end - mr1bgn > 1) &&
+         (mr2end - mr2bgn > 1)) {
+       fprintf(DUP, "%s\t%u\t%s\t%u\n",
+               mr1[mr1bgn].seqName, mr1end - mr1bgn,
+               mr2[mr2bgn].seqName, mr2end - mr2bgn);
+
+       // Emptying both ranges makes the pairing loops below do nothing.
+       if (allowDups == false) {
+         mr1bgn = mr1end;
+         mr2bgn = mr2end;
+       }
+     }
+
+     // Now find all possible pairs.
+
+     for (uint32 i1=mr1bgn; i1<mr1end; i1++) {
+       for (uint32 i2=mr2bgn; i2<mr2end; i2++) {
+         if (mr1[i1].refIdx != mr2[i2].refIdx)
+           continue;
+
+         //validPairs++;
+
+         // df is the span from the start of read 1 to the end of read 2,
+         // dr the span from the start of read 2 to the end of read 1.  The
+         // larger one decides which read is reported first.
+
+         uint32  df  = 0;
+         uint32  dr  = 0;
+         char    ori = 'X';
+
+         if (mr1[i1].refBgn < mr2[i2].refEnd)
+           df = mr2[i2].refEnd - mr1[i1].refBgn;
+
+         if (mr2[i2].refBgn < mr1[i1].refEnd)
+           dr = mr1[i1].refEnd - mr2[i2].refBgn;
+
+         assert(df + dr > 0);
+
+         if (df > dr) {
+           if ((mr1[i1].forward == true)  && (mr2[i2].forward == true))
+             ori = 'N';
+           if ((mr1[i1].forward == true)  && (mr2[i2].forward == false))
+             ori = 'I';
+           if ((mr1[i1].forward == false) && (mr2[i2].forward == true))
+             ori = 'O';
+           if ((mr1[i1].forward == false) && (mr2[i2].forward == false))
+             ori = 'A';
+
+           totalPairs[ori]++;
+
+           if ((orient == 0) ||
+               ((ori == orient) && (distMin <= df) && (df <= distMax))) {
+             sizedPairs[ori]++;
+             fprintf(LOG, "%c "uint32FMT" "uint32FMT" %s ("uint32FMT","uint32FMT") "uint32FMT" %s ("uint32FMT","uint32FMT") "uint32FMT" %s\n",
+                     ori,
+                     df,
+                     mr1[i1].seqIdx, mr1[i1].seqName, mr1[i1].refBgn, mr1[i1].refEnd,
+                     mr2[i2].seqIdx, mr2[i2].seqName, mr2[i2].refBgn, mr2[i2].refEnd,
+                     mr1[i1].refIdx, mr1[i1].refName);
+           }
+
+         } else {
+           if ((mr2[i2].forward == true)  && (mr1[i1].forward == true))
+             ori = 'N';
+           if ((mr2[i2].forward == true)  && (mr1[i1].forward == false))
+             ori = 'I';
+           if ((mr2[i2].forward == false) && (mr1[i1].forward == true))
+             ori = 'O';
+           if ((mr2[i2].forward == false) && (mr1[i1].forward == false))
+             ori = 'A';
+
+           totalPairs[ori]++;
+
+           if ((orient == 0) ||
+               ((ori == orient) && (distMin <= dr) && (dr <= distMax))) {
+             sizedPairs[ori]++;
+             fprintf(LOG, "%c "uint32FMT" "uint32FMT" %s ("uint32FMT","uint32FMT") "uint32FMT" %s ("uint32FMT","uint32FMT") "uint32FMT" %s\n",
+                     ori,
+                     dr,
+                     mr2[i2].seqIdx, mr2[i2].seqName, mr2[i2].refBgn, mr2[i2].refEnd,
+                     mr1[i1].seqIdx, mr1[i1].seqName, mr1[i1].refBgn, mr1[i1].refEnd,
+                     mr2[i2].refIdx, mr2[i2].refName);
+           }
+         }
+       }
+     }
+
+     mr1bgn = mr1end;
+     mr2bgn = mr2end;
+   }
+
+   fprintf(STA, "alignments: "uint32FMT" "uint32FMT"\n", mr1END, mr2END);
+   fprintf(STA, "totalPairs[%c]: %u\n", 'N', totalPairs['N']);
+   fprintf(STA, "totalPairs[%c]: %u\n", 'I', totalPairs['I']);
+   fprintf(STA, "totalPairs[%c]: %u\n", 'O', totalPairs['O']);
+   fprintf(STA, "totalPairs[%c]: %u\n", 'A', totalPairs['A']);
+   fprintf(STA, "sizedPairs[%c]: %u\n", 'N', sizedPairs['N']);
+   fprintf(STA, "sizedPairs[%c]: %u\n", 'I', sizedPairs['I']);
+   fprintf(STA, "sizedPairs[%c]: %u\n", 'O', sizedPairs['O']);
+   fprintf(STA, "sizedPairs[%c]: %u\n", 'A', sizedPairs['A']);
+
+   fclose(LOG);
+   fclose(DUP);
+   fclose(STA);
+
+   exit(0);
+ }
diff --git a/sim4dbutils/pickBestPolish.C b/sim4dbutils/pickBestPolish.C
new file mode 100644
index 0000000..431aa54
--- /dev/null
+++ b/sim4dbutils/pickBestPolish.C
@@ -0,0 +1,444 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+// Picks the best polish (or set of polishes that are all of the same
+// best quality) for each cDNA.
+//
+// Validate mode will print out ALL input matches, in the following
+// format
+//
+ // estid gaid percentid nummatches (estFr/genFr estTo/genTo %) () ()
+//
+// With a * somewhere to denote the best ones. Separate ESTs with
+// a dashed line.
+
+
+#define EPS_X 1
+#define EPS_N_ESTS 10
+#define EPS_N_MRNA 15
+#define EPS_I 3
+
+
+uint32 EPS_N = EPS_N_ESTS;
+uint32 doValidate = 0;
+
+sim4polishWriter *W = 0L;
+
+ // Writes one line describing polish 'p' to 'O', for validate mode.
+ //
+ // The line holds the est ID, genomic ID, percent identity and number of
+ // matches, then one parenthesised group per exon giving estFrom/genFrom,
+ // estTo/genTo and the exon's percent identity.  When 'isBest' is non-zero
+ // the line is closed with " *".
+ //
+ static
+ void
+ printPolishValidate(FILE *O, sim4polish *p, uint32 isBest) {
+   uint32  nExons = p->_numExons;
+
+   fprintf(O, uint32FMTW(8)" "uint32FMTW(8)" "uint32FMTW(4)" "uint32FMTW(4),
+           p->_estID, p->_genID, p->_percentIdentity, p->_numMatches);
+
+   for (uint32 x=0; x != nExons; x++) {
+     fprintf(O, " ("uint32FMTW(6)"/"uint32FMTW(6)" "uint32FMTW(6)"/"uint32FMTW(6)" "uint32FMTW(3)")",
+             p->_exons[x]._estFrom, p->_exons[x]._genFrom,
+             p->_exons[x]._estTo,   p->_exons[x]._genTo,
+             p->_exons[x]._percentIdentity);
+   }
+
+   // Mark the chosen polish, then end the line.
+   fputs(isBest ? " *\n" : "\n", O);
+ }
+
+
+ // Chooses the best polish(es) among the pNum polishes in p[], all of which
+ // belong to one EST, and emits them.
+ //
+ // In normal mode the chosen polishes are written with the global writer W.
+ // In validate mode (doValidate set) every polish is printed to stdout, the
+ // chosen ones marked, under a dashed header carrying the number of the rule
+ // that decided.  (A lone polish is written in normal mode and prints nothing
+ // in validate mode.)
+ //
+ // Rules, in order:
+ //   1,2  The polish with the best percent identity is also the one with the
+ //        most matches: pick it, unless a polish of the same identity with at
+ //        most EPS_N fewer matches has more than EPS_X additional exons.
+ //   3,4  Otherwise take the two best distinct identities, 'i' (best) and 'm'
+ //        (second).  If 'i' has at least as many matches as 'm', pick as in
+ //        rules 1,2.
+ //   5    Otherwise compute alpha from the two candidates; below 0.8, pick 'i'.
+ //   6,7  Otherwise pick 'i' if it has more than EPS_X more exons than 'm', or
+ //        more than EPS_I higher identity.
+ //   8,9  Otherwise pick 'm', with the same exon margin as rules 1,2.
+ //
+ // The margin tests are written as additions (x + EPS >= y) rather than
+ // subtractions (x >= y - EPS): the operands are unsigned, and the
+ // subtraction wraps to a huge value when y is smaller than the margin.
+ //
+ static
+ void
+ pickBestSlave(sim4polish **p, uint32 pNum) {
+   uint32  identitym = 0, nmatchesm = 0;  //  Best score for the mList
+   uint32  identityi = 0, nmatchesi = 0;  //  Best score for the iList
+   uint32  numExons = 0, numExonsi = 0, numExonsm = 0;
+   uint32  tmp_nmatches = 0;
+   double  alpha;
+
+   //  Difficult choice here....
+   //
+   if (pNum == 1) {
+     if (doValidate == 0)
+       W->writeAlignment(p[0]);
+     return;
+   }
+
+   if ((p[0]->_estID % 1287) == 0) {
+     fprintf(stderr, "Picking Best for estID="uint32FMT" with %5d choices.\r", p[0]->_estID, pNum);
+     fflush(stderr);
+   }
+
+   //  Find the best percentIdentity and best numberOfMatches.
+   //
+   //  identityi is the best percent identity of all the matches for this EST, and
+   //  nmatchesi is the number of matches for the longest best identity match(es).
+   //
+   //  nmatchesm is the best numMatches of all the matches for this EST, and
+   //  identitym is the highest percent identity for the best numMatches match(es).
+
+   for (uint32 i=0; i<pNum; i++) {
+
+     if ((p[i]->_percentIdentity > identityi) ||
+         (p[i]->_percentIdentity == identityi && p[i]->_numMatches > nmatchesi)) {
+       identityi = p[i]->_percentIdentity;
+       nmatchesi = p[i]->_numMatches;
+     }
+
+     if ((p[i]->_numMatches > nmatchesm) ||
+         (p[i]->_numMatches == nmatchesm && p[i]->_percentIdentity > identitym)) {
+       nmatchesm = p[i]->_numMatches;
+       identitym = p[i]->_percentIdentity;
+     }
+
+   }
+
+
+   //  Otherwise, if the best scores on both lists are the same, pick
+   //  the matches with the largest number of exons
+   //
+   if ((identityi == identitym) &&
+       (nmatchesi == nmatchesm)) {
+
+     //  Find the largest number of exons, allowing some margin in numMatches
+     //
+     numExonsi = 0;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identityi) &&
+           (p[i]->_numMatches >= nmatchesi) &&
+           (numExonsi < p[i]->_numExons))
+         numExonsi = p[i]->_numExons;
+
+     numExons     = numExonsi;
+     tmp_nmatches = nmatchesi;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identityi) &&
+           (p[i]->_numMatches + EPS_N >= nmatchesi) &&
+           (numExons + EPS_X < p[i]->_numExons)) {
+         tmp_nmatches = p[i]->_numMatches;
+         numExons     = p[i]->_numExons;
+       }
+
+     //  Scan the entire list, printing the best stuff.  We cannot just
+     //  scan both the mList and iList, as those probably contain
+     //  duplicates.
+
+     if (doValidate) {
+       if (tmp_nmatches == nmatchesi)
+         fprintf(stdout, "--------------------1 (Clear Winner)\n");
+       else
+         fprintf(stdout, "--------------------2 (Exon Clear Winner)\n");
+       for (uint32 i=0; i<pNum; i++)
+         printPolishValidate(stdout, p[i], ((p[i]->_percentIdentity == identityi) &&
+                                            (p[i]->_numMatches == tmp_nmatches) &&
+                                            (p[i]->_numExons == numExons)));
+     } else {
+       for (uint32 i=0; i<pNum; i++)
+         if ((p[i]->_percentIdentity == identityi) &&
+             (p[i]->_numMatches == tmp_nmatches) &&
+             (p[i]->_numExons == numExons))
+           W->writeAlignment(p[i]);
+     }
+
+     return;
+   }
+
+   //  Start over.  Find the best two percentIdentities.  Break ties
+   //  with numMatches.
+   //
+   //    i will be the best,
+   //    m will be the second best
+   //
+   identityi = identitym = 0;
+   nmatchesi = nmatchesm = 0;
+
+   for (uint32 i=0; i<pNum; i++) {
+
+     //  Pick the two matches with the highest (different) percent
+     //  identities; for each, pick the highest number of matches.
+     //
+     //  First block:  Have we found a new best percent identity?
+     //  If so, save it, and shift former best to second best.
+     //
+     //  Second and third blocks:  make sure that we save the
+     //  best numMatches for each.
+     //
+     if (p[i]->_percentIdentity > identityi) {
+       identitym = identityi;
+       nmatchesm = nmatchesi;
+
+       identityi = p[i]->_percentIdentity;
+       nmatchesi = p[i]->_numMatches;
+     } else if ((p[i]->_percentIdentity == identityi) &&
+                (p[i]->_numMatches > nmatchesi)) {
+       nmatchesi = p[i]->_numMatches;
+     } else if ((p[i]->_percentIdentity < identityi) &&
+                ((p[i]->_percentIdentity > identitym) ||
+                 ((p[i]->_percentIdentity == identitym) &&
+                  (p[i]->_numMatches > nmatchesm)))) {
+       nmatchesm = p[i]->_numMatches;
+       identitym = p[i]->_percentIdentity;
+     }
+   }
+
+   //  Now, 'i' is the highest percent identity, 'm' is the second
+   //  highest.  By definition, numMatches for 'i' is less than
+   //  numMatches for 'm'.
+
+   //  If the number of matches is different, output everything with the
+   //  top score.
+   //
+   //  We are guaranteed that the identities are the same.  (I think)
+
+   if (nmatchesi >= nmatchesm) {
+
+     //  Find the match(es) with the largest number of exons
+
+     numExonsi = 0;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identityi) &&
+           (p[i]->_numMatches >= nmatchesi) &&
+           (numExonsi < p[i]->_numExons))
+         numExonsi = p[i]->_numExons;
+
+
+     numExons     = numExonsi;
+     tmp_nmatches = nmatchesi;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identityi) &&
+           (p[i]->_numMatches + EPS_N >= nmatchesi) &&
+           (numExons + EPS_X < p[i]->_numExons)) {
+         numExons     = p[i]->_numExons;
+         tmp_nmatches = p[i]->_numMatches;
+       }
+
+     if (doValidate) {
+       if (tmp_nmatches == nmatchesi)
+         fprintf(stdout, "--------------------3 (?)\n");
+       else
+         fprintf(stdout, "--------------------4 (Exon ?)\n");
+       for (uint32 i=0; i<pNum; i++)
+         printPolishValidate(stdout, p[i], ((p[i]->_percentIdentity == identityi) &&
+                                            (p[i]->_numMatches == tmp_nmatches) &&
+                                            (p[i]->_numExons == numExons)));
+     } else {
+       for (uint32 i=0; i<pNum; i++)
+         if ((p[i]->_percentIdentity == identityi) &&
+             (p[i]->_numMatches == tmp_nmatches) &&
+             (p[i]->_numExons == numExons))
+           W->writeAlignment(p[i]);
+     }
+
+     return;
+   }
+
+   //  Otherwise, compute alpha
+
+   alpha = ((nmatchesm - nmatchesi) /
+            ((nmatchesm / (double)identitym) -
+             (nmatchesi / (double)identityi)))/100;
+
+   //  If alpha below a magic threshold, pick the shorter match.
+   //
+   if (alpha < 0.8) {
+
+     //  Find the match(es) with the largest number of exons
+
+     numExons = tmp_nmatches = 0;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identityi) &&
+           (p[i]->_numMatches >= nmatchesi) &&
+           (numExons < p[i]->_numExons))
+         numExons = p[i]->_numExons;
+
+     if (doValidate) {
+       fprintf(stdout, "--------------------5 (alpha < 0.8)\n");
+       for (uint32 i=0; i<pNum; i++)
+         printPolishValidate(stdout, p[i], ((p[i]->_percentIdentity == identityi) &&
+                                            (p[i]->_numMatches == nmatchesi) &&
+                                            (p[i]->_numExons == numExons)));
+     } else {
+       for (uint32 i=0; i<pNum; i++)
+         if ((p[i]->_percentIdentity == identityi) &&
+             (p[i]->_numMatches == nmatchesi) &&
+             (p[i]->_numExons == numExons))
+           W->writeAlignment(p[i]);
+     }
+
+     return;
+   }
+
+   //  Otherwise, pick the longer one.
+
+   //  XXX:  We can still check:
+   //    if an internal gap is in N's
+   //    the number of exons
+   //    etc, etc.
+
+
+
+   //  See if the smaller one has an internal gap that corresponds to
+   //  N's in the genome.  If so, assume that the exon mapped to the
+   //  N's and pick the smaller.
+   //
+   //  Need code to process genome, finding N's larger than some threshold.
+   //  Output as 'genID beg end'
+
+
+   //  Find the largest number of exons for each of the contenders
+
+   numExonsi = numExonsm = 0;
+   for (uint32 i=0; i<pNum; i++) {
+     if ((p[i]->_percentIdentity == identitym) &&
+         (p[i]->_numMatches == nmatchesm) &&
+         (numExonsm < p[i]->_numExons))
+       numExonsm = p[i]->_numExons;
+     else if ((p[i]->_percentIdentity == identityi) &&
+              (p[i]->_numMatches == nmatchesi) &&
+              (numExonsi < p[i]->_numExons))
+       numExonsi = p[i]->_numExons;
+   }
+
+   if ((numExonsi > numExonsm + EPS_X) || (identityi > identitym + EPS_I)) {
+
+     if (doValidate) {
+       if (numExonsi > numExonsm + EPS_X)
+         fprintf(stdout, "--------------------6 (Exon Plus alpha > 0.8)\n");
+       else
+         fprintf(stdout, "--------------------7 (Pctid Plus alpha > 0.8)\n");
+
+       for (uint32 i=0; i<pNum; i++)
+         printPolishValidate(stdout, p[i], ((p[i]->_percentIdentity == identityi) &&
+                                            (p[i]->_numMatches == nmatchesi) &&
+                                            (p[i]->_numExons == numExonsi)));
+     } else {
+       for (uint32 i=0; i<pNum; i++)
+         if ((p[i]->_percentIdentity == identityi) &&
+             (p[i]->_numMatches == nmatchesi) &&
+             (p[i]->_numExons == numExonsi))
+           W->writeAlignment(p[i]);
+     }
+   } else {
+     numExons     = numExonsm;
+     tmp_nmatches = nmatchesm;
+     for (uint32 i=0; i<pNum; i++)
+       if ((p[i]->_percentIdentity == identitym) &&
+           (p[i]->_numMatches + EPS_N >= nmatchesm) &&
+           (numExons + EPS_X < p[i]->_numExons)) {
+         tmp_nmatches = p[i]->_numMatches;
+         numExons     = p[i]->_numExons;
+       }
+
+     if (doValidate) {
+       if (numExons == numExonsm)
+         fprintf(stdout, "--------------------8 (alpha > 0.8)\n");
+       else
+         fprintf(stdout, "--------------------9 (Exon alpha > 0.8)\n");
+       for (uint32 i=0; i<pNum; i++)
+         printPolishValidate(stdout, p[i], ((p[i]->_percentIdentity == identitym) &&
+                                            (p[i]->_numMatches == tmp_nmatches) &&
+                                            (p[i]->_numExons == numExons)));
+     } else {
+       for (uint32 i=0; i<pNum; i++)
+         if ((p[i]->_percentIdentity == identitym) &&
+             (p[i]->_numMatches == tmp_nmatches) &&
+             (p[i]->_numExons == numExons))
+           W->writeAlignment(p[i]);
+     }
+   }
+ }
+
+
+ // Runs the real picker on the pNum polishes in p[], then deletes every one
+ // of them.  Keeping the cleanup here lets pickBestSlave() return from any
+ // of its branches without freeing anything itself.  The slots of p[] are
+ // left holding the deleted pointers.
+ //
+ static
+ void
+ pickBest(sim4polish **p, uint32 pNum) {
+   sim4polish **last = p + pNum;
+
+   pickBestSlave(p, pNum);
+
+   for (sim4polish **it = p; it != last; it++)
+     delete *it;
+ }
+
+
+ // Reads polishes from stdin, groups consecutive polishes that share an est
+ // ID, and passes each group to pickBest(), which writes the chosen
+ // polish(es) to stdout (or, with -validate, prints a report instead).
+ //
+ // Options (the first four are matched on their first two characters only):
+ //     -n N        initial capacity of the polish array
+ //     -mrna       use the mRNA match-count margin (EPS_N_MRNA)
+ //     -ests       use the EST match-count margin (EPS_N_ESTS)
+ //     -validate   report all polishes, marking the chosen ones
+ //     -gff3       write output in GFF3 style
+ //
+ // Unknown options are reported on stderr and otherwise ignored.
+ //
+ int
+ main(int argc, char **argv) {
+   uint32       pNum   = 0;
+   uint32       pAlloc = 8388608;
+   uint32       estID  = ~uint32ZERO;
+
+   sim4polishStyle  style = sim4polishStyleDefault;
+
+   int arg = 1;
+   while (arg < argc) {
+     if        (strncmp(argv[arg], "-n", 2) == 0) {
+       pAlloc = atoi(argv[++arg]);
+     } else if (strncmp(argv[arg], "-mrna", 2) == 0) {
+       EPS_N = EPS_N_MRNA;
+     } else if (strncmp(argv[arg], "-ests", 2) == 0) {
+       EPS_N = EPS_N_ESTS;
+     } else if (strncmp(argv[arg], "-validate", 2) == 0) {
+       doValidate = 1;
+     } else if (strcmp(argv[arg], "-gff3") == 0) {
+       style = sim4polishGFF3;
+     } else {
+       fprintf(stderr, "unknown option: %s\n", argv[arg]);
+     }
+     arg++;
+   }
+
+   if (isatty(fileno(stdin))) {
+     fprintf(stderr, "usage: %s [-mrna|-ests] [-validate] [-gff3] < file > file\n", argv[0]);
+     fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+     exit(1);
+   }
+
+   //  Read polishes, picking the best when we see a change in the
+   //  estID.
+
+   sim4polishReader  *R = new sim4polishReader("-");
+   sim4polish       **p = new sim4polish * [pAlloc];
+   sim4polish        *q = 0L;
+
+   W = new sim4polishWriter("-", style);
+
+   if (R->getsim4polishStyle() != style)
+     fprintf(stderr, "warning: input format and output format differ.\n");
+
+   while (R->nextAlignment(q)) {
+     if ((q->_estID != estID) && (pNum > 0)) {
+       pickBest(p, pNum);
+       pNum = 0;
+     }
+
+     //  Grow the array when full.  The saved pointers are copied from the
+     //  old array 'p' into the new array 'P'; memcpy() takes the destination
+     //  first.  (The arguments used to be swapped, which overwrote the saved
+     //  pointers with the uninitialized contents of the new array.)
+     if (pNum >= pAlloc) {
+       sim4polish **P = new sim4polish * [pAlloc * 2];
+       memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+       delete [] p;
+       p = P;
+       pAlloc *= 2;
+     }
+
+     p[pNum++] = q;
+     estID     = q->_estID;
+
+     q = 0L;  //  Otherwise we delete the alignment we just saved!
+   }
+
+   if (pNum > 0)
+     pickBest(p, pNum);
+
+   delete [] p;
+
+   delete R;
+   delete W;
+
+   return(0);
+ }
+
diff --git a/sim4dbutils/pickUniquePolish-nhgri.C b/sim4dbutils/pickUniquePolish-nhgri.C
new file mode 100644
index 0000000..cbe7281
--- /dev/null
+++ b/sim4dbutils/pickUniquePolish-nhgri.C
@@ -0,0 +1,713 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+// Kaz Kylheku <kaz at ashi.footprints.net> library.
+#include "kazlib/dict.h"
+#include "kazlib/except.h"
+#include "kazlib/hash.h"
+#include "kazlib/list.h"
+#include "kazlib/sfx.h"
+
+this is now dead code. It was unused and needed too much effort to update to sim4polishReader
+
+// Derived from pickBestPolish.c. We report only the single best
+// match, when it is obvious that there is EXACTLY one best match.
+//
+// Example: we have ten matches, but one is 3%id better than everyone
+// else -- that is an obviously unique match. The rest are noise.
+//
+// Example: ten matches, but they're all about the same quality -- within
+// a few percent id, and about the same length. We pick no match, and
+// silently discard all.
+//
+
+// Modified to:
+// a) not print out unique matches
+// b) print hangs
+// c) print q20 bases inside mapped regions, outside, etc.
+//
+// It needs two args -f seq.fasta -q qlt.fasta, both must have an
+// index -- build it for the seq.fasta, and COPY the index to
+// qlt.fastaidx. Be sure to 'touch -r seq.fasta qlt.fasta' to get
+// the same timestamp on the files.
+//
+//
+// Further modified to behave like pickUniquePolish (print unique matches
+// to a specific file).
+//
+// so: pickUniquePolish-nhgri needs to read polishes on stdin
+// -f qry.fasta -- query sequences for quality comparison
+// -q qlt.fasta -- quality values for the query sequences
+// -scores X.scores -- write stats to file X
+// -unique X.bz2 -- write uniquely mapped stuff to bzip2 file X.bz2
+// -filter X -- filter out polishes less than X% of the longest
+// -output X.bz2 -- write filtered polishes to bzip2 file X.bz2
+//
+// It has two modes:
+// -f -q -- just compute stats on the input.
+// all options -- filter, and compute stats.
+//
+// bzip2 -dc pass?/map-gen*-qlt$id.sim4db.bz2 |
+// $bin/fixPolishesIID -c $qry -g $gen |
+// $bin/filterPolishes -node -D |
+// $bin/sortPolishes -c -m 768 -t /scratch -v |
+// $bin/pickUniquePolish-nhgri > all-$id.scores
+// -o all-$id.sim4db.bz2
+// -F X
+// -f $qry
+// -q $qlt
+// -stats all-$id.scores |
+// -uniq all-$id.sim4db.bz2
+
+
+
+uint32 statOneMatch = 0; // queries that had exactly one polish
+uint32 statConsistent = 0; // queries where best-identity and best-nmatches scores agree
+uint32 statInconsistent = 0; // queries where those two scores disagree
+uint32 statUnique = 0; // queries for which a single polish was reported
+uint32 statLost = 0; // queries for which no unique polish could be picked
+
+uint32 consistentTie = 0; // consistent, but several polishes share the best scores
+uint32 consistentMatches = 0; // consistent, and only one polish is of close quality
+uint32 consistentIdentity = 0; // never incremented in this file; only printed by main()
+uint32 consistentTooShort = 0; // consistent, not unique, and the best polish fails the length test
+uint32 consistentNot = 0; // consistent, but some other polish is of close quality
+
+uint32 totLQ = 0; // running totals of the left / middle / right
+uint32 totMQ = 0; // quality counts accumulated by analyze()
+uint32 totRQ = 0;
+
+seqCache *SEQ = 0L; // opened from the -f argument
+seqCache *QLT = 0L; // opened from the -q argument; read by analyze()
+
+double filter = 0.0; // -filter value; multiplies numMatches to get the output threshold
+FILE *oFile = 0L; // -output pipe for filtered polishes
+int oFileIsPipe = 0; // set to 1 when oFile is opened with popen()
+FILE *sFile = 0L; // -scores file; main() defaults it to stdout
+FILE *uFile = 0L; // -unique pipe for uniquely mapped polishes
+bool doFiltering = false; // set by -filter, -output, -scores and -unique
+
+
+// Count the quality values of sequence iid (read from QLT) that are at
+// least '0' + 20, in three consecutive stretches: up to clrl, from there
+// up to clrr, and from there up to len.  When clrl <= clrr <= len these
+// are the regions left of, inside, and right of the clear range.
+//
+// The counts are added to totLQ/totMQ/totRQ (left and right swapped for
+// a reverse match) and one tab-separated line is written to sFile, with
+// the counts in unswapped order followed by the type character.
+//
+void
+analyze(uint32  iid,
+        uint32  clrl,
+        uint32  clrr,
+        uint32  len,
+        bool    isForward,
+        char    type) {
+  seqInCore  *qlt    = QLT->getSequenceInCore(iid);
+  char       *qv     = qlt->sequence();
+
+  uint32      pos    = 0;
+  uint32      left   = 0;
+  uint32      middle = 0;
+  uint32      right  = 0;
+
+  //  The position is never reset, so each stretch starts where the
+  //  previous one stopped.
+
+  while (pos < clrl) {
+    if (qv[pos] >= '0' + 20)
+      left++;
+    pos++;
+  }
+
+  while (pos < clrr) {
+    if (qv[pos] >= '0' + 20)
+      middle++;
+    pos++;
+  }
+
+  while (pos < len) {
+    if (qv[pos] >= '0' + 20)
+      right++;
+    pos++;
+  }
+
+  delete qlt;
+
+  totLQ += (isForward) ? left  : right;
+  totMQ += middle;
+  totRQ += (isForward) ? right : left;
+
+  fprintf(sFile, uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t%c\n",
+          iid, clrl, clrr, len, left, middle, right, type);
+}
+
+
+// Analyze the first exon of polish p: derive the clear range from the
+// exon's query coordinates (flipped about the query length for a
+// complemented match) and pass it to the six-argument analyze().
+//
+void
+analyze(sim4polish *p,
+        char        type) {
+  bool    forward = (p->_matchOrientation != SIM4_MATCH_COMPLEMENT);
+
+  uint32  from    = p->_exons[0]._estFrom - 1;
+  uint32  to      = p->_exons[0]._estTo   - 1;
+
+  uint32  clrl    = (forward) ? from : p->_estLen - to;
+  uint32  clrr    = (forward) ? to   : p->_estLen - from;
+
+  analyze(p->_estID, clrl, clrr, p->_estLen, forward, type);
+}
+
+
+
+// Number of query bases between the start and the end of the first
+// exon of p.  Returns 0, rather than wrapping around, if the
+// coordinates are reversed.
+//
+static
+uint32
+firstExonSpan(sim4polish *p) {
+  uint32  bgn = p->_exons[0]._estFrom;
+  uint32  end = p->_exons[0]._estTo;
+
+  return((end > bgn) ? end - bgn : 0);
+}
+
+
+// Decide whether one of the pNum polishes of a single query is clearly
+// the best, and report accordingly.
+//
+//  - A lone polish is always unique: it is written to uFile and oFile
+//    (when open) and analyzed as type 'U'.
+//  - Otherwise the polish with the best percent identity and the one
+//    with the most matches are found.  If those scores agree, are not
+//    shared by a second polish, and no other polish comes within about
+//    2% of either score, the winner is written to uFile and analyzed as
+//    type 'G'.
+//  - In every other case the query is counted as lost and its longest
+//    polish is analyzed as type 'N'.
+//
+// When filtering is enabled, every polish with at least
+// filter * (numMatches of the chosen polish) matches is written to oFile.
+//
+// The polishes are not deleted here.
+//
+void
+pickBestSlave(sim4polish **p, uint32 pNum) {
+  uint32  identitym = 0, nmatchesm = 0;  //  Best score for the mList
+  uint32  identityi = 0, nmatchesi = 0;  //  Best score for the iList
+  uint32  matchi    = 0, matchm    = 0;
+
+  //  Difficult choice here....
+  //
+  if (pNum == 1) {
+    statOneMatch++;
+    statUnique++;
+
+    if (uFile)
+      p[0]->s4p_printPolish(uFile);
+
+    if (oFile)
+      p[0]->s4p_printPolish(oFile);
+
+    analyze(p[0], 'U');
+
+    return;
+  }
+
+  //  Find the best percentIdentity and best numberOfMatches.
+  //
+  //  identityi is the best percent identity of all the matches for this EST, and
+  //  nmatchesi is the number of matches for the longest best identity match(es).
+  //  matchi    is the match index
+  //
+  //  nmatchesm is the best numMatches of all the matches for this EST, and
+  //  identitym is the highest percent identity for the best numMatches match(es).
+  //  matchm    is the match index
+
+  for (uint32 i=0; i<pNum; i++) {
+    if ((p[i]->_percentIdentity > identityi) ||
+        (p[i]->_percentIdentity == identityi && p[i]->_numMatches > nmatchesi)) {
+      identityi = p[i]->_percentIdentity;
+      nmatchesi = p[i]->_numMatches;
+      matchi    = i;
+    }
+
+    if ((p[i]->_numMatches > nmatchesm) ||
+        (p[i]->_numMatches == nmatchesm && p[i]->_percentIdentity > identitym)) {
+      nmatchesm = p[i]->_numMatches;
+      identitym = p[i]->_percentIdentity;
+      matchm    = i;
+    }
+  }
+
+
+  bool  matchIsOK = false;
+
+  //  If we are in agreement on what the best quality match is,
+  //  see if the best match is obviously unique.
+  //
+  if ((identityi == identitym) ||
+      (nmatchesi == nmatchesm)) {
+    statConsistent++;
+
+    //  It's clear what the quality values of the best match is, but we
+    //  don't know if those values are shared by more than one match.
+    //  Count the number of matches with exactly those scores.  If
+    //  there is more than one, then we cannot pick out a single best.
+    //
+    uint32 numBest = 0;
+    for (uint32 i=0; i<pNum; i++)
+      if ((p[i]->_percentIdentity == identityi) && (p[i]->_numMatches == nmatchesi))
+        numBest++;
+
+    if (numBest > 1) {
+
+      //  Dang, we mapped this guy more than once, exactly the same!
+      //
+      consistentTie++;
+
+    } else {
+
+      //  We claim to have a single best match.  See if any other
+      //  matches are close to the quality of that one.
+
+      uint32 closeQuality = 0;
+
+      for (uint32 i=0; i<pNum; i++)
+        if (((p[i]->_percentIdentity * 102) >= (identityi * 100)) ||
+            ((p[i]->_numMatches      * 102) >= (nmatchesi * 100)))
+          closeQuality++;
+
+      //  If only one match has close quality (the one we want to save!),
+      //  save it.  Otherwise, label this query as multiple.
+
+      //  The span used to be computed as _estFrom - _estTo, which wraps
+      //  around in unsigned arithmetic.
+      uint32 length = firstExonSpan(p[matchi]);
+
+      //  The fraction must be computed in floating point; as an integer
+      //  division it could only ever be less than 0.5 when it was zero.
+      if (closeQuality == 1) {
+        matchIsOK = true;
+        consistentMatches++;
+      } else if ((length > 100) &&
+                 ((double)length / p[matchi]->_estLen < 0.5)) {
+        consistentTooShort++;
+      } else {
+        consistentNot++;
+      }
+    }
+
+  } else {
+
+    //  Otherwise, we disagree on what the best match is.
+    //
+    //  That is, the match with the highest identity is not the match
+    //  with the highest number of matches -- a longer match exists, but
+    //  at lower overall percent identity.
+
+    statInconsistent++;
+
+    //  Estimate the identity of the extended part, assuming the piece
+    //  matched in common is matched at about the same identity.  Or
+    //  just give up and say it's mapped to multiple places!
+
+  }
+
+
+  uint32  best  = 0;
+  uint32  besti = 0;
+
+  if (matchIsOK) {
+    statUnique++;
+    if (uFile)
+      p[matchi]->s4p_printPolish(uFile);
+
+    assert(matchi == matchm);
+
+    besti = matchi;
+    analyze(p[besti], 'G');
+  } else {
+    statLost++;
+
+    //  Just pick the longest match, analyze that.  Ties in length go
+    //  to the polish with more matches.
+
+    for (uint32 i=0; i<pNum; i++) {
+      uint32 len = firstExonSpan(p[i]);
+
+      if ((len  > best) ||
+          ((len == best) && (p[i]->_numMatches > p[besti]->_numMatches))) {
+        best  = len;
+        besti = i;
+      }
+    }
+
+    analyze(p[besti], 'N');
+  }
+
+
+#if 0
+  uint32  nm = (uint32)(p[besti]->_numMatches * 0.75);
+  uint32  sv = 0;
+
+  for (uint32 i=0; i<pNum; i++)
+    if (p[i]->_numMatches >= nm)
+      sv++;
+
+  fprintf(stderr, "Saved "uint32FMT" matches more than nmatches "uint32FMT" (from best of "uint32FMT")\n", sv, nm, p[besti]->_numMatches);
+#endif
+
+
+  //  besti is the best/longest match we have.  Decide on a threshold
+  //  to throw out the obvious junk.
+  //
+  if ((oFile) && (doFiltering)) {
+    uint32  nm = (uint32)(p[besti]->_numMatches * filter);
+
+    for (uint32 i=0; i<pNum; i++)
+      if (p[i]->_numMatches >= nm)
+        p[i]->s4p_printPolish(oFile);
+  }
+
+#if 0
+  fprintf(stderr, "Uni:"uint32FMTW(8)" Con:"uint32FMTW(8)" (T:"uint32FMTW(8)" M:"uint32FMTW(8)" I:"uint32FMTW(8)" N:"uint32FMTW(8)") Inc:"uint32FMTW(8)" -- Save:"uint32FMTW(8)" Lost:"uint32FMTW(8)"\r",
+          statOneMatch,
+          statConsistent, consistentTie, consistentMatches, consistentIdentity, consistentNot,
+          statInconsistent,
+          statUnique, statLost);
+#endif
+}
+
+
+
+
+
+
+
+// Run pickBestSlave() on one group of polishes, then release every
+// polish in the group.  Keeping the deletion here lets the picker
+// return early without worrying about cleanup.
+//
+void
+pickBest(sim4polish **p, uint32 pNum) {
+  uint32  k = 0;
+
+  pickBestSlave(p, pNum);
+
+  while (k < pNum) {
+    delete p[k];
+    k++;
+  }
+}
+
+
+
+
+
+
+
+dict_t *IIDdict = 0L; // deflines from the -fpart file; used for polishes as they are loaded from merge files
+dict_t *SEQdict = 0L; // deflines from the -F file; used for polishes returned by nextPolish()
+dict_t *GENdict = 0L; // deflines from the -g file; looked up by genomic defline in fixIID()
+
+// Replace the EST and genomic IDs of polish q with the values stored in
+// the dictionaries under the polish's deflines.
+//
+//   q        the polish to update (its _estID and _genID are overwritten)
+//   estdict  dictionary searched with q's EST defline; the genomic
+//            defline is always searched in GENdict
+//
+// If either dictionary was never created the polish is left untouched,
+// which is what main() announces with its "NOT FIXING IIDs" warning;
+// previously this case passed a null dictionary to dict_lookup().
+//
+// If a defline is missing from its dictionary, the polish is printed to
+// stdout, an error is reported on stderr, and the program exits.
+//
+void
+fixIID(sim4polish *q, dict_t *estdict) {
+
+  if ((estdict == 0L) || (GENdict == 0L))
+    return;
+
+  //  Fix the IID's
+  dnode_t *cid = dict_lookup(estdict, q->_estDefLine);
+  dnode_t *gid = dict_lookup(GENdict, q->_genDefLine);
+
+  if ((cid == 0L) || (gid == 0L)) {
+    //  Name whichever lookup failed: if one of them succeeded, the
+    //  other one is the culprit.
+    const char *msg = "both deflines";
+    if (cid)  msg = "genomic defline";
+    if (gid)  msg = "est defline";
+
+    q->s4p_printPolish(stdout);
+    fprintf(stderr, "ERROR:  Couldn't find %s (%p %p) in the dictionary!\n", msg, (void *)cid, (void *)gid);
+    exit(1);
+  }
+
+  //  The dictionary stores the IID directly in the node's data pointer.
+  q->_estID = (uint32)(unsigned long)dnode_get(cid);
+  q->_genID = (uint32)(unsigned long)dnode_get(gid);
+}
+
+
+
+
+
+
+
+
+
+
+
+//
+// Stolen from sortPolishes.  State for merging several polish files; the arrays are allocated and filled by main().
+//
+int mergeFilesLen; // number of merge files opened with -M
+int mergeFilesMax; // capacity of the arrays below; main() takes it from sysconf(_SC_OPEN_MAX)
+FILE **mergeFiles; // the open merge files
+char **mergeNames; // their names, as given on the command line
+sim4polish **mergePolishes; // the polish most recently loaded from each merge file
+
+// Return the next polish to process.
+//
+// Without merge files, a polish is read from stdin.  With merge files,
+// the smallest (by s4p_estIDcompare) of the polishes currently loaded
+// from each file is returned, and its slot is refilled from the same
+// file.  Returns 0L when the smallest slot is empty.
+//
+sim4polish *
+nextPolish(void) {
+
+  //  If no merge files, read from stdin
+  //
+  if (mergeFilesLen == 0)
+    return(new sim4polish(stdin));
+
+  //  Find the smallest polish among the loaded ones.
+  //
+  int  smallest = 0;
+
+  for (int candidate = 1; candidate < mergeFilesLen; candidate++)
+    if (s4p_estIDcompare(mergePolishes + smallest, mergePolishes + candidate) > 0)
+      smallest = candidate;
+
+  sim4polish *ret = mergePolishes[smallest];
+
+  //  If the smallest polish is 0L, we're all done.
+  //
+  if (ret == 0L)
+    return(0L);
+
+  //  Otherwise, hand out the current smallest and refill its slot.
+  //
+  mergePolishes[smallest] = new sim4polish(mergeFiles[smallest]);
+
+  //  fix the iid's to be consistent in our partition, so we can have the input files
+  //  sorted by est iid.
+  if (mergePolishes[smallest])
+    fixIID(mergePolishes[smallest], IIDdict);
+
+  //  fix the iid's to be consistent globally
+  fixIID(ret, SEQdict);
+
+  return(ret);
+}
+
+
+
+
+
+
+
+//
+//  Stolen from fixPolishesIID
+//
+//  Insert every sequence of file n into dictionary d, keyed by a copy
+//  of the sequence header, with the sequence IID stored as the node's
+//  data.  Does nothing when n is 0L.
+//
+void
+addToDict(dict_t *d, char *n) {
+
+  if (n == 0L)
+    return;
+
+  seqCache *F = new seqCache(n);
+
+  for (seqInCore *S = F->getSequenceInCore(); S != 0L; S = F->getSequenceInCore()) {
+    dnode_t *node = (dnode_t *)palloc(sizeof(dnode_t));
+    char    *dcpy = (char    *)palloc(sizeof(char) * S->headerLength() + 1);
+
+    //  The dictionary keeps the key pointer, so it needs its own copy
+    //  of the header; S is deleted below.
+    strcpy(dcpy, S->header());
+
+    dnode_init(node, (void *)(unsigned long)S->getIID());
+    dict_insert(d, node, dcpy);
+
+    delete S;
+  }
+
+  delete F;
+}
+
+// Key comparison function handed to dict_create().
+//
+// The dictionary keys are the header strings themselves (addToDict()
+// inserts a char *, fixIID() looks up a defline), and kazlib calls the
+// comparator with the two keys directly.  The arguments are therefore
+// strings, not pointers to strings as they would be for qsort(); the
+// earlier version dereferenced them once too often.
+//
+// Returns the strcmp() ordering of the two headers.
+//
+int
+headerCompare(const void *a, const void *b) {
+  const char *A = (const char *)a;
+  const char *B = (const char *)b;
+
+  return(strcmp(A, B));
+}
+
+
+
+
+
+
+
+
+// True if s ends with suffix.  Unlike comparing at s + strlen(s) - N,
+// this is safe when s is shorter than the suffix.
+//
+static
+bool
+endsWith(const char *s, const char *suffix) {
+  size_t  sLen = strlen(s);
+  size_t  xLen = strlen(suffix);
+
+  return((sLen >= xLen) && (strcmp(s + sLen - xLen, suffix) == 0));
+}
+
+
+// Reads polishes (from stdin, or merged from the -M files), groups
+// consecutive polishes with the same estID, and runs pickBest() on each
+// group.  Sequences that never appeared in a polish are analyzed as
+// type 'M' at the end, and summary statistics go to stderr.
+//
+// Options:
+//   -n N          initial capacity of the per-EST polish array
+//   -fpart file   query deflines for the input partition (IIDdict)
+//   -F file       query deflines, global (SEQdict)
+//   -g file       genomic deflines (GENdict)
+//   -f file       query sequences (required)
+//   -q file       query quality values (required)
+//   -filter X     fraction (0..1) of the best numMatches a polish needs
+//   -output file  filtered polishes; must end in .bz2 or .gz
+//   -scores file  per-sequence statistics (default stdout)
+//   -unique file  uniquely mapped polishes
+//   -M files...   polish files to merge instead of reading stdin
+//
+int
+main(int argc, char **argv) {
+  uint32  pNum   = 0;
+  uint32  pAlloc = 8388608;
+  uint32  estID  = ~uint32ZERO;
+
+  bool   *found  = 0L;
+
+  //  From fixPolishesIID.c
+  IIDdict = 0L;
+  SEQdict = 0L;
+  GENdict = 0L;
+
+  //  Incorporated from sortPolishes
+  mergeFilesLen = 0;
+  mergeFilesMax = sysconf(_SC_OPEN_MAX);
+  mergeFiles    = new FILE *       [mergeFilesMax];
+  mergeNames    = new char *       [mergeFilesMax];
+  mergePolishes = new sim4polish * [mergeFilesMax];
+
+  //  Default to printing stats on stdout.
+  sFile = stdout;
+
+  int arg = 1;
+  while (arg < argc) {
+    if (strcmp(argv[arg], "-n") == 0) {
+      pAlloc = strtouint32(argv[++arg], 0L);
+
+      //  A zero capacity could never grow by doubling.
+      if (pAlloc == 0)
+        pAlloc = 1;
+
+    } else if (strcmp(argv[arg], "-fpart") == 0) {
+      arg++;
+      fprintf(stderr, "reading query deflines from '%s'\n", argv[arg]);
+      IIDdict = dict_create(DICTCOUNT_T_MAX, headerCompare);
+      addToDict(IIDdict, argv[arg]);
+    } else if (strcmp(argv[arg], "-g") == 0) {
+      ++arg;
+      fprintf(stderr, "reading genomic deflines from '%s'\n", argv[arg]);
+      GENdict = dict_create(DICTCOUNT_T_MAX, headerCompare);
+      addToDict(GENdict, argv[arg]);
+    } else if (strcmp(argv[arg], "-F") == 0) {
+      ++arg;
+      fprintf(stderr, "reading query deflines from '%s'\n", argv[arg]);
+      SEQdict = dict_create(DICTCOUNT_T_MAX, headerCompare);
+      addToDict(SEQdict, argv[arg]);
+    } else if (strcmp(argv[arg], "-f") == 0) {
+      ++arg;
+      SEQ = new seqCache(argv[arg]);
+    } else if (strcmp(argv[arg], "-q") == 0) {
+      ++arg;
+      QLT = new seqCache(argv[arg]);
+
+    } else if (strcmp(argv[arg], "-filter") == 0) {
+      filter = atof(argv[++arg]);
+      doFiltering = true;
+    } else if (strcmp(argv[arg], "-output") == 0) {
+      //  NOTE: the file name is handed to the shell unquoted.
+      char  cmd[1024] = {0};
+      errno = 0;
+      ++arg;
+      if (endsWith(argv[arg], ".bz2")) {
+        snprintf(cmd, sizeof(cmd), "bzip2 -1c > %s", argv[arg]);
+        oFile = popen(cmd, "w");
+        oFileIsPipe = 1;
+      } else if (endsWith(argv[arg], ".gz")) {
+        snprintf(cmd, sizeof(cmd), "gzip -1c > %s", argv[arg]);
+        oFile = popen(cmd, "w");
+        oFileIsPipe = 1;
+      } else {
+        fprintf(stderr, "Got %s, not .bz2 not .gz!\n", argv[arg]);
+        exit(1);
+      }
+      if (errno)
+        fprintf(stderr, "Failed to open '%s': %s\n", cmd, strerror(errno));
+      doFiltering = true;
+    } else if (strcmp(argv[arg], "-scores") == 0) {
+      errno = 0;
+      sFile = fopen(argv[++arg], "w");
+      //  arg now indexes the file name; report that, not the option.
+      if (errno)
+        fprintf(stderr, "Failed to open '%s': %s\n", argv[arg], strerror(errno));
+      doFiltering = true;
+    } else if (strcmp(argv[arg], "-unique") == 0) {
+      //  NOTE: the file name is handed to the shell unquoted.
+      char  cmd[1024] = {0};
+      errno = 0;
+      arg++;
+      if      (endsWith(argv[arg], ".bz2"))
+        snprintf(cmd, sizeof(cmd), "bzip2 -1c > %s", argv[arg]);
+      else if (endsWith(argv[arg], ".gz"))
+        snprintf(cmd, sizeof(cmd), "gzip -1c > %s", argv[arg]);
+      else
+        snprintf(cmd, sizeof(cmd), "cat > %s", argv[arg]);
+      uFile = popen(cmd, "w");
+      if (errno)
+        fprintf(stderr, "Failed to open '%s': %s\n", cmd, strerror(errno));
+      doFiltering = true;
+
+    } else if (strncmp(argv[arg], "-M", 2) == 0) {
+      //  Every following argument that names an existing file is a
+      //  merge input; stop at the first one that doesn't.
+      arg++;
+      while ((arg < argc) && (fileExists(argv[arg]))) {
+        if (mergeFilesLen >= mergeFilesMax) {
+          fprintf(stderr, "%s: ERROR!  Too many input files!  Should be less than %d\n", argv[0], mergeFilesMax);
+          exit(1);
+        }
+        mergeNames[mergeFilesLen]   = argv[arg];
+        mergeFiles[mergeFilesLen++] = openFile(argv[arg], "r");
+        arg++;
+      }
+      arg--;
+
+    } else {
+      fprintf(stderr, "unknown option: %s\n", argv[arg]);
+    }
+    arg++;
+  }
+
+
+  if (doFiltering) {
+    if (uFile == 0L)
+      fprintf(stderr, "ERROR:  -unique is required\n"), exit(1);
+    if (sFile == 0L)
+      fprintf(stderr, "ERROR:  -scores is required\n"), exit(1);
+    //  The filter is a fraction, so the message must say 0..1.
+    if ((filter < 0.0) || (filter > 1.0))
+      fprintf(stderr, "ERROR:  -filter value of %f invalid.  0 <= F <= 1.\n", filter), exit(1);
+  }
+
+
+  if ((IIDdict == 0L) || (SEQdict == 0L) || (GENdict == 0L)) {
+    fprintf(stderr, "WARNING!  No sequence dictionaries, NOT FIXING IIDs!  (supply -fpart, -F and -g)\n");
+  }
+
+
+  if ((SEQ == 0L) || (QLT == 0L)) {
+    fprintf(stderr, "I need -f and -q\n");
+    exit(1);
+  }
+
+  //  We no longer require that input polishes be sorted increasingly;
+  //  now they only must be grouped.  This remembers if we've seen a
+  //  match or not.  At the end, we'll analyze() those we haven't done
+  //  already.
+  //
+  uint32  numSeqs = SEQ->getNumberOfSequences();
+
+  found = new bool [ numSeqs ];
+  for (uint32 i=0; i<numSeqs; i++)
+    found[i] = false;
+
+
+  //  Initialize the merge -- if no merge files, nothing done!
+  //
+  for (int i=0; i<mergeFilesLen; i++) {
+    mergePolishes[i] = new sim4polish(mergeFiles[i]);
+    fixIID(mergePolishes[i], IIDdict);
+  }
+
+
+  //  Read polishes, picking the best when we see a change in the
+  //  estID.
+
+  sim4polish **p = new sim4polish * [pAlloc];
+  sim4polish  *q;
+
+  while ((q = nextPolish()) != 0L) {
+
+    if ((q->_estID != estID) && (pNum > 0)) {
+      //fprintf(stderr, "PickBest for estID "uint32FMT"\n", estID);
+
+      //  Don't write outside found[] if a polish carries a bogus ID.
+      if (estID < numSeqs)
+        found[estID] = true;
+      pickBest(p, pNum);
+      pNum  = 0;
+    }
+
+    //  Grow the array.  The saved pointers must be copied from the old
+    //  array (p) into the new one (P); the original had the memcpy
+    //  arguments the other way around.
+    if (pNum >= pAlloc) {
+      sim4polish **P = new sim4polish * [pAlloc * 2];
+      memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+      delete [] p;
+      p = P;
+      pAlloc *= 2;
+    }
+
+    p[pNum++] = q;
+    estID     = q->_estID;
+  }
+
+  if (pNum > 0) {
+    if (estID < numSeqs)
+      found[estID] = true;
+    pickBest(p, pNum);
+  }
+
+  //  Attempt cleanup
+  //
+  for (int i=0; i<mergeFilesLen; i++)
+    closeFile(mergeFiles[i], mergeNames[i]);
+
+  //  Sequences with no polish at all are reported as type 'M', with the
+  //  whole sequence counted as the middle region.
+  for (estID=0; estID < numSeqs; estID++)
+    if (found[estID] == false)
+      analyze(estID, 0, SEQ->getSequenceLength(estID), SEQ->getSequenceLength(estID), true, 'M');
+
+  delete [] p;
+  delete [] found;
+
+  delete [] mergeFiles;
+  delete [] mergeNames;
+  delete [] mergePolishes;
+
+  if (oFile)  pclose(oFile);
+  if (uFile)  pclose(uFile);
+  if (sFile)  fclose(sFile);
+
+  fprintf(stderr, "Uni:"uint32FMTW(8)" Con:"uint32FMTW(8)" (T:"uint32FMTW(8)" M:"uint32FMTW(8)" I:"uint32FMTW(8)" S:"uint32FMTW(8)" N:"uint32FMTW(8)") Inc:"uint32FMTW(8)" -- Save:"uint32FMTW(8)" Lost:"uint32FMTW(8)"\n",
+          statOneMatch,
+          statConsistent, consistentTie, consistentMatches, consistentIdentity, consistentTooShort, consistentNot,
+          statInconsistent,
+          statUnique, statLost);
+  fprintf(stderr, "total:  LQ:"uint32FMT" MQ:"uint32FMT" RQ:"uint32FMT"\n",
+          totLQ, totMQ, totRQ);
+
+  return(0);
+}
+
diff --git a/sim4dbutils/pickUniquePolish.C b/sim4dbutils/pickUniquePolish.C
new file mode 100644
index 0000000..6418726
--- /dev/null
+++ b/sim4dbutils/pickUniquePolish.C
@@ -0,0 +1,382 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "bio.h"
+#include "sim4.H"
+
+// Derived from pickBestPolish.c. We report only the single best
+// match, when it is obvious that there is EXACTLY one best match.
+//
+// Example: we have ten matches, but one is 3%id better than everyone
+// else -- that is an obviously unique match. The rest are noise.
+//
+// Example: ten matches, but they're all about the same quality -- within
+// a few percent id, and about the same length. We pick no match, and
+// silently discard all.
+//
+
+uint32 statOneMatch = 0; // queries that had exactly one polish
+uint32 statConsistent = 0; // queries where best-identity and best-nmatches scores agree
+uint32 statInconsistent = 0; // queries where those two scores disagree
+uint32 statUnique = 0; // queries for which a single polish was written
+uint32 statLost = 0; // queries for which nothing was written
+
+uint32 consistentTie = 0; // consistent, but several polishes share the best scores
+uint32 consistentMatches = 0; // consistent, and only one polish is of close quality
+uint32 consistentIdentity = 0; // never incremented in this file
+uint32 consistentTooShort = 0; // consistent, not unique, and the best polish fails the length test
+uint32 consistentNot = 0; // consistent, but some other polish is of close quality
+
+uint32 totLQ = 0; // these three totals are not referenced
+uint32 totMQ = 0; // anywhere else in this file
+uint32 totRQ = 0;
+
+uint32 qualityDifference = 5; // percent margin for the closeness test in pickUniqueSlave(); set by -q
+uint32 minQuality = 95; // not referenced anywhere else in this file
+
+sim4polishWriter *W = 0L; // output writer; created and deleted by main()
+
+
+
+// Number of query bases between the start and the end of the first
+// exon of p.  Returns 0, rather than wrapping around, if the
+// coordinates are reversed.
+//
+static
+uint32
+firstExonSpan(sim4polish *p) {
+  uint32  bgn = p->_exons[0]._estFrom;
+  uint32  end = p->_exons[0]._estTo;
+
+  return((end > bgn) ? end - bgn : 0);
+}
+
+
+// Write the single best polish of one query to W, if there is one.
+//
+//  - A lone polish is always written.
+//  - Otherwise the polish with the best percent identity and the one
+//    with the most matches are found.  If those scores agree, are not
+//    shared by a second polish, and no other polish comes within
+//    qualityDifference percent of either score, the winner is written.
+//  - In every other case nothing is written and the query is counted
+//    as lost.
+//
+// The polishes are not deleted here.
+//
+void
+pickUniqueSlave(sim4polish **p, uint32 pNum) {
+  uint32  identitym = 0, nmatchesm = 0;  //  Best score for the mList
+  uint32  identityi = 0, nmatchesi = 0;  //  Best score for the iList
+  uint32  matchi    = 0, matchm    = 0;
+
+  //  Difficult choice here....
+  //
+  if (pNum == 1) {
+    statOneMatch++;
+    statUnique++;
+    W->writeAlignment(p[0]);
+    return;
+  }
+
+  //  Find the best percentIdentity and best numberOfMatches.
+  //
+  //  identityi is the best percent identity of all the matches for this EST, and
+  //  nmatchesi is the number of matches for the longest best identity match(es).
+  //  matchi    is the match index
+  //
+  //  nmatchesm is the best numMatches of all the matches for this EST, and
+  //  identitym is the highest percent identity for the best numMatches match(es).
+  //  matchm    is the match index
+
+  for (uint32 i=0; i<pNum; i++) {
+    if ((p[i]->_percentIdentity > identityi) ||
+        (p[i]->_percentIdentity == identityi && p[i]->_numMatches > nmatchesi)) {
+      identityi = p[i]->_percentIdentity;
+      nmatchesi = p[i]->_numMatches;
+      matchi    = i;
+    }
+
+    if ((p[i]->_numMatches > nmatchesm) ||
+        (p[i]->_numMatches == nmatchesm && p[i]->_percentIdentity > identitym)) {
+      nmatchesm = p[i]->_numMatches;
+      identitym = p[i]->_percentIdentity;
+      matchm    = i;
+    }
+  }
+
+  bool  matchIsOK = false;
+
+  //  If we are in agreement on what the best quality match is,
+  //  see if the best match is obviously unique.
+  //
+  if ((identityi == identitym) ||
+      (nmatchesi == nmatchesm)) {
+    statConsistent++;
+
+    //  It's clear what the quality values of the best match is, but we
+    //  don't know if those values are shared by more than one match.
+    //  Count the number of matches with exactly those scores.  If
+    //  there is more than one, then we cannot pick out a single best.
+    //
+    uint32 numBest = 0;
+    for (uint32 i=0; i<pNum; i++)
+      if ((p[i]->_percentIdentity == identityi) && (p[i]->_numMatches == nmatchesi))
+        numBest++;
+
+    if (numBest > 1) {
+
+      //  Dang, we mapped this guy more than once, exactly the same!
+      //
+      consistentTie++;
+
+    } else {
+
+      //  We claim to have a single best match.  See if any other
+      //  matches are close to the quality of that one.
+      //
+      //  This says if (p[i]/ii >= 1.0 - Q), then we're close.
+
+      uint32 closeQuality = 0;
+      for (uint32 i=0; i<pNum; i++)
+        if (((p[i]->_percentIdentity * 100) >= (identityi * (100 - qualityDifference))) ||
+            ((p[i]->_numMatches      * 100) >= (nmatchesi * (100 - qualityDifference))))
+          closeQuality++;
+
+      //  If only one match has close quality (the one we want to save!),
+      //  save it.  Otherwise, label this query as multiple.
+
+      //  The span used to be computed as _estFrom - _estTo, which wraps
+      //  around in unsigned arithmetic.
+      uint32 length = firstExonSpan(p[matchi]);
+
+      //  The fraction must be computed in floating point; as an integer
+      //  division it could only ever be less than 0.5 when it was zero.
+      if (closeQuality == 1) {
+        matchIsOK = true;
+        consistentMatches++;
+      } else if ((length > 100) &&
+                 ((double)length / p[matchi]->_estLen < 0.5)) {
+        consistentTooShort++;
+      } else {
+        consistentNot++;
+      }
+    }
+
+  } else {
+
+    //  Otherwise, we disagree on what the best match is.
+    //
+    //  That is, the match with the highest identity is not the match
+    //  with the highest number of matches -- a longer match exists, but
+    //  at lower overall percent identity.
+
+    statInconsistent++;
+
+    //  Estimate the identity of the extended part, assuming the piece
+    //  matched in common is matched at about the same identity.  Or
+    //  just give up and say it's mapped to multiple places!
+
+  }
+
+
+  if (matchIsOK) {
+    statUnique++;
+    assert(matchi == matchm);
+    W->writeAlignment(p[matchi]);
+  } else {
+    statLost++;
+  }
+}
+
+
+
+
+
+// Delete every polish whose first-exon interval is contained, or contained to within 5 bases, in the interval of another polish; write the survivors to W.
+// NOTE(review): the earlier comment also said that close ties of clearly lower quality are deleted; no quality comparison is visible in this function.
+// doCovering selects the coordinates: 'q' for the query (EST), 'g' for the genomic sequence.  Deleted entries of p are set to NULL.
+void
+pickCoveringSlave(sim4polish **p, uint32 pNum, char doCovering) {
+ uint32 *bgn = new uint32 [pNum];
+ uint32 *end = new uint32 [pNum];
+ // Compute the interval of each polish's first exon: bgn is the exon start minus one, end is the exon end.
+ for (uint32 i=0; i<pNum; i++) {
+ if (doCovering == 'q') {
+ if (p[i]->_matchOrientation == SIM4_MATCH_FORWARD) {
+ bgn[i] = p[i]->_exons[0]._estFrom - 1;
+ end[i] = p[i]->_exons[0]._estTo;
+ } else { // any other orientation: flip the interval about the query length
+ bgn[i] = p[i]->_estLen - p[i]->_exons[0]._estTo;
+ end[i] = p[i]->_estLen - p[i]->_exons[0]._estFrom + 1;
+ }
+ }
+ // bgn[i] and end[i] are left unset if doCovering is neither 'q' nor 'g'.
+ if (doCovering == 'g') {
+ bgn[i] = p[i]->_exons[0]._genFrom - 1;
+ end[i] = p[i]->_exons[0]._genTo;
+ }
+ }
+ // Compare every surviving pair (i, j > i).  'break' abandons an i that was just deleted;
+ // 'continue' moves on to the next j after the current j was deleted.
+ for (uint32 i=0; i<pNum; i++) {
+ if (p[i] == NULL)
+ continue;
+
+ assert(p[i]->_numExons == 1); // the intervals above only look at exon 0
+
+ for (uint32 j=i+1; j<pNum; j++) {
+ if (p[j] == NULL)
+ continue;
+
+ // i contained in j
+ // ----
+ // ---------
+ if ((bgn[j] <= bgn[i]) && (end[i] <= end[j])) {
+ delete p[i]; p[i] = NULL;
+ break; // This i is finished.
+ }
+
+ // j contained in i
+ // ---------
+ // ----
+ if ((bgn[i] <= bgn[j]) && (end[j] <= end[i])) {
+ delete p[j]; p[j] = NULL;
+ continue; // This j is finished.
+ }
+
+ // i almost contained in j
+ // ---- ----
+ // --------- OR ----------
+ if (((bgn[j] <= bgn[i] + 5) && (end[i] <= end[j])) ||
+ ((bgn[j] <= bgn[i]) && (end[i] <= end[j] + 5))) {
+ delete p[i]; p[i] = NULL;
+ break; // This i is finished.
+ }
+
+ // j almost contained in i
+ // --------- OR ----------
+ // ---- ----
+ if (((bgn[i] <= bgn[j] + 5) && (end[j] <= end[i])) ||
+ ((bgn[i] <= bgn[j]) && (end[j] <= end[i] + 5))) {
+ delete p[j]; p[j] = NULL;
+ continue; // This j is finished.
+ }
+ }
+ }
+ // Write whatever was not deleted, in input order.
+ for (uint32 i=0; i<pNum; i++) {
+ if (p[i] == NULL)
+ continue;
+
+ W->writeAlignment(p[i]);
+ }
+
+ delete [] bgn;
+ delete [] end;
+}
+
+
+
+
+
+
+// Process one group of polishes with the picker selected by doCovering
+// (zero: pickUniqueSlave; anything else: pickCoveringSlave), then
+// release every polish in the group.  Entries that the covering picker
+// already deleted were set to NULL there, so deleting them again is
+// harmless.
+//
+void
+pickUnique(sim4polish **p, uint32 pNum, char doCovering) {
+  uint32  k = 0;
+
+  if (doCovering == 0)
+    pickUniqueSlave(p, pNum);
+  else
+    pickCoveringSlave(p, pNum, doCovering);
+
+  while (k < pNum) {
+    delete p[k];
+    k++;
+  }
+}
+
+
+
+
+// Reads polishes from stdin, groups consecutive polishes that share an
+// ID, and runs pickUnique() on each group.  Groups are formed on the
+// genomic ID with -cg and on the EST ID otherwise.
+//
+// Options:
+//   -q qualDiff  percent margin used by the uniqueness test (0..100)
+//   -cq          report polishes not contained in another, on the query
+//   -cg          report polishes not contained in another, on the genome
+//   -gff3        write output in GFF3 style
+//
+int
+main(int argc, char **argv) {
+  char    doCovering = 0;
+  uint32  pNum       = 0;
+  uint32  pAlloc     = 1048576;
+  uint32  lastID     = ~uint32ZERO;
+
+  sim4polishStyle style = sim4polishStyleDefault;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-cq") == 0) {
+      doCovering = 'q';
+
+    } else if (strcmp(argv[arg], "-cg") == 0) {
+      doCovering = 'g';
+
+    } else if (strcmp(argv[arg], "-q") == 0) {
+      //  Don't read past the end of argv when the value is missing.
+      if (arg + 1 >= argc) {
+        fprintf(stderr, "%s: option -q needs a value\n", argv[0]);
+        exit(1);
+      }
+      qualityDifference = strtouint32(argv[++arg], 0L);
+
+      //  The test computes (100 - qualityDifference) in unsigned
+      //  arithmetic, so larger values would wrap around.
+      if (qualityDifference > 100) {
+        fprintf(stderr, "%s: -q value must be between 0 and 100\n", argv[0]);
+        exit(1);
+      }
+
+    } else if (strcmp(argv[arg], "-gff3") == 0) {
+      style = sim4polishGFF3;
+
+    } else {
+      fprintf(stderr, "unknown option: %s\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if (isatty(fileno(stdin))) {
+    //  The usage line now lists the options the parser actually accepts.
+    fprintf(stderr, "usage: %s [-q qualDiff] [-cq | -cg] [-gff3] < file > file\n", argv[0]);
+    fprintf(stderr, "  -q qualDiff    Only report alignments where the best is qualDiff better\n");
+    fprintf(stderr, "                 in percent identity and coverage\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "  -cq            Only report alignments that are not contained in some\n");
+    fprintf(stderr, "                 other alignment in the QUERY SEQUENCE.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "  -cg            Only report alignments that are not contained in some\n");
+    fprintf(stderr, "                 other alignment in the GENOMIC SEQUENCE.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+    exit(1);
+  }
+
+  //  Read polishes, picking the best when we see a change in the
+  //  grouping ID.
+
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish       **p = new sim4polish * [pAlloc];
+  sim4polish        *q = 0L;
+
+  W = new sim4polishWriter("-", style);
+
+  if (R->getsim4polishStyle() != style)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+  while (R->nextAlignment(q)) {
+    uint32  thisID = (doCovering == 'g') ? q->_genID : q->_estID;
+
+    if ((thisID != lastID) && (pNum > 0)) {
+      pickUnique(p, pNum, doCovering);
+      pNum = 0;
+    }
+
+    //  Grow the array.  The saved pointers must be copied from the old
+    //  array (p) into the new one (P); the original had the memcpy
+    //  arguments the other way around.
+    if (pNum >= pAlloc) {
+      sim4polish **P = new sim4polish * [pAlloc * 2];
+      memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+      delete [] p;
+      p = P;
+      pAlloc *= 2;
+    }
+
+    p[pNum++] = q;
+    lastID    = thisID;
+
+    q = 0L;  //  Otherwise we delete the alignment we just saved!
+  }
+
+  //  Flush the final group.
+  if (pNum > 0)
+    pickUnique(p, pNum, doCovering);
+
+#if 0
+  fprintf(stderr, "Uni:"uint32FMTW(8)" Con:"uint32FMTW(8)" (T:"uint32FMTW(8)" M:"uint32FMTW(8)" I:"uint32FMTW(8)" N:"uint32FMTW(8)") Inc:"uint32FMTW(8)" -- Save:"uint32FMTW(8)" Lost:"uint32FMTW(8)"\n",
+          statOneMatch,
+          statConsistent, consistentTie, consistentMatches, consistentIdentity, consistentNot,
+          statInconsistent,
+          statUnique, statLost);
+#endif
+
+  delete [] p;
+
+  delete R;
+  delete W;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/plotCoverageVsIdentity.C b/sim4dbutils/plotCoverageVsIdentity.C
new file mode 100644
index 0000000..846c515
--- /dev/null
+++ b/sim4dbutils/plotCoverageVsIdentity.C
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+
+// Open name for writing, or report the failure and exit.
+//
+static
+FILE *
+openForWriting(const char *name) {
+  errno = 0;
+  FILE *F = fopen(name, "w");
+
+  if (F == 0L) {
+    fprintf(stderr, "error: couldn't open '%s' for writing: %s\n", name, strerror(errno));
+    exit(1);
+  }
+
+  return(F);
+}
+
+
+// Reads polishes from stdin and writes three files in the current
+// directory:
+//   coverage.histogram  counts of _querySeqIdentity values 0..100, one per line
+//   identity.histogram  counts of _percentIdentity values 0..100, one per line
+//   c-vs-i.scatter      one "_percentIdentity _querySeqIdentity" pair per polish
+//
+int
+main(int argc, char ** argv) {
+  int   c[101] = {0};
+  int   i[101] = {0};
+
+  if (isatty(fileno(stdin))) {
+    fprintf(stderr, "creates three files:\n");
+    fprintf(stderr, "  coverage.histogram\n");
+    fprintf(stderr, "  identity.histogram\n");
+    fprintf(stderr, "  c-vs-i.scatter\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+    exit(1);
+  }
+
+  //  Previously the fopen() results were used unchecked.
+  FILE *C = openForWriting("coverage.histogram");
+  FILE *I = openForWriting("identity.histogram");
+  FILE *S = openForWriting("c-vs-i.scatter");
+
+  sim4polishReader *R = new sim4polishReader("-");
+  sim4polish       *p = 0L;
+
+  while (R->nextAlignment(p)) {
+    fprintf(S, uint32FMT" "uint32FMT"\n", p->_percentIdentity, p->_querySeqIdentity);
+
+    //  The histograms have 101 bins; skip, rather than write outside
+    //  the arrays, if a polish carries a value above 100.
+    if (p->_percentIdentity <= 100)
+      i[p->_percentIdentity]++;
+    if (p->_querySeqIdentity <= 100)
+      c[p->_querySeqIdentity]++;
+  }
+
+  for (int x=0; x<101; x++) {
+    fprintf(C, "%d\n", c[x]);
+    fprintf(I, "%d\n", i[x]);
+  }
+
+  delete R;
+
+  fclose(C);
+  fclose(I);
+  fclose(S);
+
+  return(0);
+}
diff --git a/sim4dbutils/plotIntronSize.C b/sim4dbutils/plotIntronSize.C
new file mode 100644
index 0000000..481516e
--- /dev/null
+++ b/sim4dbutils/plotIntronSize.C
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4reader.h"
+
+//
+// Outputs some statistics on the matches
+//
+
+#define HISTBIN (1000) // width of one histogram bin; intron sizes are divided by this to get the bin index
+#define HISTMAX (300000000 / HISTBIN) // number of bins allocated for the histogram
+
+// Write bins 0 through the last non-empty bin of hist to out, one count
+// per line, then close out.  Nothing is written if every bin is empty.
+//
+static
+void
+writeHistogram(FILE *out, uint32 *hist) {
+  int  last = HISTMAX - 1;
+
+  //  Test the index before reading the bin; the earlier loop read
+  //  hist[-1] when the histogram was empty.
+  while ((last >= 0) && (hist[last] == 0))
+    last--;
+
+  //  Include the last non-empty bin itself.
+  for (int i=0; i<=last; i++)
+    fprintf(out, "%u\n", (unsigned int)hist[i]);
+
+  fclose(out);
+}
+
+
+// Reads polishes from stdin and builds a histogram of intron sizes, in
+// bins of HISTBIN bases.
+//
+// Options (matched on their first two characters):
+//   -dump N    print to stdout every polish whose biggest intron exceeds N
+//   -all file  histogram of every intron
+//   -big file  histogram of the biggest intron of each polish
+//
+// NOTE(review): -all and -big share a single histogram, so giving both
+// mixes the two counts.  That behaviour is unchanged here.
+//
+// NOTE(review): this file mixed the old polish API (numExons,
+// printPolish, destroyPolish) with the sim4polish class.  It now uses
+// the class members consistently; the "sim4reader.h" include probably
+// needs to become "sim4.H" for this to build -- confirm.
+//
+int
+main(int argc, char ** argv) {
+  uint32   dumpSize = 0;
+  uint32  *hist     = 0L;
+  FILE    *all      = 0L;
+  FILE    *big      = 0L;
+
+  if (isatty(fileno(stdin))) {
+    fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    bool  isDump = (strncmp(argv[arg], "-dump", 2) == 0);
+    bool  isAll  = (strncmp(argv[arg], "-all",  2) == 0);
+    bool  isBig  = (strncmp(argv[arg], "-big",  2) == 0);
+
+    //  Every option takes a value; don't read past the end of argv.
+    if ((isDump || isAll || isBig) && (arg + 1 >= argc)) {
+      fprintf(stderr, "Option '%s' needs a value\n", argv[arg]);
+      exit(1);
+    }
+
+    if        (isDump) {
+      dumpSize = atoi(argv[++arg]);
+    } else if (isAll) {
+      all = fopen(argv[++arg], "w");
+      if (all == 0L) {
+        fprintf(stderr, "Can't open '%s' for writing\n", argv[arg]);
+        exit(1);
+      }
+    } else if (isBig) {
+      big = fopen(argv[++arg], "w");
+      if (big == 0L) {
+        fprintf(stderr, "Can't open '%s' for writing\n", argv[arg]);
+        exit(1);
+      }
+    } else {
+      fprintf(stderr, "Unknown option: '%s'\n", argv[arg]);
+    }
+
+    arg++;
+  }
+
+  if (all || big) {
+    hist = new uint32 [HISTMAX];
+    memset(hist, 0, sizeof(uint32) * HISTMAX);
+  }
+
+  //  A polish without exons marks the end of the input.
+  sim4polish *p = new sim4polish(stdin);
+  while (p->_numExons > 0) {
+    if (p->_numExons > 1) {
+      int  biggestIntron = 0;
+
+      //  Walk adjacent exon pairs; the gap between them is the intron.
+      for (uint32 exA=0, exB=1; exB < p->_numExons; exA++, exB++) {
+        int dist = p->_exons[exB]._genFrom - p->_exons[exA]._genTo + 1;
+        if (dist > biggestIntron)
+          biggestIntron = dist;
+        if ((all) && (dist >= 0) && (dist / HISTBIN < HISTMAX))
+          hist[dist / HISTBIN]++;
+      }
+
+      if ((big) && (biggestIntron / HISTBIN < HISTMAX))
+        hist[biggestIntron / HISTBIN]++;
+
+      //fprintf(stdout, "%d\n", biggestIntron);
+
+      //  biggestIntron is never negative, so the cast is safe.
+      if ((dumpSize > 0) && ((uint32)biggestIntron > dumpSize))
+        p->s4p_printPolish(stdout);
+    }
+
+    //  Release this polish and read the next one; the earlier loop
+    //  destroyed the polish but never read another.
+    delete p;
+    p = new sim4polish(stdin);
+  }
+  delete p;
+
+  if (all)
+    writeHistogram(all, hist);
+
+  if (big)
+    writeHistogram(big, hist);
+
+  delete [] hist;
+
+  return(0);
+}
diff --git a/sim4dbutils/realignPolishes.C b/sim4dbutils/realignPolishes.C
new file mode 100644
index 0000000..63e2a9d
--- /dev/null
+++ b/sim4dbutils/realignPolishes.C
@@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include "bio++.H"
+#include "seqCache.H"
+#include "sim4.H"
+
+// This code takes basic sim4db format polishes and recomputes the
+// alignments and scores. Required in the input polishes are the EST
+// id, genomic id, exon coordinates and an orientation.
+
+// Recompute the alignments and scores of the polishes read from stdin,
+// optionally merging adjacent exons first, and write the result to
+// stdout.
+//
+//   -merge P   merge exons when the cDNA and genomic gaps between them
+//              are within P percent of each other
+//   -b B       merge exons when the cDNA and genomic gaps between them
+//              differ by less than B bases
+//   -M file    log every merge to 'file'
+//   -e file    the cDNA (EST) sequences
+//   -g file    the genomic sequences
+//   -q         only count merge statistics; nothing is realigned or written
+//   -w         report (on stdout) polishes whose number of matches changed
+//              by more than one percent
+//
+// Options are matched on their first two characters only.  -q changes
+// how -e opens the EST file only when it appears before -e.
+//
+int
+main(int argc, char **argv) {
+
+  //  Load all the sequences.  We really do need all the ESTs in core,
+  //  since they probably aren't in a useful sorted order.  You can
+  //  probably figure out a way to get rid of the seqCache for the
+  //  GEN.  Doing so will reduce memory usage by about 50%.
+
+  seqCache  *EST                = 0L;
+  seqCache  *GEN                = 0L;
+  int        mergeTolerancePerc = 0;
+  int        mergeToleranceBase = 0;
+  int        statsOnly          = 0;
+  int        warnOnChange       = 0;
+
+  //  Statistics on the exon merge
+
+  int        mergedExons        = 0;
+  int        mergedMatches      = 0;
+
+  int        numcdnagaps        = 0;
+  int        nummatcheswithgaps = 0;
+
+  FILE      *mergeLog           = 0L;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-merge", 2) == 0) {
+      mergeTolerancePerc = atoi(argv[++arg]);
+    } else if (strncmp(argv[arg], "-b", 2) == 0) {
+      mergeToleranceBase = atoi(argv[++arg]);
+    } else if (strncmp(argv[arg], "-M", 2) == 0) {
+      mergeLog = fopen(argv[++arg], "w");
+    } else if (strncmp(argv[arg], "-e", 2) == 0) {
+      if (statsOnly) {
+        EST = new seqCache(argv[++arg], 1000, false);  // debugging only!
+      } else {
+        EST = new seqCache(argv[++arg], 0, false);
+        EST->loadAllSequences();
+      }
+    } else if (strncmp(argv[arg], "-g", 2) == 0) {
+      GEN = new seqCache(argv[++arg], 0, false);
+      GEN->loadAllSequences();
+    } else if (strncmp(argv[arg], "-q", 2) == 0) {
+      statsOnly = 1;
+    } else if (strncmp(argv[arg], "-w", 2) == 0) {
+      warnOnChange = 1;
+    }
+    arg++;
+  }
+
+  if ((statsOnly == 0) && (!EST || !GEN)) {
+    fprintf(stderr, "usage: %s [-merge percent-tolerance] [-M merge-log] [-q] -e est.fasta -g genome.fasta < polishes > somewhere\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " Polishes _MUST_ be sorted by genomic index.\n");
+    fprintf(stderr, " If not, performance will be worse than atrocious.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " percent-tolerance -- merge exons separated by gap if\n");
+    fprintf(stderr, " the cDNA and genomic gaps differ by less than p percent.\n");
+    fprintf(stderr, " A value of 5 means 5%%\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -q: Don't actually do the work, just count the statistics\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "\n");
+    exit(1);
+  }
+
+
+  //  Scratch buffers holding the pieces of sequence being aligned.
+  char  *s1 = new char [16 * 1024 * 1024];
+  char  *s2 = new char [16 * 1024 * 1024];
+  int    l1 = 0;
+  int    l2 = 0;
+
+  speedCounter  *C = new speedCounter("%12.0f polishes -- %12.0f polishes/second\r",
+                                      1.0, 0xff, true);
+
+  sim4polishWriter  *W = new sim4polishWriter("-", sim4polishS4DB);
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish        *p = 0L;
+
+  while (R->nextAlignment(p)) {
+
+    //fprintf(stdout, "BEFORE\n");
+    //p->s4p_printPolish(stdout);
+
+    //  If we have a mergeTolerance, merge adjacent exons that are
+    //  separated by approximately equal sized cDNA and genomic gaps.
+    //
+    //  Possibly a better way to do this is to check if the identity
+    //  of the missing region is decent, too.
+
+    //  Remember the id/cv of this guy for the log
+    //
+    double id = 0.0;
+    double cv = 0.0;
+    if (mergeLog) {
+      id = p->s4p_percentIdentityExact();
+      cv = p->s4p_percentCoverageExact();
+    }
+
+    int merged = 0;
+    int gapped = 0;
+
+    if ((mergeTolerancePerc > 0) || (mergeToleranceBase > 0)) {
+      for (uint32 i=1; i<p->_numExons; i++) {
+        int cgap = p->_exons[i]._estFrom - p->_exons[i-1]._estTo;
+        int ggap = p->_exons[i]._genFrom - p->_exons[i-1]._genTo;
+
+        bool mergeGap = false;
+
+        //  New method -- check if the gaps are within
+        //  mergeToleranceBase bases of each other
+        //
+        int diff = cgap - ggap;
+        if (diff < 0)
+          diff = -diff;
+
+        if (diff < mergeToleranceBase)
+          mergeGap = true;
+
+
+        //  Original method -- check if the gaps are within
+        //  mergeTolerancePerc percent of each other.  Everything is
+        //  scaled by 100 so the comparison stays in integers.
+        //
+        int ctol = cgap * (100 + mergeTolerancePerc);
+        int gtol = ggap * (100 + mergeTolerancePerc);
+
+        cgap *= 100;
+        ggap *= 100;
+
+        if (((cgap < ggap) && (ctol > ggap)) ||
+            ((ggap < cgap) && (gtol > cgap)))
+          mergeGap = true;
+
+        //  NOTE(review): cgap has already been multiplied by 100 here,
+        //  so the tests below are true for any positive gap, including
+        //  the difference of 1 between directly adjacent exons --
+        //  confirm that is intended.
+        //
+        if (cgap > 1) {
+          numcdnagaps++;
+          gapped++;
+        }
+
+        if ((cgap > 1) && (mergeGap)) {
+
+          //  Merge i and i-1 if adding in the tolerance makes either
+          //  the cgap or the ggap longer than the other gap.  i.e., the
+          //  cgap was shorter, but including the tolerance makes it
+          //  longer, so they're about the same size.
+
+          if (mergeLog)
+            fprintf(mergeLog,
+                    "MERGE: "uint32FMTW(4)"-"uint32FMTW(4)" (%6.2f,%6.2f) "uint32FMTW(4)"-"uint32FMTW(4)
+                    " and "uint32FMTW(8)"-"uint32FMTW(8)" (%6.2f,%6.2f) "uint32FMTW(8)"-"uint32FMTW(8)"\n",
+                    p->_exons[i-1]._estFrom, p->_exons[i-1]._estTo,
+                    cgap / 100.0, ctol / 100.0,
+                    p->_exons[i]._estFrom, p->_exons[i]._estTo,
+                    p->_exons[i-1]._genFrom, p->_exons[i-1]._genTo,
+                    ggap / 100.0, gtol / 100.0,
+                    p->_exons[i]._genFrom, p->_exons[i]._genTo);
+
+          //  merge exons
+          p->_exons[i-1]._estTo = p->_exons[i]._estTo;
+          p->_exons[i-1]._genTo = p->_exons[i]._genTo;
+
+          //  delete this exon
+          p->s4p_deleteExon(i);
+
+          //  Do it again!  The exon that slid into slot i must be
+          //  compared against the just-extended exon i-1.
+          i--;
+
+          merged++;
+          mergedExons++;
+        }
+      }
+
+      if (merged)
+        mergedMatches++;
+      if (gapped)
+        nummatcheswithgaps++;
+    }
+
+
+    //  For each exon, generate an alignment
+
+
+    if (statsOnly == 0) {
+      p->_estLen   = EST->getSequenceInCore(p->_estID)->sequenceLength();
+      p->_estPolyA = 0;
+      p->_estPolyT = 0;
+
+      for (uint32 i=0; i<p->_numExons; i++) {
+        l1 = p->_exons[i]._estTo - p->_exons[i]._estFrom + 1;
+        l2 = p->_exons[i]._genTo - p->_exons[i]._genFrom + 1;
+
+        strncpy(s1, EST->getSequenceInCore(p->_estID)->sequence() + p->_exons[i]._estFrom - 1, l1);
+        strncpy(s2, GEN->getSequenceInCore(p->_genID)->sequence() + p->_exons[i]._genFrom - 1, l2);
+
+        //  For complement matches the cDNA piece is taken from the
+        //  other end of the sequence and reverse-complemented.
+        if (p->_matchOrientation == SIM4_MATCH_COMPLEMENT) {
+          strncpy(s1, EST->getSequenceInCore(p->_estID)->sequence() + p->_estLen - p->_exons[i]._estTo, l1);
+          reverseComplementSequence(s1, l1);
+        }
+
+        //  strncpy() does not terminate when the source is longer
+        //  than the count, so terminate explicitly.
+        s1[l1] = 0;
+        s2[l2] = 0;
+
+        delete [] p->_exons[i]._estAlignment;
+        delete [] p->_exons[i]._genAlignment;
+
+        p->_exons[i]._estAlignment = new char [l1+l2+1];
+        p->_exons[i]._genAlignment = new char [l1+l2+1];
+
+        halign(s1, s2,
+               l1, l2,
+               p->_exons[i]._estAlignment,
+               p->_exons[i]._genAlignment);
+      }
+
+      //  There isn't an intron after the last exon.  Force it.
+      //
+      p->_exons[p->_numExons-1]._intronOrientation = SIM4_INTRON_NONE;
+
+      //  Check that we didn't radically change things
+      uint32 nm = p->_numMatches;
+
+      p->s4p_updateAlignmentScores();
+
+      W->writeAlignment(p);
+
+      if (warnOnChange) {
+        uint32 diff = 0;
+        if (nm < p->_numMatches)  diff = p->_numMatches - nm;
+        if (nm > p->_numMatches)  diff = nm - p->_numMatches;
+
+        if (diff > p->_numMatches / 100)
+          fprintf(stdout, "WARNING: CHANGED! "uint32FMT" -> "uint32FMT"\n", nm, p->_numMatches);
+      }
+    }
+
+    //  Merging can be enabled without a merge log (-merge or -b without
+    //  -M); only write the summary line when the log is actually open.
+    //
+    if (merged && mergeLog) {
+      fprintf(mergeLog, "MERGED\tEST\t"uint32FMT"\tfrom\t%8.3f\t%8.3f\tto\t%8.3f\t%8.3f\n",
+              p->_estID, id, cv, p->s4p_percentIdentityExact(), p->s4p_percentCoverageExact());
+    }
+
+    C->tick();
+  }
+
+  if ((mergeTolerancePerc > 0) || (mergeToleranceBase > 0)) {
+    fprintf(stderr, "FOUND: %d gaps in %d matches.\n", numcdnagaps, nummatcheswithgaps);
+    fprintf(stderr, "MERGED: %d gaps in %d matches.\n", mergedExons, mergedMatches);
+  }
+
+  if (mergeLog)
+    fclose(mergeLog);
+
+  delete R;
+  delete W;
+
+  delete [] s1;
+  delete [] s2;
+
+  delete GEN;
+  delete EST;
+
+  return(0);
+}
diff --git a/sim4dbutils/removeDuplicate.C b/sim4dbutils/removeDuplicate.C
new file mode 100644
index 0000000..1f4e797
--- /dev/null
+++ b/sim4dbutils/removeDuplicate.C
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "sim4.H"
+
+// Input matches should be sorted by cDNA, and run through pickBest.
+// This code will remove all matches that have the same genomic span,
+// and warn when two matches have nearly the same genomic span.
+
+sim4polishWriter *W = 0L;
+
+// Filter one batch of polishes, then write and free the survivors.
+//
+// Every pair with the same number of exons and the same genomic
+// sequence is compared by the genomic start of the first exon and the
+// genomic end of the last exon.  When both agree exactly, the later
+// polish of the pair is deleted.  When both are within ten bases, both
+// are kept and a warning showing the pair is printed to stderr.
+//
+// Every polish still present afterwards is written with the global
+// writer W and deleted; the entries of p are dangling on return.
+//
+void
+pickBest(sim4polish **p, int pNum) {
+
+  for (int first=0; first<pNum; first++) {
+    for (int second=first+1; second<pNum; second++) {
+
+      //  Skip slots already removed, and pairs that cannot be duplicates.
+
+      if ((p[first] == 0L) || (p[second] == 0L))
+        continue;
+
+      if (p[first]->_numExons != p[second]->_numExons)
+        continue;
+
+      if (p[first]->_genID != p[second]->_genID)
+        continue;
+
+      //  Absolute difference of the genomic begin positions.
+
+      int begFirst  = p[first]->_exons[0]._genFrom;
+      int begSecond = p[second]->_exons[0]._genFrom;
+      int begDelta  = (begFirst < begSecond) ? (begSecond - begFirst) : (begFirst - begSecond);
+
+      //  Absolute difference of the genomic end positions.
+
+      int endFirst  = p[first]->_exons[p[first]->_numExons-1]._genTo;
+      int endSecond = p[second]->_exons[p[second]->_numExons-1]._genTo;
+      int endDelta  = (endFirst < endSecond) ? (endSecond - endFirst) : (endFirst - endSecond);
+
+      if ((begDelta == 0) && (endDelta == 0)) {
+        //fprintf(stderr, "%d and %d are exact; %d removed.\n", i, j, j);
+        delete p[second];
+        p[second] = 0L;
+        continue;
+      }
+
+      if ((begDelta < 10) && (endDelta < 10)) {
+        char *textFirst  = p[first]->s4p_polishToString(sim4polishS4DB);
+        char *textSecond = p[second]->s4p_polishToString(sim4polishS4DB);
+
+        fprintf(stderr, "----------------------------------------\n");
+        fprintf(stderr, "Warning: %d and %d are similar.\n", first, second);
+        fprintf(stderr, "%s\n", textFirst);
+        fprintf(stderr, "%s\n", textSecond);
+        fprintf(stderr, "----------------------------------------\n");
+
+        delete [] textFirst;
+        delete [] textSecond;
+      }
+    }
+  }
+
+  //  Emit and release whatever survived.
+
+  for (int k=0; k<pNum; k++) {
+    if (p[k] == 0L)
+      continue;
+
+    W->writeAlignment(p[k]);
+    delete p[k];
+  }
+}
+
+// Read polishes from stdin and, for each run of consecutive polishes
+// with the same cDNA id, hand the run to pickBest(), which removes
+// exact duplicates and writes the rest to stdout.
+//
+//   -gff3   write the output as GFF3
+//
+int
+main(int argc, char **argv) {
+  uint32   pNum   = 0;
+  uint32   pAlloc = 8388608;
+  uint32   estID  = ~uint32ZERO;
+
+  sim4polishStyle  style = sim4polishStyleDefault;
+
+  int arg = 1;
+
+  while (arg < argc) {
+    //  Test the argument being visited, not always the first one.
+    if (strcmp(argv[arg], "-gff3") == 0)
+      style = sim4polishGFF3;
+    else
+      fprintf(stderr, "usage: %s [-gff3] < file > file\n", argv[0]);
+
+    arg++;
+  }
+
+
+  if (isatty(fileno(stdin))) {
+    fprintf(stderr, "usage: %s [-gff3] < file > file\n", argv[0]);
+    fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+    exit(1);
+  }
+
+  //  Read polishes, picking the best when we see a change in
+  //  the estID.
+
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish       **p = new sim4polish * [pAlloc];
+  sim4polish        *q = 0L;
+
+  W = new sim4polishWriter("-", style);
+
+  if (R->getsim4polishStyle() != style)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+  while (R->nextAlignment(q)) {
+    if ((q->_estID != estID) && (pNum > 0)) {
+      pickBest(p, pNum);
+      pNum = 0;
+    }
+
+    //  Out of slots: double the array, carrying the saved polishes
+    //  over from the old array into the new one.
+    if (pNum >= pAlloc) {
+      sim4polish **P = new sim4polish * [pAlloc * 2];
+      memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+      delete [] p;
+      p       = P;
+      pAlloc *= 2;
+    }
+
+    p[pNum++] = q;
+    estID     = q->_estID;
+
+    q = 0L;  //  Else we will delete the polish we just saved!
+  }
+
+  if (pNum > 0)
+    pickBest(p, pNum);
+
+  delete [] p;
+
+  delete R;
+  delete W;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/removeRedundant.C b/sim4dbutils/removeRedundant.C
new file mode 100644
index 0000000..2085007
--- /dev/null
+++ b/sim4dbutils/removeRedundant.C
@@ -0,0 +1,265 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "bio++.H"
+#include "sim4.H"
+#include "s4p_overlap.H"
+
+// Remove redundant polishes from an input set.
+//
+// Redundancy is defined as two polishes that overlap on the genome.
+// Any amount of overlap is redundant.
+//
+// The longest of the overlapping matches is saved.
+
+//#define DEBUGOUT
+
+// Remove redundant polishes from the file named by the last argument
+// and write the survivors to stdout.
+//
+// For every cDNA id, polishes that overlap nothing are written as is.
+// The rest are split into connected components of the overlap graph,
+// and from each component only the polish covering the most genomic
+// bases (summed over its exons) is written.  Components of more than
+// two polishes that are not complete cliques are counted and reported
+// on stderr.
+//
+//   -gff3   write the output as GFF3
+//
+int
+main(int argc, char **argv) {
+
+  if (argc < 2) {
+    fprintf(stderr, "usage: %s [-gff3] <polishes-file>\n", argv[0]);
+    fprintf(stderr, "(yes, you _must_ give it a file. stdin is not possible.)\n");
+    fprintf(stderr, "WARNING THIS IS PROTOTYPE BROKEN CODE!\n");
+    exit(1);
+  }
+
+  sim4polishStyle  wstyle = sim4polishStyleDefault;
+  sim4polishStyle  rstyle = sim4polishStyleDefault;
+
+  int arg = 1;
+  while (arg < argc) {
+    if (strcmp(argv[arg], "-gff3") == 0) {
+      wstyle = sim4polishGFF3;
+    }
+
+    arg++;
+  }
+
+
+  uint32  matchesWithNoOverlap = 0;
+  uint32  matchesWithOverlap   = 0;
+  uint32  notPerfectClique     = 0;
+
+  //  Open a polishFile and force the index to build
+  //  First find the input file type, with a hack
+
+  sim4polishReader *reader = new sim4polishReader(argv[argc-1]);
+  rstyle = reader->getsim4polishStyle();
+  delete reader;
+
+  sim4polishFile *Afile = new sim4polishFile(argv[argc-1], rstyle);
+  Afile->setPosition(0);
+
+  sim4polishWriter *writer = new sim4polishWriter("-", wstyle);
+
+  if (rstyle != wstyle)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+
+  //  Ask for the largest EST iid seen, then iterate over those.
+  //
+  //  NOTE(review): the loop stops before largestIID itself; confirm
+  //  whether maxIID() returns the largest id or one past it.
+  //
+  uint32  largestIID = Afile->maxIID();
+
+  for (uint32 iid=0; iid<largestIID; iid++) {
+    sim4polishList *A = Afile->getEST(iid);
+
+    if (A->length() > 0) {
+
+      //  fill out the overlap matrix -- one contiguous allocation,
+      //  with overlap[i] pointing at the start of row i
+
+      olap_t  **overlap = new olap_t* [A->length()];
+      overlap[0] = new olap_t [A->length() * A->length()];
+      for (uint32 i=1; i<A->length(); i++)
+        overlap[i] = overlap[i-1] + A->length();
+
+      for (uint32 a=0; a<A->length(); a++)
+        for (uint32 b=0; b<A->length(); b++)
+          if (a == b)
+            overlap[a][b] = 0;
+          else
+            overlap[a][b] = findOverlap((*A)[a], (*A)[b]);
+
+      //  look for guys with no overlaps, print and remove them
+
+      sim4polishList *W = new sim4polishList;
+
+      for (uint32 a=0; a<A->length(); a++) {
+        bool nooverlaps = true;
+
+        for (uint32 b=0; b<A->length(); b++)
+          if (overlap[a][b])
+            nooverlaps = false;
+
+        if (nooverlaps) {
+          matchesWithNoOverlap++;
+
+          writer->writeAlignment((*A)[a]);
+        } else {
+          matchesWithOverlap++;
+          W->push(new sim4polish((*A)[a]));
+        }
+      }
+
+
+#if 1
+      fprintf(stderr, "IID="uint32FMTW(8)" -- overlap:"uint32FMT" noOverlap:"uint32FMT"\r",
+              iid, matchesWithOverlap, matchesWithNoOverlap);
+      fflush(stderr);
+#endif
+
+
+      //  A is junk, W contains the matches that overlap.
+
+      delete A;
+      A = 0L;
+
+
+      //  Report all the overlaps
+
+#ifdef DEBUGOUT
+      for (uint32 a=0; a<W->length(); a++) {
+        sim4polish *p = (*W)[a];
+        fprintf(stderr, uint32FMTW(3)": "uint32FMTW(3)"--"uint32FMTW(3)"\n",
+                iid, p->_exons[0]._genFrom, p->_exons[p->_numExons-1]._genTo);
+      }
+#endif
+
+
+
+      //  while we have matches in the set of overlapping matches,
+      //  find a connected component, check that it is/is not a
+      //  clique, and decide which match to keep.
+
+      uint32  *clique     = new uint32 [W->length()];
+      uint32   cliqueSize = 0;
+      bool     inserted   = false;
+      uint32  *length     = new uint32 [W->length()];
+
+      while (W->length() > 0) {
+
+#ifdef DEBUGOUT
+        fprintf(stderr, "IID="uint32FMTW(8)" -- examine "uint32FMT" matches\n",
+                iid, W->length());
+#endif
+
+        //  Find the length of all the matches in this set
+
+        for (uint32 a=0; a<W->length(); a++) {
+          length[a] = 0;
+          for (uint32 i=0; i<(*W)[a]->_numExons; i++)
+            length[a] += (*W)[a]->_exons[i]._genTo - (*W)[a]->_exons[i]._genFrom + 1;
+        }
+
+        //  reconstruct the overlap matrix -- hey, if you want to be
+        //  efficient and recover this from the existing one, nobody is
+        //  stopping you.  (W never has more entries than A had, so
+        //  the matrix allocated above is large enough.)
+
+        for (uint32 a=0; a<W->length(); a++)
+          for (uint32 b=0; b<W->length(); b++)
+            if (a == b)
+              overlap[a][b] = 0;
+            else
+              overlap[a][b] = findOverlap((*W)[a], (*W)[b]);
+
+        //  OK, now find the clique/connected component, seeded with
+        //  the first match and grown until nothing more can be added
+
+        for (uint32 i=0; i<W->length(); i++)
+          clique[i] = 0;
+
+        clique[0]  = 1;
+        cliqueSize = 1;
+        inserted   = true;
+
+        while (inserted) {
+          inserted = false;
+
+          //  If a is in the clique, add all its overlaps
+
+          for (uint32 a=0; a<W->length(); a++) {
+            if (clique[a]) {
+              for (uint32 b=0; b<W->length(); b++) {
+                if ((overlap[a][b]) && (!clique[b])) {
+                  clique[b] = 1;
+                  cliqueSize++;
+                  inserted = true;
+                }
+              }
+            }
+          }
+        }
+
+#ifdef DEBUGOUT
+        fprintf(stderr, "IID="uint32FMTW(8)" -- examine "uint32FMT" matches, found "uint32FMT" overlapping\n",
+                iid, W->length(), cliqueSize);
+#endif
+
+        //  Check that it is a clique: every ordered pair of distinct
+        //  members must overlap, cliqueSize * (cliqueSize-1) in all.
+
+        if (cliqueSize > 2) {
+
+          uint32 num = 0;
+
+          for (uint32 a=0; a<W->length(); a++)
+            for (uint32 b=0; b<W->length(); b++)
+              if (clique[a] && clique[b] && overlap[a][b])
+                num++;
+
+          if (num != cliqueSize * (cliqueSize-1)) {
+            notPerfectClique++;
+
+            fprintf(stderr, "\nNOT A PERFECT CLIQUE! Found "uint32FMT" overlaps, wanted "uint32FMT" in the clique.\n",
+                    num, cliqueSize * (cliqueSize-1));
+
+            //for (uint32 a=0; a<W->length(); a++)
+            //  if (clique[a])
+            //    writer->writeAlignment((*W)[a]);
+          }
+
+        }
+
+        //  Find the longest member, output it
+
+        uint32 longest = 0;
+        while (clique[longest] == 0)
+          longest++;
+
+        for (uint32 i=0; i<W->length(); i++)
+          if ((clique[i]) && (length[longest] < length[i]))
+            longest = i;
+
+        writer->writeAlignment((*W)[longest]);
+
+        //  Remove the clique from the set of overlaps
+
+        A = new sim4polishList;
+        for (uint32 i=0; i<W->length(); i++) {
+          if (clique[i] == 0)
+            A->push(new sim4polish((*W)[i]));
+        }
+
+        delete W;
+        W = A;
+        A = 0L;
+      }
+
+      delete [] clique;
+      delete [] length;   //  Was leaked once per EST
+      delete    W;
+
+      delete [] overlap[0];
+      delete [] overlap;
+    }
+
+    //  A is null here if the EST had matches, and the empty list otherwise.
+    delete A;
+  }
+
+  delete writer;
+  delete Afile;
+
+  fprintf(stderr, "\nmatches withOvl:"uint32FMT" withoutOvl:"uint32FMT"\n",
+          matchesWithOverlap, matchesWithNoOverlap);
+  fprintf(stderr, "not perfect clique:"uint32FMT"\n", notPerfectClique);
+
+  return(0);
+}
+
+
diff --git a/sim4dbutils/reportAlignmentDifferences.C b/sim4dbutils/reportAlignmentDifferences.C
new file mode 100644
index 0000000..946ee3c
--- /dev/null
+++ b/sim4dbutils/reportAlignmentDifferences.C
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4.H"
+#include "util++.H"
+
+// Tally, for every position of the query sequences, how many aligned
+// columns are identities, deletions, insertions and mismatches, write
+// the table to outputPrefix.dat and a gnuplot script to
+// outputPrefix.gnuplot, then run gnuplot on the script to draw
+// outputPrefix.png.
+//
+//   -i file     the sim4db polishes to read; they must carry alignments
+//   -o prefix   prefix of the three output files
+//
+// Positions are limited to the first 10240 bases of a query; the
+// program asserts if an alignment reaches past that.
+//
+int
+main(int argc, char **argv) {
+  char  *outPrefix = 0L;
+  char   datName[FILENAME_MAX];
+  char   gnuName[FILENAME_MAX];
+  char   pngName[FILENAME_MAX];
+  char   gnuCmd[FILENAME_MAX];
+  char  *inName = 0L;
+
+  int arg = 1;
+  int err = 0;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-o", 2) == 0) {
+      outPrefix = argv[++arg];
+
+    } else if (strncmp(argv[arg], "-i", 2) == 0) {
+      inName = argv[++arg];
+
+    } else {
+      fprintf(stderr, "Unknown arg '%s'\n", argv[arg]);
+      err++;
+    }
+    arg++;
+  }
+  if ((inName == 0L) || (outPrefix == 0L) || (err != 0)) {
+    fprintf(stderr, "usage: %s -i sim4db -o outputPrefix\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " Creates outputPrefix.dat containing the number of errors at each\n");
+    fprintf(stderr, " base position, and outputPrefix.png the visual representation.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " Suggested usage:\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " snapper2\n");
+    fprintf(stderr, " -queries Q.fasta\n");
+    fprintf(stderr, " -genomic G.fasta\n");
+    fprintf(stderr, " -positions G.posDB\n");
+    fprintf(stderr, " -aligns\n");
+    fprintf(stderr, " -minmatchidentity 94\n");
+    fprintf(stderr, " -minmatchcoverage 90\n");
+    fprintf(stderr, " -mersize 18\n");
+    fprintf(stderr, " -ignore 500\n");
+    fprintf(stderr, " -numthreads 16\n");
+    fprintf(stderr, " -verbose\n");
+    fprintf(stderr, " -output Q.sim4db\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " pickBestPolish < Q.sim4db > Q.best.sim4db\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " reportAlignmentDifferences\n");
+    fprintf(stderr, " -i Q.best.sim4db\n");
+    fprintf(stderr, " -o Q\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "\n");
+    exit(1);
+  }
+
+  fprintf(stderr, "Reading input from '%s'\n", inName);
+  fprintf(stderr, "Writing output to '%s'\n", outPrefix);
+
+  //  Open output files early, in case they fail.  The names are
+  //  built with a bounded print so a long prefix cannot overrun the
+  //  fixed-size buffers.
+
+  errno = 0;
+
+  snprintf(datName, FILENAME_MAX, "%s.dat",     outPrefix);
+  snprintf(gnuName, FILENAME_MAX, "%s.gnuplot", outPrefix);
+  snprintf(pngName, FILENAME_MAX, "%s.png",     outPrefix);
+
+  //  Failure is detected from the returned pointer; errno is only
+  //  used to explain it.
+
+  FILE *DAT = fopen(datName, "w");
+  if (DAT == 0L)
+    fprintf(stderr, "Failed to open '%s' for writing data: %s\n", datName, strerror(errno)), exit(1);
+
+  FILE *GNU = fopen(gnuName, "w");
+  if (GNU == 0L)
+    fprintf(stderr, "Failed to open '%s' for writing gnuplot command: %s\n", gnuName, strerror(errno)), exit(1);
+
+  //  Read matches.
+
+  uint32   lMax = 10240;   //  Number of query positions we can tally
+  uint32   lLen = 0;       //  Largest query position actually seen
+
+  uint32  *nTot = new uint32 [lMax];
+  uint32  *nIde = new uint32 [lMax];
+  uint32  *nMis = new uint32 [lMax];
+  uint32  *nIns = new uint32 [lMax];
+  uint32  *nDel = new uint32 [lMax];
+
+  memset(nTot, 0, sizeof(uint32) * lMax);
+  memset(nIde, 0, sizeof(uint32) * lMax);
+  memset(nMis, 0, sizeof(uint32) * lMax);
+  memset(nIns, 0, sizeof(uint32) * lMax);
+  memset(nDel, 0, sizeof(uint32) * lMax);
+
+  sim4polishReader  *R = new sim4polishReader(inName);
+  sim4polish        *p = 0L;
+
+  while (R->nextAlignment(p)) {
+    bool fwd = (p->_matchOrientation == SIM4_MATCH_FORWARD);
+
+    for (uint32 exon=0; exon<p->_numExons; exon++) {
+      sim4polishExon *e = p->_exons + exon;
+
+      //  Fail if there are no alignments.
+
+      if ((e->_estAlignment == 0L) ||
+          (e->_genAlignment == 0L))
+        fprintf(stderr, "FAIL: Input has no alignment strings (-aligns option in snapper2).\n"), exit(1);
+
+      //  Walk the alignment one column at a time.
+
+      uint32 aPos = 0;                 //  Position in the alignment
+      uint32 qPos = e->_estFrom - 1;   //  Actual position in the query sequence
+      uint32 gPos = e->_genFrom - 1;   //  Actual position in the genome sequence
+
+      if (fwd == false)
+        qPos = p->_estLen - e->_estFrom + 1;
+
+      //  Stop at the end of either string, so the terminating NUL is
+      //  never tallied as a column.
+
+      while ((e->_estAlignment[aPos] != 0) &&
+             (e->_genAlignment[aPos] != 0)) {
+
+        //  Check the position before it is used as an index.
+        assert(qPos < lMax);
+
+        if      (e->_estAlignment[aPos] == e->_genAlignment[aPos])
+          nIde[qPos]++;
+
+        else if (e->_estAlignment[aPos] == '-')
+          nDel[qPos]++;
+
+        else if (e->_genAlignment[aPos] == '-')
+          nIns[qPos]++;
+
+        else
+          nMis[qPos]++;
+
+        nTot[qPos]++;
+
+        if (lLen < qPos)
+          lLen = qPos;
+
+        //fprintf(stdout, "%s "uint32FMT" %c ->_ %s "uint32FMT" %c\n",
+        //        p->_estDefLine, qPos, e->_estAlignment[aPos],
+        //        p->_genDefLine, gPos, e->_genAlignment[aPos]);
+
+        //  A gap consumes no base of the sequence it appears in.
+
+        if (e->_estAlignment[aPos] != '-') {
+          if (fwd)
+            qPos++;
+          else
+            qPos--;
+        }
+
+        if (e->_genAlignment[aPos] != '-')
+          gPos++;
+
+        aPos++;
+      }
+    }
+  }
+
+
+  //  Index
+  //  nTot
+  //  nIde, fraction
+  //  nDel, fraction
+  //  nIns, fraction
+  //  nMis, fraction
+  //  nErr (nTot - nIde), fraction
+  //
+  //  Positions never covered have nTot == 0; their fractions are the
+  //  result of a floating point division by zero.
+
+  fprintf(DAT, "#idx\tnTot\tnIde\tfrac\tnDel\tfrac\tnIns\tfrac\tnMis\tfrac\tnErr\tfrac\n");
+  for (uint32 i=0; i<=lLen; i++)
+    fprintf(DAT, "%u\t%u\t%u\t%6.4f\t%u\t%6.4f\t%u\t%6.4f\t%u\t%6.4f\t%u\t%6.4f\n",
+            i,
+            nTot[i],
+            nIde[i], (double)nIde[i] / nTot[i],
+            nDel[i], (double)nDel[i] / nTot[i],
+            nIns[i], (double)nIns[i] / nTot[i],
+            nMis[i], (double)nMis[i] / nTot[i],
+            nTot[i] - nIde[i], (double)(nTot[i] - nIde[i]) / nTot[i]);
+
+  fprintf(GNU, "set terminal png\n");
+  fprintf(GNU, "set output \"%s\"\n", pngName);
+  fprintf(GNU, "set title \"Fraction error per base for '%s'\"\n", inName);
+  fprintf(GNU, "set xlabel \"Base position\"\n");
+  fprintf(GNU, "set ylabel \"Fraction error\"\n");
+  fprintf(GNU, "plot [][0:0.04] \\\n");
+  fprintf(GNU, " \"%s\" using 1:4 with lines title \"nTot\", \\\n", datName);
+  fprintf(GNU, " \"%s\" using 1:6 with lines title \"nDel\", \\\n", datName);
+  fprintf(GNU, " \"%s\" using 1:8 with lines title \"nIns\", \\\n", datName);
+  fprintf(GNU, " \"%s\" using 1:10 with lines title \"nMis\", \\\n", datName);
+  fprintf(GNU, " \"%s\" using 1:12 with lines title \"nErr\"\n", datName);
+
+  fclose(DAT);
+  fclose(GNU);
+
+  //  NOTE(review): the script name goes to the shell unquoted, so a
+  //  prefix containing spaces or shell metacharacters will misbehave.
+
+  snprintf(gnuCmd, FILENAME_MAX, "gnuplot < %s", gnuName);
+  system(gnuCmd);
+
+  return(0);
+}
+
diff --git a/sim4dbutils/s4p_overlap.C b/sim4dbutils/s4p_overlap.C
new file mode 100644
index 0000000..cc84c1d
--- /dev/null
+++ b/sim4dbutils/s4p_overlap.C
@@ -0,0 +1,41 @@
+#include "util++.H"
+#include "sim4.H"
+
+// Return the number of genomic bases covered by both polishes.
+//
+// Every exon of A and of B is added to one interval list while the
+// exon lengths are summed; after merging overlapping intervals, the
+// amount by which that sum exceeds the merged length is the overlap.
+// Polishes on different genomic sequences, or in different match
+// orientations, are defined not to overlap at all.
+//
+// NOTE(review): s4p_overlap.H declares this function as returning
+// olap_t, while this definition returns uint32 and does not include
+// that header directly -- confirm the two agree and that
+// OLAP_IS_SHORT is actually visible here.
+//
+uint32
+findOverlap(sim4polish *A, sim4polish *B) {
+
+  if ((A->_genID != B->_genID) || (A->_matchOrientation != B->_matchOrientation))
+    return(0);
+
+  uint32                length = 0;
+  uint32                total  = 0;
+  intervalList<uint64>  IL;
+
+  for (uint32 i=0; i<A->_numExons; i++) {
+    length  = A->_exons[i]._genTo - A->_exons[i]._genFrom + 1;
+    total  += length;
+    IL.add(A->_exons[i]._genFrom, length);
+  }
+
+  for (uint32 i=0; i<B->_numExons; i++) {
+    length  = B->_exons[i]._genTo - B->_exons[i]._genFrom + 1;
+    total  += length;
+    IL.add(B->_exons[i]._genFrom, length);
+  }
+
+  IL.merge();
+
+  uint32 overlap = total - IL.sumOfLengths();
+
+#ifdef OLAP_IS_SHORT
+  //  The largest value a 16-bit integer holds is 65535; an overlap of
+  //  exactly 65536 would already wrap around to zero.
+  if (overlap > 65535) {
+    fprintf(stderr, "findOverlap()-- ERROR! The overlap is larger than the return type!\n");
+    fprintf(stderr, "findOverlap()-- Switch to 32-bit ints in s4p_overlap.H.\n");
+  }
+#endif
+
+  return(overlap);
+}
diff --git a/sim4dbutils/s4p_overlap.H b/sim4dbutils/s4p_overlap.H
new file mode 100644
index 0000000..be15e1f
--- /dev/null
+++ b/sim4dbutils/s4p_overlap.H
@@ -0,0 +1,20 @@
+#ifndef S4P_OVERLAP_H
+#define S4P_OVERLAP_H
+
+// Using 16-bit ints for storing the amount overlapped gives a big
+// memory reduction, but will fail for long sequences (mRNA,
+// probably). findOverlap() checks for overflow.
+//
+// Remove the define below to store overlaps in 32-bit integers.
+//
+#define OLAP_IS_SHORT
+
+// olap_t is the type used to store one overlap amount, and OLAPTFMT is
+// the matching format macro.
+//
+#ifdef OLAP_IS_SHORT
+typedef uint16 olap_t;
+#define OLAPTFMT uint16FMT
+#else
+typedef uint32 olap_t;
+#define OLAPTFMT uint32FMT
+#endif
+
+// Returns the amount by which polishes A and B overlap.
+//
+olap_t findOverlap(sim4polish *A, sim4polish *B);
+
+#endif // S4P_OVERLAP_H
diff --git a/sim4dbutils/sortPolishes.C b/sim4dbutils/sortPolishes.C
new file mode 100644
index 0000000..e139919
--- /dev/null
+++ b/sim4dbutils/sortPolishes.C
@@ -0,0 +1,368 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "sim4.H"
+#include "bio.h"
+#include "util.h"
+
+// Sorts a file of polishes by cDNA or genomic idx.
+
+// Sort the first pLen polishes of p in place with fcn, write them to a
+// fresh sim4polishWriter created without a file name, and return a
+// reader constructed from that writer.  The writer is deleted before
+// returning; the caller owns the reader.
+//
+// filePrefix is accepted but not used here.
+//
+sim4polishReader *
+writeTemporary(char *filePrefix, sim4polish **p, uint32 pLen, sim4polishStyle style, int (*fcn)(const void *, const void *)) {
+  sim4polishWriter  *tmpWriter = new sim4polishWriter(0L, style, true);
+
+  qsort(p, pLen, sizeof(sim4polish *), fcn);
+
+  uint32 next = 0;
+  while (next < pLen) {
+    tmpWriter->writeAlignment(p[next]);
+    next++;
+  }
+
+  //  The reader must be built while the writer still exists.
+  sim4polishReader  *tmpReader = new sim4polishReader(0L, tmpWriter);
+
+  delete tmpWriter;
+
+  return(tmpReader);
+}
+
+// Duplicate the NUL-terminated string src into palloc() storage and
+// add the number of bytes used, terminator included, to *alloc.
+//
+static
+char *
+pallocString(const char *src, uint64 *alloc) {
+  int    bytes = strlen(src) + 1;
+  char  *dst   = (char *)palloc(sizeof(char) * bytes);
+
+  memcpy(dst, src, sizeof(char) * bytes);
+  *alloc += bytes * sizeof(char);
+
+  return(dst);
+}
+
+// Copy polish q into storage obtained from palloc() and return the
+// copy.  The number of bytes obtained is added to *alloc.
+//
+// The polish itself, its exon array and every non-null exon alignment
+// string are duplicated.  The two deflines are duplicated only when
+// both are present; otherwise the copy keeps the pointer values of q.
+//
+sim4polish *
+savePolish(sim4polish *q, uint64 *alloc) {
+
+  //  The polish structure itself, copied bit for bit.
+
+  sim4polish *copy = (sim4polish *)palloc(sizeof(sim4polish));
+
+  memcpy(copy, q, sizeof(sim4polish));
+  *alloc += sizeof(sim4polish);
+
+  //  The deflines.
+
+  if (q->_estDefLine && q->_genDefLine) {
+    copy->_estDefLine = pallocString(q->_estDefLine, alloc);
+    copy->_genDefLine = pallocString(q->_genDefLine, alloc);
+  }
+
+  //  The exon array, again bit for bit.
+
+  copy->_exons = (sim4polishExon *)palloc(sizeof(sim4polishExon) * q->_numExons);
+
+  memcpy(copy->_exons, q->_exons, sizeof(sim4polishExon) * q->_numExons);
+  *alloc += sizeof(sim4polishExon) * q->_numExons;
+
+  //  The alignment strings hanging off each exon.
+
+  for (uint32 ex=0; ex<q->_numExons; ex++) {
+    sim4polishExon *from = q->_exons    + ex;
+    sim4polishExon *to   = copy->_exons + ex;
+
+    if (from->_estAlignment)
+      to->_estAlignment = pallocString(from->_estAlignment, alloc);
+
+    if (from->_genAlignment)
+      to->_genAlignment = pallocString(from->_genAlignment, alloc);
+  }
+
+  return(copy);
+}
+
+
+// Print a one-line progress report to stderr, ending in a carriage
+// return so the next report overwrites it: polishes held, temporary
+// files, megabytes used against the limit, and average bytes per
+// polish.  Nothing is printed when no polishes are held.
+//
+void
+statusReport(uint32 pLen, uint32 mergeFilesLen, uint64 arrayAlloc, uint64 matchAlloc, uint64 upperAlloc) {
+
+  //  Also keeps the average below from dividing by zero.
+  if (pLen == 0)
+    return;
+
+  uint64  usedMB         = (arrayAlloc + matchAlloc) >> 20;
+  uint64  limitMB        = upperAlloc >> 20;
+  uint64  bytesPerPolish = matchAlloc / pLen;
+
+  fprintf(stderr, "Read: "uint32FMTW(8)" polishes -- "uint32FMTW(5)" temporary files -- "uint64FMTW(5)"MB / "uint64FMTW(5)"MB -- "uint64FMTW(5)" bytes/polish\r",
+          pLen,
+          mergeFilesLen,
+          usedMB,
+          limitMB,
+          bytesPerPolish);
+  fflush(stderr);
+}
+
+
+
+
+// The OS limit is usually hit before this, but this is
+// the maximum number of files we can have open at once.
+//
+//#define MERGE_FILES_MAX OPEN_MAX
+
+
+// Sort the polishes read from stdin by the key chosen on the command
+// line and write them to stdout.
+//
+// Polishes are copied into palloc() storage as they are read. When the
+// pointer array is full, or the memory in use reaches the limit, the
+// array is either doubled or, if doubling would exceed the limit, the
+// polishes held so far are sorted and written out as a temporary run
+// and the storage is released. At the end, if any runs (or any files
+// named with -M) exist, the remaining polishes become one more run and
+// all runs are merged; otherwise the polishes are sorted in memory and
+// written directly.
+//
+int
+main(int argc, char **argv) {
+ bool beVerbose = false;
+ char *filePrefix = NULL;
+
+ // Number of polishes held, and the capacity of the pointer array.
+ uint32 pLen = 0;
+ uint32 pMax = 1 * 1024 * 1024;
+
+ uint64 upperAlloc = getProcessSizeLimit(); // Maximum allowed memory usage
+ uint64 arrayAlloc = 0; // Static stuff: the process, arrays
+ uint64 matchAlloc = 0; // palloc size, matches
+
+ // The comparison function selected by -c, -g, -C or -G.
+ int (*fcn)(const void *, const void *) = 0L;
+
+ bool moreInput = true;
+
+ // One reader per sorted run to merge; capacity is the open-file limit.
+ uint32 mergeFilesLen = 0;
+ uint32 mergeFilesMax = sysconf(_SC_OPEN_MAX);
+ sim4polishReader **mergeFiles = new sim4polishReader * [mergeFilesMax];
+ char **mergeNames = new char * [mergeFilesMax];
+
+ sim4polishStyle style = sim4polishStyleDefault;
+
+
+ if ((mergeFiles == 0L) || (mergeNames == 0L)) {
+ fprintf(stderr, "sortPolishes: Failed to initialize.\n");
+ exit(1);
+ }
+ for (uint32 i=0; i<mergeFilesMax; i++) {
+ mergeFiles[i] = NULL;
+ mergeNames[i] = NULL;
+ }
+
+ // Options other than -gff3 are matched on their first two characters only.
+ int arg = 1;
+ int err = 0;
+ while (arg < argc) {
+ if (strncmp(argv[arg], "-v", 2) == 0) {
+ beVerbose = true;
+
+ } else if (strncmp(argv[arg], "-c", 2) == 0) {
+ fcn = s4p_estIDcompare;
+
+ } else if (strncmp(argv[arg], "-g", 2) == 0) {
+ fcn = s4p_genIDcompare;
+
+ } else if (strncmp(argv[arg], "-C", 2) == 0) {
+ fcn = s4p_estDEFcompare;
+
+ } else if (strncmp(argv[arg], "-G", 2) == 0) {
+ fcn = s4p_genDEFcompare;
+
+ } else if (strncmp(argv[arg], "-m", 2) == 0) {
+ // The limit is given in megabytes.
+ arg++;
+ upperAlloc = atoi(argv[arg]);
+ upperAlloc *= 1048576;
+
+ } else if (strncmp(argv[arg], "-t", 2) == 0) {
+ arg++;
+ filePrefix = argv[arg];
+
+ } else if (strcmp(argv[arg], "-gff3") == 0) {
+ style = sim4polishGFF3;
+
+ } else if (strncmp(argv[arg], "-M", 2) == 0) {
+ // Take every following argument that names an existing file as an
+ // already-sorted input to merge.
+ arg++;
+ while ((arg < argc) && (fileExists(argv[arg]))) {
+ if (mergeFilesLen >= mergeFilesMax) {
+ fprintf(stderr, "%s: ERROR! Too many input files! Should be less than %d\n", argv[0], mergeFilesMax);
+ exit(1);
+ }
+ mergeNames[mergeFilesLen] = argv[arg];
+ mergeFiles[mergeFilesLen++] = new sim4polishReader(argv[arg]);
+ arg++;
+ }
+ // Step back one; the arg++ at the bottom of the loop moves on to
+ // the first argument that was not a file.
+ arg--;
+
+ } else {
+ fprintf(stderr, "unknown option: %s\n", argv[arg]);
+ err++;
+ }
+
+ arg++;
+ }
+ if ((err) ||
+ (fcn == 0L) ||
+ ((mergeFilesLen == 0) && (isatty(fileno(stdin))))) {
+ fprintf(stderr, "usage: %s [-c | -g] [-m M] [-t T] [-gff3] [-M [file ...]]\n", argv[0]);
+ fprintf(stderr, " -c (-C) Sort by the cDNA index (defline).\n");
+ fprintf(stderr, " -g (-G) Sort by the genomic index (defline).\n");
+ fprintf(stderr, " -M Skip the sort, just do a merge.\n");
+ fprintf(stderr, " -m M Use at most M MB of core, using a disk-based merge if memory\n");
+ fprintf(stderr, " is exhausted. Default: 4096.\n");
+ fprintf(stderr, " -t T Use directory 'T' for temporary files. Default is the current\n");
+ fprintf(stderr, " working directory. The sort unlinks files immediately after\n");
+ fprintf(stderr, " creation: no files will exist, but space will be used.\n");
+ fprintf(stderr, " -gff3 Format output as GFF3.\n");
+ fprintf(stderr, " -v Be verbose.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " Both sort methods use the OTHER index as a secondary key.\n");
+
+ if (fcn == 0L)
+ fprintf(stderr, "\nERROR: what key do you want to sort on (-c, -g, -C, -G)\n");
+
+ if ((mergeFilesLen == 0) && (isatty(fileno(stdin))))
+ fprintf(stderr, "\nERROR: no files to merge\n");
+
+ exit(1);
+ }
+
+ if (mergeFilesLen > 0)
+ fprintf(stderr, "Found %d files to merge!\n", mergeFilesLen);
+
+
+ // XXX: Experimental method to automagically determine the amount of memory available (or, to at
+ // least, determine if this process can get to be as big as the user said it can.
+ //
+ arrayAlloc = getProcessSizeCurrent();
+
+ sim4polish **p = new sim4polish * [pMax];
+ memset(p, 0, sizeof(sim4polish *) * pMax);
+
+ arrayAlloc += sizeof(sim4polish *) * pMax;
+
+
+ // With small memory sizes, we occasionally run out of data space. This looks like an artifact
+ // of not having palloc() use a blocksize that divides our upperAlloc size. This attempts to
+ // sync them up.
+ //
+ psetblocksize(upperAlloc / 16); // This produced a crash in readBuffer
+ //psetdebug(2);
+
+ // NOTE(review): stdin is read here even when files were given with
+ // -M -- confirm that is intended for a merge-only run.
+ sim4polishReader *R = new sim4polishReader("-");
+ sim4polish *q = 0L;
+
+ if (R->getsim4polishStyle() != style)
+ fprintf(stderr, "warning: input format and output format differ.\n");
+
+ while (R->nextAlignment(q)) {
+
+ // Allocate more pointer space, if we need to
+ //
+ if ((pLen >= pMax) ||
+ (arrayAlloc + matchAlloc >= upperAlloc)) {
+
+ // Either realloc space (if we're still small enough to do so) or
+ // write an intermediate file.
+
+ if (arrayAlloc + matchAlloc + sizeof(sim4polish*) * pMax * 2 < upperAlloc) {
+ sim4polish **P = new sim4polish * [pMax * 2];
+ memcpy(P, p, sizeof(sim4polish *) * pMax);
+ delete [] p;
+ pMax *= 2;
+ p = P;
+ // NOTE(review): this adds the full size of the new array without
+ // subtracting the old one that was just freed, so arrayAlloc
+ // overstates the memory in use -- confirm whether that is deliberate.
+ arrayAlloc += sizeof(sim4polish *) * pMax;
+
+ } else {
+ if (beVerbose) {
+ statusReport(pLen, mergeFilesLen+1, arrayAlloc, matchAlloc, upperAlloc);
+ fprintf(stderr, "\n");
+ }
+
+ if (mergeFilesLen >= mergeFilesMax) {
+ fprintf(stderr, "Too many open files. Try increasing memory size.\n");
+ exit(1);
+ }
+ mergeFiles[mergeFilesLen++] = writeTemporary(filePrefix, p, pLen, style, fcn);
+
+ // The run is saved; release every polish copied so far and start over.
+ pfree();
+ matchAlloc = 0;
+ pLen = 0;
+ }
+ }
+
+ p[pLen++] = savePolish(q, &matchAlloc); // COPY the polish.
+
+ if (beVerbose && ((pLen % 2000) == 0))
+ statusReport(pLen, mergeFilesLen+1, arrayAlloc, matchAlloc, upperAlloc);
+ }
+
+ if (beVerbose) {
+ statusReport(pLen, mergeFilesLen+1, arrayAlloc, matchAlloc, upperAlloc);
+ fprintf(stderr, "\n");
+ }
+
+ sim4polishWriter *W = new sim4polishWriter("-", style);
+
+ if (mergeFilesLen == 0) {
+ // No temporary files. Sort the polishes, and dump.
+ qsort(p, pLen, sizeof(sim4polish *), fcn);
+
+ for (uint32 i=0; i<pLen; i++)
+ W->writeAlignment(p[i]);
+ } else {
+
+ // Crud. Temporary files. Sort the last batch, dump it, then do
+ // a merge.
+ //
+ if (mergeFilesLen >= mergeFilesMax) {
+ fprintf(stderr, "Too many open files. Try increasing memory size.\n");
+ exit(1);
+ }
+ mergeFiles[mergeFilesLen++] = writeTemporary(filePrefix, p, pLen, style, fcn);
+
+ pfree();
+ matchAlloc = 0;
+ pLen = 0;
+
+ delete [] p;
+ }
+
+ //
+ // The merge
+ //
+
+ if (mergeFilesLen > 0) {
+ if (beVerbose)
+ fprintf(stderr, "Merging temporary files....\n");
+
+ // One slot per run, holding the next unwritten polish of that run.
+ // This array hides the outer p, which was deleted above.
+ sim4polish **p = new sim4polish * [mergeFilesLen];
+
+ memset(p, 0, sizeof(sim4polish *) * mergeFilesLen);
+
+ for (uint32 i=0; i<mergeFilesLen; i++)
+ mergeFiles[i]->nextAlignment(p[i]);
+
+ while (moreInput) {
+ uint32 smallestPolish = 0;
+
+ // Find the smallest polish.
+ //
+ // NOTE(review): slots of exhausted runs are compared too; this
+ // presumably relies on the comparison functions ordering a null
+ // polish after every real one -- confirm in sim4.H.
+ //
+ for (uint32 nextPolish = smallestPolish+1; nextPolish < mergeFilesLen; nextPolish++) {
+ if ((*fcn)(p+smallestPolish, p+nextPolish) > 0)
+ smallestPolish = nextPolish;
+ }
+
+ // If the smallestPolish is 0L, we're all done. Otherwise, dump
+ // the current smallest and fill it with a new polish.
+ //
+ if (p[smallestPolish] == 0L) {
+ moreInput = false;
+ } else {
+ W->writeAlignment(p[smallestPolish]);
+ mergeFiles[smallestPolish]->nextAlignment(p[smallestPolish]);
+ }
+ }
+
+ // Attempt cleanup
+ //
+ for (uint32 i=0; i<mergeFilesLen; i++)
+ delete mergeFiles[i];
+
+ delete [] p;
+ }
+
+ delete W;
+
+ delete [] mergeFiles;
+ delete [] mergeNames;
+
+ pfree();
+
+ return(0);
+}
diff --git a/sim4dbutils/summarizePolishes.C b/sim4dbutils/summarizePolishes.C
new file mode 100644
index 0000000..8508fbd
--- /dev/null
+++ b/sim4dbutils/summarizePolishes.C
@@ -0,0 +1,253 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "bio++.H"
+//#include "fasta.H"
+#include "sim4.H"
+
+using namespace std;
+#include <vector>
+
+//
+// Current ESTmapper generated summary is:
+//
+// GOOD: >= 95% identity, >= 80% composite, >= 0 bp
+// cDNA-genomic matches 28715039 matches (24921387 different cDNA and 81 genomic)
+// Matches per cDNA 1.1522 matches/cDNA
+// Matches per genomic 354506.6543 matches/genomic
+//
+// cDNA COUNTS:
+// cDNA: 27440540
+// cDNA-good: 24921387 ( 90.8196%)
+// cDNA-missing: 26071 ( 0.0950%)
+// cDNA-zero: 2493082 ( 9.0854%)
+//
+//
+// New format / summary should be
+//
+//
+// X% identity coverage: 50 55 60 65 70 75 80 85 90 95 100
+// sequence-genomic matches %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u
+// Unique sequences %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u
+// Matches per sequence %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u
+// Unique genomic %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u
+// Matches per genomic %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u
+//
+// usage:
+//
+// Report raw numbers at 90, 95, 99 percent identity, 50, 60, 70, 80, 90,
+// 100 percent coverage:
+// summarizePolishes -i 90 95 99 -c 50 60 70 80 90 100 -p polishes-file
+//
+// Report percentages at same
+// summarizePolishes -i 90 95 99 -c 50 60 70 80 90 100 -nf cdna-file -p polishes-file
+// summarizePolishes -i 90 95 99 -c 50 60 70 80 90 100 -n num-seqs -p polishes-file
+//
+// Read from stdin, default to 95 percent identity, 50 percent coverage:
+// summarizePolishes -p -
+//
+// Buckets? Cumulative? Both? If we do buckets with size 1, we'll
+// use lots of space, but be fast. Compute correct bucket sizes on
+// output. 101*101 entries, 6 million sequences -> 190GB.
+//
+// So keep sorted list of values, find first bucket that is <= the
+// match we have. 792MB for the example below (3 %i, 11 %c, 6 million
+// seqs).
+//
+// Read in all %i,%c. Compute each identity x coverage pair
+// separately. 48MB for scores + 24MB for a pair. Memory efficient,
+// maybe not compute efficient.
+//
+
+// One alignment, reduced to the four values this tool tallies.  Filled in
+// by readMatches() from a sim4polish.
+struct match {
+  uint32  _estid,    _genid;      // from the polish's _estID and _genID
+  uint32  _identity, _coverage;   // from _percentIdentity and _querySeqIdentity
+};
+
+
+
+// Load every alignment from 'filename' into 'matches', keeping only the two
+// sequence ids, the percent identity and the coverage of each.
+//
+//   filename - polishes file to read; a null pointer or "-" selects stdin
+//   matches  - output; any previous contents are discarded
+//
+// Progress and the final count are reported on stderr.
+//
+void
+readMatches(char *filename,
+            vector<match> &matches) {
+  bool               useStdin = (filename == 0L) || (strcmp(filename, "-") == 0);
+  sim4polishReader  *reader   = 0L;
+
+  if (useStdin) {
+    fprintf(stderr, "Reading matches from 'stdin'\n");
+    reader = new sim4polishReader("-");
+  } else {
+    fprintf(stderr, "Reading matches from '%s'\n", filename);
+    reader = new sim4polishReader(filename);
+  }
+
+  matches.clear();
+
+  // nextAlignment() delivers one polish at a time through 'aln'.
+  for (sim4polish *aln = 0L; reader->nextAlignment(aln); ) {
+    match  rec;
+
+    rec._estid    = aln->_estID;
+    rec._genid    = aln->_genID;
+    rec._identity = aln->_percentIdentity;
+    rec._coverage = aln->_querySeqIdentity;
+
+    matches.push_back(rec);
+  }
+
+  delete reader;
+
+  fprintf(stderr, "read %d matches.\n", (int)matches.size());
+}
+
+
+
+// For every requested (identity, coverage) threshold pair, report how many
+// alignments meet both thresholds, how many do not, and how many distinct
+// cDNA and genomic sequences the passing alignments touch.
+//
+// Options (matched by prefix, exactly as before):
+//   -p file      polishes to read ("-" for stdin); required
+//   -n num       number of sequences (currently only echoed to stderr)
+//   -nf file     sequence file (currently only echoed to stderr)
+//   -i val ...   identity thresholds; defaults to 95
+//   -c val ...   coverage thresholds; defaults to 50
+//   -e           tab-separated output with a header line
+//
+// Returns 0 on success; exits with status 1 on a usage error.
+//
+int
+main(int argc, char **argv) {
+  char    *polishesFile = 0L;
+  uint32   numSeqs      = 0;
+  char    *sequenceFile = 0L;
+  uint32   idLen        = 0;
+  uint32   id[101]      = { 0 };
+  uint32   cvLen        = 0;
+  uint32   cv[101]      = { 0 };
+  bool     formatExcel  = false;
+
+  // Capacity of the threshold arrays; used to reject over-long -i / -c lists
+  // instead of writing past the end of the arrays.
+  const uint32 idMax = sizeof(id) / sizeof(id[0]);
+  const uint32 cvMax = sizeof(cv) / sizeof(cv[0]);
+
+  if (argc == 1) {
+    fprintf(stderr, "usage: %s [-excel] [-p polishes-file] [-n num-seqs | -nf seq-file] [-i val ...] [-c val ...]\n", argv[0]);
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-polishes", 2) == 0) {
+      polishesFile = argv[++arg];
+    } else if (strncmp(argv[arg], "-n", 3) == 0) {
+      // atoi() on the terminating null argv entry is undefined.
+      if (argv[arg+1] == 0L) {
+        fprintf(stderr, "ERROR: -n needs a value.\n");
+        exit(1);
+      }
+      numSeqs = atoi(argv[++arg]);
+    } else if (strncmp(argv[arg], "-nf", 3) == 0) {
+      sequenceFile = argv[++arg];
+    } else if (strncmp(argv[arg], "-identity", 2) == 0) {
+      // Consume values up to the next option or the end of argv.
+      arg++;
+      while ((argv[arg]) && (argv[arg][0] != '-')) {
+        if (idLen >= idMax) {
+          fprintf(stderr, "ERROR: Too many identity values; at most " uint32FMT " are allowed.\n", idMax);
+          exit(1);
+        }
+        id[idLen++] = atoi(argv[arg++]);
+      }
+      arg--;
+    } else if (strncmp(argv[arg], "-coverage", 2) == 0) {
+      arg++;
+      while ((argv[arg]) && (argv[arg][0] != '-')) {
+        if (cvLen >= cvMax) {
+          fprintf(stderr, "ERROR: Too many coverage values; at most " uint32FMT " are allowed.\n", cvMax);
+          exit(1);
+        }
+        cv[cvLen++] = atoi(argv[arg++]);
+      }
+      arg--;
+    } else if (strncmp(argv[arg], "-excel", 2) == 0) {
+      formatExcel=true;
+    }
+    arg++;
+  }
+
+  if (polishesFile == 0L) {
+    fprintf(stderr, "ERROR: No polishes file specified!\n");
+    exit(1);
+  }
+
+  if (idLen == 0) {
+    fprintf(stderr, "WARNING: Defaulting to 95%% identity.\n");
+    id[idLen++] = 95;
+  }
+
+  if (cvLen == 0) {
+    fprintf(stderr, "WARNING: Defaulting to 50%% coverage.\n");
+    cv[cvLen++] = 50;
+  }
+
+
+  fprintf(stderr, "Polishes:      %s\n", polishesFile);
+  fprintf(stderr, "numSeqs:       " uint32FMT "\n", numSeqs);
+  // Passing a null pointer for %s is undefined, so substitute a placeholder.
+  fprintf(stderr, "sequenceFile:  %s\n", sequenceFile ? sequenceFile : "(none)");
+  fprintf(stderr, "ids:           " uint32FMT " -- ", idLen);
+  for (uint32 i=0; i<idLen; i++)
+    fprintf(stderr, " " uint32FMT "", id[i]);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "cvs:           " uint32FMT " -- ", cvLen);
+  for (uint32 i=0; i<cvLen; i++)
+    fprintf(stderr, " " uint32FMT "", cv[i]);
+  fprintf(stderr, "\n");
+
+  vector<match>  matches;
+
+  readMatches(polishesFile, matches);
+
+  //  Find the largest cDNA and genomic idx
+  //
+  uint32  estmax = 0;
+  uint32  genmax = 0;
+  for (uint32 i=0; i<matches.size(); i++) {
+    if (estmax < matches[i]._estid)
+      estmax = matches[i]._estid;
+    if (genmax < matches[i]._genid)
+      genmax = matches[i]._genid;
+  }
+
+  //  Turn the largest index into an array length.
+  estmax++;
+  genmax++;
+
+  //  Allocate space for statistics
+  //
+  uint32  *estcounts = new uint32 [estmax];
+  uint32  *gencounts = new uint32 [genmax];
+  uint32   mapped;
+  uint32   notmapped;
+  uint32   uniqest;
+  uint32   uniqgen;
+
+  if (formatExcel) {
+    fprintf(stdout, "identity\tcoverage\tmapped\tnotmapped\tuniqest\tuniqgen\n");
+    fflush(stdout);
+  }
+
+  //  Foreach identity and each coverage, find how many things
+  //  are above that level.
+  //
+  for (uint32 i=0; i<idLen; i++) {
+    for (uint32 c=0; c<cvLen; c++) {
+      mapped    = 0;
+      notmapped = 0;
+      for (uint32 z=0; z<estmax; z++)
+        estcounts[z] = 0;
+      for (uint32 z=0; z<genmax; z++)
+        gencounts[z] = 0;
+
+      for (uint32 z=0; z<matches.size(); z++) {
+        if ((id[i] <= matches[z]._identity) &&
+            (cv[c] <= matches[z]._coverage)) {
+          mapped++;
+          estcounts[ matches[z]._estid ]++;
+          gencounts[ matches[z]._genid ]++;
+        } else {
+          notmapped++;
+        }
+      }
+
+      uniqest = 0;
+      uniqgen = 0;
+
+      for (uint32 z=0; z<estmax; z++)
+        if (estcounts[z])
+          uniqest++;
+      for (uint32 z=0; z<genmax; z++)
+        if (gencounts[z])
+          uniqgen++;
+
+      if (formatExcel) {
+        fprintf(stdout, uint32FMT "\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\t" uint32FMT "\n", id[i], cv[c], mapped, notmapped, uniqest, uniqgen);
+        fflush(stdout);
+      } else {
+        fprintf(stdout, uint32FMTW(3) " " uint32FMTW(3) ": mapped=" uint32FMTW(8) " notmapped=" uint32FMTW(8) " est=" uint32FMTW(8) " gen=" uint32FMTW(8) "\n", id[i], cv[c], mapped, notmapped, uniqest, uniqgen);
+        fflush(stdout);
+      }
+    }
+  }
+
+  delete [] estcounts;
+  delete [] gencounts;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/test/parsesnp-correct-parsed b/sim4dbutils/test/parsesnp-correct-parsed
new file mode 100644
index 0000000..77c7688
--- /dev/null
+++ b/sim4dbutils/test/parsesnp-correct-parsed
@@ -0,0 +1,8 @@
+rev_comp_end_of_first_exon genomicTestChunk 51 C/X complement global[98 98] exon[2 0 98 49]
+forward_end_of_first_exon genomicTestChunk 51 C/X forward global[98 98] exon[2 0 98 49]
+rev_comp_start_of_second_exon genomicTestChunk 643 C/Y complement global[98 98] exon[2 1 98 48]
+forward_start_of_second_exon genomicTestChunk 643 C/Y forward global[98 98] exon[2 1 98 48]
+reverse_end_of_second_exon genomicTestChunk 692 g/g complement global[98 98] exon[2 1 98 48]
+forward_end_of_second_exon genomicTestChunk 692 g/g forward global[98 98] exon[2 1 98 48]
+reverse_start_of_first_exon genomicTestChunk 1 t/t complement global[98 98] exon[2 0 98 49]
+forward_start_of_first_exon genomicTestChunk 1 t/t forward global[98 98] exon[2 0 98 49]
diff --git a/sim4dbutils/test/parsesnp-gen.fasta b/sim4dbutils/test/parsesnp-gen.fasta
new file mode 100644
index 0000000..1d86a93
--- /dev/null
+++ b/sim4dbutils/test/parsesnp-gen.fasta
@@ -0,0 +1,10 @@
+>genomicTestChunk
+TGACCACTGGCCCCTTGTCAATGGGCTTGCCGGAGCCATTGAAAACCCGAXCTGAGGGTGGACGAGGAGTGTTGCAGGGT
+GCTCAGGCTAGCCCTGTGTCCCTCACTACTGTCTACCCTCCACACCACCACCAGCTCCCACCCACTCCCCACAGGAGTGC
+CCTGTATCCCCCTCCCCGGCAGCCCACAGGTAACCCAGAGAGCCAGCTACAAGGACTGTCCTGTGAGAGTCTTCCTTCCT
+CTCCTGGAAACCTTTACAGGCAAGGCCTTGGCCCAGGACCATGACTCTAATGGGGGATTCCAGGACTAGAGAGAGGAGAG
+GGCCAGGCCAGGTCTTGGGAGAGAATTAGGGGATATTCAAGGCTTAGCAGTCCCTTTCAGCCTGGCTCCAAATAGGTGGC
+TACCTACTGTGTCTCTAGTTTATTGAACCCACCTTCCTCATTTATTGAACACCTGTATGTCAGACCCTGAGCTGGGGTCA
+GGAATCAGAGAGAAGACATACCCCAGACCTCAAAGAGCCCACCAGAGAGACAGGCAGGAAGTAAACAGGAAGTGACAGTG
+TAGTGTGCTGAGGGTTTGGTCAGAGAAGCTGGGCTGGGAGGGCAGAGGAGCACCCCCACCCCTGCCCCTCCCTGTCCCTC
+ACYCAGCATGTCCTCTGACACCGGAGTTCGTAGGATGTCCCCTGTAAATTCG
diff --git a/sim4dbutils/test/parsesnp-snp.fasta b/sim4dbutils/test/parsesnp-snp.fasta
new file mode 100644
index 0000000..1302fed
--- /dev/null
+++ b/sim4dbutils/test/parsesnp-snp.fasta
@@ -0,0 +1,32 @@
+>rev_comp_end_of_first_exon /pos=50 /size=1 /genotype=G_T
+CGAATTTACAGGGGACATCCTACGAACTCCGGTGTCAGAGGACATGCTGG
+G
+TCGGGTTTTCAATGGCTCCGGCAAGCCCATTGACAAGGGGCCAGTGGTCA
+>forward_end_of_first_exon /pos=50 /size=1 /genotype=G_T
+TGACCACTGGCCCCTTGTCAATGGGCTTGCCGGAGCCATTGAAAACCCGA
+C
+CCAGCATGTCCTCTGACACCGGAGTTCGTAGGATGTCCCCTGTAAATTCG
+>rev_comp_start_of_second_exon /pos=49 /size=1 /genotype=G_T
+CGAATTTACAGGGGACATCCTACGAACTCCGGTGTCAGAGGACATGCTGG
+G
+TCGGGTTTTCAATGGCTCCGGCAAGCCCATTGACAAGGGGCCAGTGGTCA
+>forward_start_of_second_exon /pos=51 /size=1 /genotype=G_T
+TGACCACTGGCCCCTTGTCAATGGGCTTGCCGGAGCCATTGAAAACCCGA
+C
+CCAGCATGTCCTCTGACACCGGAGTTCGTAGGATGTCCCCTGTAAATTCG
+>reverse_end_of_second_exon /pos=0 /size=1 /genotype=G_T
+CGAATTTACAGGGGACATCCTACGAACTCCGGTGTCAGAGGACATGCTGG
+G
+TCGGGTTTTCAATGGCTCCGGCAAGCCCATTGACAAGGGGCCAGTGGTCA
+>forward_end_of_second_exon /pos=100 /size=1 /genotype=G_T
+TGACCACTGGCCCCTTGTCAATGGGCTTGCCGGAGCCATTGAAAACCCGA
+C
+CCAGCATGTCCTCTGACACCGGAGTTCGTAGGATGTCCCCTGTAAATTCG
+>reverse_start_of_first_exon /pos=100 /size=1 /genotype=G_T
+CGAATTTACAGGGGACATCCTACGAACTCCGGTGTCAGAGGACATGCTGG
+G
+TCGGGTTTTCAATGGCTCCGGCAAGCCCATTGACAAGGGGCCAGTGGTCA
+>forward_start_of_first_exon /pos=0 /size=1 /genotype=G_T
+TGACCACTGGCCCCTTGTCAATGGGCTTGCCGGAGCCATTGAAAACCCGA
+C
+CCAGCATGTCCTCTGACACCGGAGTTCGTAGGATGTCCCCTGTAAATTCG
diff --git a/sim4dbutils/trimExons.C b/sim4dbutils/trimExons.C
new file mode 100644
index 0000000..94bae23
--- /dev/null
+++ b/sim4dbutils/trimExons.C
@@ -0,0 +1,210 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "sim4reader.h"
+
+#define SHOWTRIMMING
+
+// Usage text, used by main() as an fprintf format; the single %s receives argv[0].
+char const *usage =
+"usage: %s [-save trimmedFile]\n"
+" -savetrimming Saves a before/after of each trimmed match.\n"
+" All matches are printed to stdout (untrimmed and trimmed).\n"
+"\n";
+
+// Decide whether a terminal (first or last) exon should be trimmed.
+//
+//   dist - genomic distance between the terminal exon and its neighbor
+//   qual - percent identity of the terminal exon
+//   size - length of the terminal exon on the cDNA
+//   log  - optional file for the override note; may be null
+//   msg  - note written to 'log' when a keep decision is overridden
+//
+// An exon is trimmed when it is far from its neighbor (>= 100000) and short
+// for that distance, or regardless of length when its identity is below 85
+// and the distance is at least 10000.  Returns 1 to trim, 0 to keep.
+//
+static
+int
+shouldTrimTerminalExon(int dist, int qual, int size, FILE *log, const char *msg) {
+  int trim = 1;
+
+  if (dist < 100000)
+    trim = 0;
+
+  if (size >= 50)
+    trim = 0;
+
+  if (size >= 25 + (int)((dist - 100000) * 25.0 / 900000.0))
+    trim = 0;
+
+  if ((qual >= 98) &&
+      (size >= 25 + (int)((dist - 100000) * 25.0 / 1400000.0)))
+    trim = 0;
+
+  //  Reverse our decision if the exon is of low quality.  The note is only
+  //  written when a log file was requested; writing to a null FILE crashes.
+  //
+  if ((qual < 85) && (dist >= 10000)) {
+    if ((trim == 0) && (log != 0L))
+      fprintf(log, "%s", msg);
+    trim = 1;
+  }
+
+  return(trim);
+}
+
+
+// Read polishes from stdin, drop suspicious first and/or last exons, and
+// print every polish (trimmed or not) to stdout.
+//
+//   -savetrimming file   also write a before/after record of each trimmed
+//                        polish to 'file'
+//   -verbose             report progress on stderr
+//
+// Both stdin and stdout must be redirected.  Returns 0 on success; exits
+// with status 1 on a usage error or if 'file' cannot be opened.
+//
+int
+main(int argc, char ** argv) {
+  int          arg               = 1;
+  FILE        *trimmedFile       = 0L;
+  int          beVerbose         = 0;
+  sim4polish  *p;
+  int          polishesProcessed = 0;
+  int          polishesTrimmed   = 0;
+
+  if (isatty(fileno(stdin)) || isatty(fileno(stdout))) {
+    fprintf(stderr, usage, argv[0]);
+
+    if (isatty(fileno(stdin)))
+      fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+
+    if (isatty(fileno(stdout)))
+      fprintf(stderr, "error: Please redirect the polishes to a file.\n (They are on stdout)\n\n");
+
+    exit(1);
+  }
+
+  arg = 1;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-savetrimming", 2) == 0) {
+      arg++;
+
+      //  The option needs a file name.
+      if (argv[arg] == 0L) {
+        fprintf(stderr, usage, argv[0]);
+        exit(1);
+      }
+
+      //  fopen() reports failure through its return value; errno is only
+      //  meaningful once that failure is known.
+      errno = 0;
+      trimmedFile = fopen(argv[arg], "w");
+      if (trimmedFile == 0L) {
+        fprintf(stderr, "Can't open '%s' for writing\n%s\n", argv[arg], strerror(errno));
+        exit(1);
+      }
+    } else if (strncmp(argv[arg], "-verbose", 2) == 0) {
+      beVerbose = 1;
+    }
+
+    arg++;
+  }
+
+  while ((p = readPolish(stdin)) != 0L) {
+    int trimFirst = 0;
+    int trimLast  = 0;
+
+    /*  Decide if we need to trim anything
+     */
+    if (p->numExons > 1) {
+      int exA;
+      int exB;
+      int dist;
+      int qual;
+      int size;
+
+      exA  = 0;  //  First exon
+      exB  = 1;  //  Second exon
+      dist = p->exons[exB].genFrom - p->exons[exA].genTo + 1;
+      qual = p->exons[exA].percentIdentity;
+      size = p->exons[exA].estTo - p->exons[exA].estFrom + 1;
+
+      //  The message text (including its spelling) is unchanged so the
+      //  log output stays the same.
+      trimFirst = shouldTrimTerminalExon(dist, qual, size, trimmedFile,
+                                         "Trimming frist exon based only on percent ID\n");
+
+      exA  = p->numExons - 1;  //  Last exon
+      exB  = p->numExons - 2;  //  Second to last
+      dist = p->exons[exA].genFrom - p->exons[exB].genTo + 1;
+      qual = p->exons[exA].percentIdentity;
+      size = p->exons[exA].estTo - p->exons[exA].estFrom + 1;
+
+      trimLast = shouldTrimTerminalExon(dist, qual, size, trimmedFile,
+                                        "Trimming last exon based only on percent ID\n");
+    }
+
+    if (trimmedFile && (trimFirst || trimLast)) {
+      fprintf(trimmedFile, "------------------------------------------------------------BEFORE\n");
+      printPolish(trimmedFile, p);
+    }
+
+
+    if (beVerbose) {
+      polishesProcessed++;
+      if (trimFirst || trimLast)
+        polishesTrimmed++;
+      if ((polishesProcessed % 10000) == 0) {
+        fprintf(stderr, " %d processed, %d trimmed (%8.5f%%)\r",
+                polishesProcessed, polishesTrimmed,
+                100.0 * (double)polishesTrimmed / (double)polishesProcessed);
+        fflush(stderr);
+      }
+    }
+
+
+    //  If there is one intron, and we've been asked to remove
+    //  either the first or the last (it should say to remove
+    //  both), then remove the shorter of the two.
+    //
+    if ((trimFirst || trimLast) && (p->numExons == 2)) {
+      trimFirst = 0;
+      trimLast  = 0;
+
+      if ((p->exons[0].estTo - p->exons[0].estFrom) > (p->exons[1].estTo - p->exons[1].estFrom))
+        trimLast = 1;
+      else
+        trimFirst = 1;
+    }
+
+
+
+    //  Remove the first exon, by circularly shifting the list of
+    //  exons.  The exon trimmed from the start is moved to the end of
+    //  the exon list.
+    //
+    if (trimFirst) {
+      int             i;
+      sim4polishExon  save;
+
+      memcpy(&save, p->exons+0, sizeof(sim4polishExon));
+
+      for (i=1; i<p->numExons; i++)
+        memcpy(p->exons+i-1, p->exons+i, sizeof(sim4polishExon));
+
+      memcpy(p->exons+p->numExons-1, &save, sizeof(sim4polishExon));
+
+      p->numExons--;
+    }
+
+
+    //  Trimming the last exon is easy; just decrement the size of the
+    //  list.
+    //
+    if (trimLast) {
+      p->numExons--;
+
+      //  We also need to clear the intron orientation flag in the new
+      //  last exon
+      //
+      p->exons[p->numExons-1].intronOrientation = INTRON_NONE;
+    }
+
+    if (trimmedFile && (trimFirst || trimLast)) {
+      fprintf(trimmedFile, "------------------------------------------------------------AFTER\n");
+      printPolish(trimmedFile, p);
+      fprintf(trimmedFile, "============================================================EOP\n");
+    }
+
+    printPolish(stdout, p);
+
+    //  Insert the exons back in, so they will be destroyed properly.
+    //
+    if (trimFirst)  p->numExons++;
+    if (trimLast)   p->numExons++;
+
+    destroyPolish(p);
+  }
+
+  return(0);
+}
diff --git a/sim4dbutils/trimSequencesBasedOnMatches.C b/sim4dbutils/trimSequencesBasedOnMatches.C
new file mode 100644
index 0000000..301ac62
--- /dev/null
+++ b/sim4dbutils/trimSequencesBasedOnMatches.C
@@ -0,0 +1,138 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+#include "bri++.H"
+#include "sim4reader.h"
+
+// Trim each sequence to the span covered by its polishes and write the
+// result to stdout.
+//
+//   -sequence s.fasta      sequences to trim
+//   -polishes p.polished   polishes giving the covered range of each sequence
+//
+// For every sequence the kept range runs from the smallest first-exon start
+// to the largest last-exon end over all of its polishes.  A sequence with
+// no polish is written out whole.  Returns 0 on success; exits with status
+// 1 on a usage error, an unreadable polishes file, or a polish that refers
+// to a sequence index not present in the sequence file.
+//
+int
+main(int argc, char ** argv) {
+  FastA        *seqs = 0L;
+  FastABuffer   seqsbuffer;
+  FILE         *pfile = 0L;
+  sim4polish   *p = 0L;
+
+  if (argc == 1) {
+    fprintf(stderr, "usage: %s -sequence s.fasta -polishes p.polished\n", argv[0]);
+    exit(1);
+  }
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-sequence", 2) == 0) {
+      seqs = new FastA(argv[++arg], true);
+    } else if (strncmp(argv[arg], "-polishes", 2) == 0) {
+      errno = 0;
+      pfile = fopen(argv[++arg], "r");
+      if (errno) {
+        fprintf(stderr, "Can't open '%s': %s\n", argv[arg], strerror(errno));
+        exit(1);
+      }
+    } else {
+      //  The format has two %s, so it needs both the program name and the
+      //  offending option.
+      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
+    }
+
+    arg++;
+  }
+
+  if (seqs == 0L) {
+    fprintf(stderr, "error: you need to specify '-sequence s.fasta'\n");
+    exit(1);
+  }
+
+  if (pfile == 0L) {
+    fprintf(stderr, "error: you need to specify '-polishes p.polished'\n");
+    exit(1);
+  }
+
+  uint32   numseqs = seqs->numberOfSequences();
+  uint32  *lrange  = new uint32 [numseqs];
+  uint32  *hrange  = new uint32 [numseqs];
+
+  //  Start with an empty (inverted) range for every sequence.
+  for (uint32 i=0; i<numseqs; i++) {
+    lrange[i] = ~uint32ZERO;
+    hrange[i] =  uint32ZERO;
+  }
+
+  uint32 numRead = 0;
+
+  while ((p = readPolish(pfile)) != 0L) {
+
+    //  estID indexes lrange[] and hrange[]; a polish from a different
+    //  sequence set would otherwise write outside the arrays.
+    if (p->estID >= numseqs) {
+      fprintf(stderr, "error: polish refers to sequence %u, but there are only %u sequences.\n",
+              (uint32)p->estID, numseqs);
+      exit(1);
+    }
+
+    if (lrange[p->estID] > p->exons[0].estFrom-1)
+      lrange[p->estID] = p->exons[0].estFrom-1;
+
+    if (hrange[p->estID] < p->exons[p->numExons-1].estTo)
+      hrange[p->estID] = p->exons[p->numExons-1].estTo;
+
+    numRead++;
+    if ((numRead & 0xff) == 0) {
+      fprintf(stderr, "Reading matches: %u\r", numRead);
+      fflush(stderr);
+    }
+
+    destroyPolish(p);
+  }
+
+  fprintf(stderr, "\n");
+
+  uint32   seqcopylen = 128 * 1024;
+  uint32   defcopylen = 128 * 1024;
+  char    *seqcopy    = new char [seqcopylen + 1];
+  char    *defcopy    = new char [defcopylen];
+
+  seqs->first(seqsbuffer);
+
+  for (uint32 i=0; i<numseqs; i++, seqs->next(seqsbuffer)) {
+
+    //  If there is no polish for the sequence, just write the whole
+    //  thing out.  This is a hack, so that svi will run.
+    //
+    if (lrange[i] >= hrange[i]) {
+      lrange[i] = 0;
+      hrange[i] = seqsbuffer.sequenceLength();
+    }
+
+    if (lrange[i] < hrange[i]) {
+      //seqs->seek(seqsbuffer, i);
+
+      if (seqsbuffer.sequenceLength() > seqcopylen) {
+        delete [] seqcopy;
+
+        seqcopylen = seqsbuffer.sequenceLength() + 128 * 1024;
+        seqcopy    = new char [seqcopylen + 1];
+      }
+
+      for (uint32 j=0, k=lrange[i]; k<hrange[i]; j++, k++)
+        seqcopy[j] = seqsbuffer.sequence()[k];
+
+      seqcopy[hrange[i] - lrange[i]] = 0;
+
+      //  Mangle the defline: keep it up to the first whitespace.  The
+      //  length tests come first so the header is never read past its end
+      //  and defcopy always has room for the terminator.
+      //
+      uint32 j = 0;
+      for (j=0; (j < seqsbuffer.headerLength()) &&
+                (j < defcopylen - 1) &&
+                !isspace(seqsbuffer.header()[j]); j++)
+        defcopy[j] = seqsbuffer.header()[j];
+
+      defcopy[j] = 0;
+
+#if 0
+      fprintf(stdout, "%u] Trim from 0:%u to %u:%u\n",
+              i,
+              seqsbuffer.sequenceLength(),
+              lrange[i],
+              hrange[i]);
+#endif
+
+      fprintf(stdout, "%s trimmed to %u:%u\n%s\n", defcopy, lrange[i], hrange[i], seqcopy);
+    }
+
+    if ((i & 0x1ff) == 0) {
+      fprintf(stderr, "Writing trimmed sequences: %u\r", i);
+      fflush(stderr);
+    }
+  }
+
+  fprintf(stderr, "\n");
+
+  delete [] seqcopy;
+  delete [] defcopy;
+  delete [] lrange;
+  delete [] hrange;
+
+  return(0);
+}
diff --git a/sim4dbutils/uniqPolishes.C b/sim4dbutils/uniqPolishes.C
new file mode 100644
index 0000000..775b6d3
--- /dev/null
+++ b/sim4dbutils/uniqPolishes.C
@@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "sim4.H"
+
+
+// Write out a group of polishes that share one estID, then release them.
+//
+//   W    - writer receiving the selected polishes
+//   p    - the pNum polishes of the group; every one is deleted on return
+//   pNum - number of polishes in the group
+//   uniq - non-zero: emit the polish only when the group has exactly one
+//          member; zero: emit every member, but only for groups with more
+//          than one member
+//
+void
+pickBest(sim4polishWriter *W, sim4polish **p, int pNum, int uniq) {
+  int i;
+
+  if (pNum == 1) {
+    if (uniq)
+      W->writeAlignment(p[0]);
+  } else {
+    //  Emit each member of the group.  Indexing with p[0] here wrote the
+    //  first polish pNum times and dropped all the others.
+    if (!uniq)
+      for (i=0; i<pNum; i++)
+        W->writeAlignment(p[i]);
+  }
+
+  for (i=0; i<pNum; i++)
+    delete p[i];
+}
+
+
+
+
+// Filter polishes read from stdin by how many share an estID, writing the
+// survivors to stdout.
+//
+//   -uniq   (default) keep polishes whose estID group has one member
+//   -dupl   keep polishes whose estID group has several members
+//   -gff3   write GFF3 output
+//
+// A group is a run of consecutive polishes with the same estID, so the
+// input presumably has to be sorted by estID -- TODO confirm with callers.
+// Returns 0 on success; exits with status 1 if stdin is a terminal.
+//
+int
+main(int argc, char **argv) {
+  uint32       pNum   = 0;
+  uint32       pAlloc = 8388608;
+  uint32       estID  = ~uint32ZERO;
+
+  uint32       uniq   = 1;
+
+  sim4polishStyle style = sim4polishStyleDefault;
+
+  int arg = 1;
+  while (arg < argc) {
+    if        (strncmp(argv[arg], "-uniq", 2) == 0) {
+      uniq = 1;
+    } else if (strncmp(argv[arg], "-dupl", 2) == 0) {
+      uniq = 0;
+    } else if (strcmp(argv[arg], "-gff3") == 0) {
+      style = sim4polishGFF3;
+    } else {
+      fprintf(stderr, "unknown option: %s\n", argv[arg]);
+    }
+    arg++;
+  }
+
+  if (isatty(fileno(stdin))) {
+    fprintf(stderr, "usage: %s [-uniq | -dupl] [-gff3] < file > file\n", argv[0]);
+    fprintf(stderr, "error: I cannot read polishes from the terminal!\n\n");
+    exit(1);
+  }
+
+  //  Read polishes, picking the best when we see a change in
+  //  the estID.
+
+  sim4polishWriter  *W = new sim4polishWriter("-", style);
+  sim4polishReader  *R = new sim4polishReader("-");
+  sim4polish       **p = new sim4polish * [pAlloc];
+  sim4polish        *q = 0L;
+
+  if (R->getsim4polishStyle() != style)
+    fprintf(stderr, "warning: input format and output format differ.\n");
+
+  while (R->nextAlignment(q)) {
+    if ((q->_estID != estID) && (pNum > 0)) {
+      pickBest(W, p, pNum, uniq);
+      pNum  = 0;
+    }
+
+    //  Grow the array.  The saved pointers are copied from the old array
+    //  into the new one; the arguments were previously reversed, which
+    //  overwrote the saved polishes with uninitialized memory.
+    if (pNum >= pAlloc) {
+      sim4polish **P = new sim4polish * [pAlloc * 2];
+      memcpy(P, p, sizeof(sim4polish *) * pAlloc);
+      delete [] p;
+      p = P;
+      pAlloc *= 2;
+    }
+
+    p[pNum++] = q;
+    estID     = q->_estID;
+
+    q = 0L;  //  Else we'll delete the polish we just saved!
+  }
+
+  if (pNum > 0)
+    pickBest(W, p, pNum, uniq);
+
+  delete [] p;
+  delete    R;
+  delete    W;
+
+  return(0);
+}
+
diff --git a/sim4dbutils/vennPolishes.C b/sim4dbutils/vennPolishes.C
new file mode 100644
index 0000000..f505019
--- /dev/null
+++ b/sim4dbutils/vennPolishes.C
@@ -0,0 +1,192 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "bio++.H"
+#include "sim4.H"
+
+
+// Usage text, used by main() as an fprintf format; the single %s receives argv[0].
+const char *usage =
+"usage: %s [options] <polishes-file-1> <polishes-file-2> ...\n"
+"\n"
+" Given n sets of sim4 polishes (of the same set of cDNA to the same\n"
+" set of genomic, but this isn't enforced) this code will generate a\n"
+" Venn diagram of how the sequences map.\n"
+"\n"
+" -n <num-seqs> there are <num-seqs> in the input set\n"
+" -i <min-ident> filter matches to be >= <min-ident> identity\n"
+" default = 95\n"
+" -c <min-cover> filter matches to be >= <min-cover> coverage\n"
+" default = 50\n"
+" -d <class-id> dump the sequence IIDs in <class-id> to stdout\n"
+"\n"
+" -plot write a plot-able datafile of the venn diagram\n"
+" for percent identity <min-idenit> to 100 (inclusive)\n"
+" and <min-cover> coverage.\n";
+
+
+// Yes, yes. Tell me all about how bad globals are.
+//
+// These are set by main() and read by doVenn().  Note that doVenn() takes
+// parameters named minI and minC, which hide the globals of the same name
+// inside that function.
+//
+// minI: minimum percent identity (-i).
+uint32 minI = 95;
+// minC: minimum coverage (-c).
+uint32 minC = 50;
+// foundMax: length of each found[] row (-n); estIDs must be below this.
+uint32 foundMax = 100000;
+// dumpIID: class whose members doVenn() prints (-d); all ones means "none".
+uint32 dumpIID = ~uint32ZERO;
+// numArgs: index in argv of the first polishes file.
+int numArgs = 0;
+// plot: true when -plot was given.
+bool plot = false;
+// numFiles: number of polishes files (argc - numArgs).
+uint32 numFiles = 0;
+// found[file][estID]: highest percent identity recorded for that sequence
+// in that file, or 0 if nothing was recorded.
+uint32 **found = 0L;
+// indexMax: number of classes, 1 << numFiles.
+uint32 indexMax = 0;
+// counts[class]: number of sequences in each class; indexMax entries.
+uint32 *counts = 0L;
+// sizes[file]: number of sequences each file maps; numFiles entries.
+uint32 *sizes = 0L;
+
+
+// Recompute the Venn tallies at identity threshold 'minI'.
+//
+// Fills sizes[f] with the number of sequences whose recorded identity in
+// file f is at least minI, and counts[c] with the number of sequences in
+// class c, where bit f of c is set when file f has the sequence at or above
+// the threshold.  Sequences in class dumpIID are printed to stdout.
+//
+// 'minC' is accepted but not used here.  Both parameters hide the globals
+// of the same name.
+//
+void
+doVenn(uint32 minI, uint32 minC) {
+
+  //  Per-file totals.
+  for (uint32 f=0; f<numFiles; f++) {
+    uint32  *row   = found[f];
+    uint32   tally = 0;
+
+    for (uint32 s=0; s<foundMax; s++)
+      tally += (row[s] >= minI) ? 1 : 0;
+
+    sizes[f] = tally;
+  }
+
+  //  Clear the per-class counters.
+  uint32 cls = 0;
+  while (cls < indexMax)
+    counts[cls++] = 0;
+
+  //  Classify every sequence.  'class' is a reserved word, so the bit
+  //  vector of files containing the sequence is called 'bits'.
+  //
+  for (uint32 seq=0; seq<foundMax; seq++) {
+    uint32 bits = 0;
+
+    for (uint32 f=0; f<numFiles; f++)
+      if (found[f][seq] >= minI)
+        bits |= 1 << f;
+
+    if (bits == dumpIID)
+      fprintf(stdout, uint32FMT"\n", seq);
+
+    counts[bits] += 1;
+  }
+}
+
+
+
+// Build and print a Venn diagram of which sequences are mapped by which of
+// the given polishes files.
+//
+// Options must come before the file names; see the usage text for the list.
+// Three output modes:
+//   -d class   print the ids of the sequences in that class
+//   -plot      one line per identity from -i up to 100
+//   (default)  per-file totals followed by the count of every class
+//
+// Returns 0 on success; exits with status 1 on a usage error or when a
+// polish has an estID of foundMax or more.
+//
+int
+main(int argc, char **argv) {
+
+  if ((argc < 5)) {
+    fprintf(stderr, usage, argv[0]);
+    exit(1);
+  }
+
+  int arg=1;
+  while (arg < argc) {
+    if        (strcmp(argv[arg], "-n") == 0) {
+      foundMax = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-i") == 0) {
+      minI = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-c") == 0) {
+      minC = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-d") == 0) {
+      dumpIID = atoi(argv[++arg]);
+    } else if (strcmp(argv[arg], "-plot") == 0) {
+      plot = true;
+    } else {
+      //  Assume we got all the options, and we are at a file.
+      //
+      numArgs = arg;
+      arg     = argc;
+    }
+
+    arg++;
+  }
+
+  //  numArgs is still zero when every argument was an option; without this
+  //  test argv[0] and the options would be treated as polishes files.
+  if (numArgs == 0) {
+    fprintf(stderr, usage, argv[0]);
+    fprintf(stderr, "\nERROR: No polishes files supplied.\n");
+    exit(1);
+  }
+
+  numFiles = argc - numArgs;
+
+  //  Classes are indexed by a bit per file built with '1 << file', so the
+  //  file count has to stay below the width of an int.
+  if (numFiles > 30) {
+    fprintf(stderr, "ERROR: You gave me " uint32FMT " files; I can handle at most 30.\n", numFiles);
+    exit(1);
+  }
+
+  found = new uint32 * [numFiles];
+
+  if (numFiles > 16) {
+    fprintf(stderr, "WARNING: You gave me " uint32FMT " files! That's pretty big. I don't know\n", numFiles);
+    fprintf(stderr, " if I'm up to it. Fasten seat belts and hang on!\n");
+  }
+
+  for (int fileArg=numArgs; fileArg<argc; fileArg++) {
+    uint32 *row = new uint32 [foundMax];
+
+    fprintf(stderr, "Reading '%s'\n", argv[fileArg]);
+
+    found[fileArg-numArgs] = row;
+
+    for (uint32 i=0; i<foundMax; i++)
+      row[i] = 0;
+
+    //  Open the named file.  This used to open "-" for every file, so all
+    //  the data was read from stdin and the named files were never touched.
+    sim4polishReader *R = new sim4polishReader(argv[fileArg]);
+    sim4polish       *p = 0L;
+
+    while (R->nextAlignment(p)) {
+      if ((p->_percentIdentity  >= minI) &&
+          (p->_querySeqIdentity >= minC)) {
+
+        if (p->_estID >= foundMax) {
+          fprintf(stderr, "Please increase foundMax, or make me reallocate storage.\n");
+          exit(1);
+        }
+
+        //  Remember the best identity seen for this sequence.
+        if (row[p->_estID] < p->_percentIdentity)
+          row[p->_estID] = p->_percentIdentity;
+      }
+    }
+
+    delete R;
+  }
+
+
+  //  There are 2^n categories for n files.
+  //
+  //  If A and B, then there is
+  //
+  //    A B
+  //    0 0 - neither (we can't compute this)
+  //    0 1 - only B
+  //    1 0 - only A
+  //    1 1 - both A and B
+  //
+  //  So, we make an array of size 2^n that holds the counts of each
+  //  class.  It's indexed by a bit vector.
+  //
+  indexMax = 1 << numFiles;
+  counts   = new uint32 [indexMax];
+  sizes    = new uint32 [numFiles];
+
+  if (dumpIID != ~uint32ZERO) {
+    doVenn(minI, minC);
+  } else if (plot) {
+    for (uint32 id=minI; id <= 100; id++) {
+      doVenn(id, minC);
+
+      fprintf(stdout, uint32FMTW(3) " ", id);
+      for (uint32 i=0; i<numFiles; i++)
+        fprintf(stdout, uint32FMTW(8) " ", sizes[i]);
+      for (uint32 index=0; index < indexMax; index++) {
+        for (uint32 dataset=0; dataset < numFiles; dataset++)
+          fprintf(stdout, "%c", (index & (1 << dataset)) ? 'A' + (char)dataset : '-');
+        fprintf(stdout, " " uint32FMTW(8) " ", counts[index]);
+      }
+      fprintf(stdout, "\n");
+    }
+  } else {
+    doVenn(minI, minC);
+
+    for (uint32 i=0; i<numFiles; i++)
+      fprintf(stdout, "%c = (" uint32FMTW(8) " total) %s\n", 'A' + (char)i, sizes[i], argv[i+numArgs]);
+
+    for (uint32 index=0; index < indexMax; index++) {
+      fprintf(stdout, uint32FMTW(4) " [", index);
+      for (uint32 dataset=0; dataset < numFiles; dataset++)
+        fprintf(stdout, "%c", (index & (1 << dataset)) ? 'A' + (char)dataset : '-');
+      fprintf(stdout, "] " uint32FMT "\n", counts[index]);
+    }
+  }
+
+  return(0);
+}
diff --git a/snapper/Make.include b/snapper/Make.include
new file mode 100644
index 0000000..54e4fa4
--- /dev/null
+++ b/snapper/Make.include
@@ -0,0 +1,33 @@
+# -*- makefile -*-
+
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+
+src := $/snapper2.C \
+ $/configuration.C \
+ $/thr-search.C \
+ $/thr-filter.C \
+ $/thr-polish.C \
+ $/thr-polish-dp.C \
+ $/hitMatrix.C \
+ $/hitMatrix-sort.C \
+ $/snapper2.H
+
+
+$/.CXX_SRCS := $(filter %.C,${src})
+$/.CXX_EXES := $/snapper2
+
+$/.CLEAN :=$/*.o
+
+$(eval $/%.d $/%.o: CXXFLAGS+= -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBMERYL/} -I${LIBKMER/} -I${LIBSIM4/})
+
+$/snapper2: ${$/.CXX_SRCS:.C=.o} \
+ ${LIBSIM4/}libsim4.a \
+ ${LIBKMER/}libkmer.a \
+ ${LIBMERYL/}libmeryl.a \
+ ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a \
+ ${LIBUTL/}libutil.a
diff --git a/snapper/configuration.C b/snapper/configuration.C
new file mode 100644
index 0000000..60b7516
--- /dev/null
+++ b/snapper/configuration.C
@@ -0,0 +1,275 @@
+#include "snapper2.H"
+
+
+
+// Set every option to its default.  configuration::read() then overrides
+// these from the command line; the option that sets each group is noted.
+//
+configuration::configuration() {
+
+  //  Reporting and run mode (-verbose, -buildonly).
+  _beVerbose            = false;
+  _buildOnly            = false;
+
+  //  Mers (-mersize, -compression, -template, -merskip).
+  _KBmerSize            = 20;
+  _KBcompression        = 0;
+  _KBspacingTemplate    = 0L;
+  _merSkip              = 0;
+
+  //  Searching (-numthreads, -forward, -reverse).
+  _numSearchThreads     = 4;
+  _doForward            = true;
+  _doReverse            = true;
+
+  //  Validation and alignment output (-validate, -aligns, -noaligns).
+  _doValidation         = false;
+  _doValidationFileName = 0L;
+  _doAlignments         = false;
+
+  //  Hit filter (-setfilter / -lhv / -LHV).
+  _Lo                   = 0.5;
+  _Hi                   = 1.0;
+  _Va                   = 0.6;
+
+  //  Hit construction (-maxdiagonal, -minhitlength, -minhitcoverage).
+  _maxDiagonal          = 25;
+  _minHitLength         = 0;
+  _minHitCoverage       = 0.2;
+
+  //  Alternate match extension scheme (-extendweight, -extendminimum,
+  //  -extendmaximum, -repeatthreshold).
+  _extendWeight         = 2.0;
+  _extendMinimum        = 100;
+  _extendMaximum        = 2000;
+  _repeatThreshold      = 3;
+
+  //  Match acceptance (-minmatchidentity, -minmatchcoverage).
+  _minMatchIdentity     = 98;
+  _minMatchCoverage     = 96;
+
+  //  The "af" settings (-af, -afthreshold, -aflength, -afinit).
+  _afEnabled            = false;
+  _afThreshold          = 0.25;
+  _afLength             = 64;
+  _afInit               = 5;
+
+  //  Polishing (-discardexonlength, -discardexonquality, -dp).
+  _discardExonLength    = 64;
+  _discardExonQuality   = 90;
+  _splitMatches         = true;
+  _polishOptimally      = false;
+
+  //  Inputs (-genomic, -positions, -queries).
+  _dbFileName           = 0L;
+  _psFileName           = 0L;
+  _qsFileName           = 0L;
+
+  //  Mer masking (-mask, -only, -ignore, -maskn, -onlyn).
+  _maskFileName         = 0L;
+  _onlyFileName         = 0L;
+  _ignoreThreshold      = 0;
+  _maskPrefix           = 0L;
+  _maskThreshold        = 0;
+  _onlyPrefix           = 0L;
+  _onlyThreshold        = 0;
+
+  //  Outputs (-output, -log, -stats).
+  _outputFileName       = 0L;
+  _logmsgFileName       = 0L;
+  _statsFileName        = 0L;
+}
+
+// Nothing to do here: the destructor body is empty.
+configuration::~configuration() {}
+
+
+
+
+void
+configuration::read(int argc, char **argv) {
+
+ int arg = 1;
+ int err = 0;
+ while (arg < argc) {
+ if (strcmp(argv[arg], "-mersize") == 0) {
+ _KBmerSize = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-merskip") == 0) {
+ _merSkip = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-compression") == 0) {
+ _KBcompression = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-template") == 0) {
+ _KBspacingTemplate = argv[++arg];
+ } else if (strcmp(argv[arg], "-numthreads") == 0) {
+ _numSearchThreads = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-ignore") == 0) {
+ _ignoreThreshold = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-mask") == 0) {
+ _maskFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-only") == 0) {
+ _onlyFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-maskn") == 0) {
+ _maskPrefix = argv[++arg];
+ _maskThreshold = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-onlyn") == 0) {
+ _onlyPrefix = argv[++arg];
+ _onlyThreshold = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-queries") == 0) {
+ _qsFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-genomic") == 0) {
+ _dbFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-positions") == 0) {
+ _psFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-buildonly") == 0) {
+ _buildOnly = argv[arg];
+ } else if (strcmp(argv[arg], "-forward") == 0) {
+ _doForward = true;
+ _doReverse = false;
+ } else if (strcmp(argv[arg], "-reverse") == 0) {
+ _doReverse = true;
+ _doForward = false;
+ } else if (strcmp(argv[arg], "-validate") == 0) {
+ _doValidation = true;
+ _doValidationFileName = argv[++arg];
+ } else if ((strcmp(argv[arg], "-setfilter") == 0) ||
+ (strcmp(argv[arg], "-lhv") == 0) ||
+ (strcmp(argv[arg], "-LHV") == 0)) {
+ _Lo = atof(argv[++arg]);
+ _Hi = atof(argv[++arg]);
+ _Va = atof(argv[++arg]);
+ } else if (strcmp(argv[arg], "-verbose") == 0) {
+ _beVerbose = true;
+ } else if (strcmp(argv[arg], "-output") == 0) {
+ _outputFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-aligns") == 0) {
+ _doAlignments = true;
+ } else if (strcmp(argv[arg], "-noaligns") == 0) {
+ _doAlignments = false;
+ } else if (strcmp(argv[arg], "-log") == 0) {
+ _logmsgFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-stats") == 0) {
+ _statsFileName = argv[++arg];
+ } else if (strcmp(argv[arg], "-dp") == 0) {
+ _polishOptimally = true;
+ } else if (strcmp(argv[arg], "-maxdiagonal") == 0) {
+ _maxDiagonal = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-minhitlength") == 0) {
+ _minHitLength = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-minhitcoverage") == 0) {
+ _minHitCoverage = atof(argv[++arg]);
+ } else if (strcmp(argv[arg], "-minmatchidentity") == 0) {
+ _minMatchIdentity = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-minmatchcoverage") == 0) {
+ _minMatchCoverage = strtouint32(argv[++arg], 0L);
+
+ } else if (strcmp(argv[arg], "-af") == 0) {
+ _afEnabled = true;
+ } else if (strcmp(argv[arg], "-afthreshold") == 0) {
+ _afThreshold = atof(argv[++arg]);
+ _afEnabled = true;
+ } else if (strcmp(argv[arg], "-aflength") == 0) {
+ _afLength = strtouint32(argv[++arg], 0L);
+ _afEnabled = true;
+ } else if (strcmp(argv[arg], "-afinit") == 0) {
+ _afInit = strtouint32(argv[++arg], 0L);
+ _afEnabled = true;
+
+ } else if (strcmp(argv[arg], "-discardexonlength") == 0) {
+ _discardExonLength = strtouint32(argv[++arg], 0L);
+ } else if (strcmp(argv[arg], "-discardexonquality") == 0) {
+ _discardExonQuality = strtouint32(argv[++arg], 0L);
+ } else if (strncmp(argv[arg], "-extendweight", 8) == 0) {
+ _extendWeight = atof(argv[++arg]);
+ } else if (strncmp(argv[arg], "-extendminimum", 8) == 0) {
+ _extendMinimum = strtouint32(argv[++arg], 0L);
+ } else if (strncmp(argv[arg], "-extendmaximum", 8) == 0) {
+ _extendMaximum = strtouint32(argv[++arg], 0L);
+ } else if (strncmp(argv[arg], "-repeatthreshold", 8) == 0) {
+ _repeatThreshold = strtouint32(argv[++arg], 0L);
+
+ } else {
+ fprintf(stderr, "Unknown option '%s'\n", argv[arg]);
+ err++;
+ }
+
+ arg++;
+ }
+
+ //
+ // Make sure some constraints are met
+ //
+
+ if (_maskFileName && _onlyFileName)
+ fprintf(stderr, "ERROR: At most one of -mask and -only may be used.\n"), err++;
+
+ if (_merSkip >= _KBmerSize)
+ fprintf(stderr, "ERROR: Mers are not adjacent; make sure merskip <= mersize.\n"), err++;
+
+ if ((_KBcompression) || (_KBspacingTemplate))
+ fprintf(stderr, "ERROR: Mer compression and spacing not supported right now. :-(\n"), err++;
+
+ if ((_afThreshold < 0) || (_afThreshold > 1.0))
+ fprintf(stderr, "ERROR: Invalid afThreshold %f, should be 0.0 <= t <= 1.0\n", _afThreshold), err++;
+
+ if (64 < _afLength)
+ fprintf(stderr, "ERROR: Invalid afLength "uint32FMT", should be < 64.\n", _afLength), err++;
+
+ if ((_qsFileName == 0L) && (_buildOnly == false))
+ fprintf(stderr, "ERROR: No query file supplied.\n"), err++;
+
+ if (_dbFileName == 0L)
+ fprintf(stderr, "ERROR: No genome file supplied.\n"), err++;
+
+ //
+ // Be helpful.
+ //
+
+ if (err) {
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Algorithm Options:\n");
+ fprintf(stderr, " -forward Search only the normal cDNA.\n");
+ fprintf(stderr, " -reverse Search only the reverse-complement cDNA.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -mersize k Use k-mers.\n");
+ fprintf(stderr, " -merskip l Skip l mers between.\n");
+ fprintf(stderr, " -compression c Compress homopolymer runs to c letters.\n");
+ fprintf(stderr, " -template t Use spaced seed template t.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -dp Optimially polish (broken)\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -maxdiagonal d Maximum diagonal gap within a hit (25).\n");
+ fprintf(stderr, " -minhitlength l Minimum length for a hit to be polished (0).\n");
+ fprintf(stderr, " -minhitcoverage c Minimum coverage for a hit to be polished (0.2, 0.0 to 1.0).\n");
+ fprintf(stderr, " -minmatchidentity i Minimum percent identity for matches (98, integer).\n");
+ fprintf(stderr, " -minmatchcoverage c Minimum coverage for matches (96, integer).\n");
+ fprintf(stderr, " -discardexonlength l Discard exons less than l bp long (64).\n");
+ fprintf(stderr, " -discardexonquality p Discard exons less than p percent identity (90).\n");
+ fprintf(stderr, " -extendweight w For each unhit base, extend by this much (2).\n");
+ fprintf(stderr, " -extendminimum e Extend hits by at least this much (100).\n");
+ fprintf(stderr, " -extendmaximum e Extend hits by at most this much (2000).\n");
+ fprintf(stderr, " -repeatthreshold t Tune hits to expect t local repeat count (3).\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Filter and Filter Validation:\n");
+ fprintf(stderr, " -setfilter L H V Use { L,H,V } as the filter parameters.\n");
+ fprintf(stderr, " -validate Enable tuning of the filter (expensive!).\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Masking Options:\n");
+ fprintf(stderr, " -ignore n Ignore mers with count more than n.\n");
+ fprintf(stderr, " -mask f Ignore (only use) all mers listed in file f.\n");
+ fprintf(stderr, " -only f\n");
+ fprintf(stderr, " -maskn f n Ignore (only use) the mers listed in meryl prefix f.\n");
+ fprintf(stderr, " -onlyn f n For mask, mers with count >= n are masked.\n");
+ fprintf(stderr, " For only, mers with count <= n are used.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Input Options:\n");
+ fprintf(stderr, " -queries c.fasta Query sequences.\n");
+ fprintf(stderr, " -genomic g.fasta Database sequences.\n");
+ fprintf(stderr, " -positions p.positionDB Build and save / use positionDB. Assumes you aren't using -use.\n");
+ fprintf(stderr, " -buildonly Only do the build and save.\n");
+ fprintf(stderr, " -use [...]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Process Options:\n");
+ fprintf(stderr, " -numthreads n Use n search threads.\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Output Options:\n");
+ fprintf(stderr, " -verbose Entertain the user with useless statistics.\n");
+ fprintf(stderr, " -output f Write output to file f.\n");
+ fprintf(stderr, " -{no}aligns Enable/Disable full alignments. Enabled by default.\n");
+ fprintf(stderr, " -log f Write some debugging/logging information to file f. This\n");
+ fprintf(stderr, " is mostly for developers, and does NOT provide useful\n");
+ fprintf(stderr, " information unless you know the guts of snapper.\n");
+ fprintf(stderr, " -stats f Write resource usage statistics to f.\n");
+ exit(1);
+ }
+}
diff --git a/snapper/eval/domap14.sh b/snapper/eval/domap14.sh
new file mode 100644
index 0000000..7adc3a3
--- /dev/null
+++ b/snapper/eval/domap14.sh
@@ -0,0 +1,91 @@
+#!/bin/sh
+#$ -p -666
+#$ -j y
+#$ -o domap14.$TASK_ID.out
+#$ -cwd
+#$ -N snapper2test
+#$ -A snapper2test
+
+# gn -- pick one of 1 different genome sequences
+# fg -- pick one of 8 different fragment sets
+# ms -- fixed at 28
+# mk -- pick one of 19 different mer skips
+# ig -- pick one of 14 different mer thresholds
+#
+# input sequences are assumed to be in the workdirectory below, and all
+# output ends up there.
+
+workd=/project/huref0/assembly/chr14
+
+decodeJob() {
+ nn=$1
+
+ xx=`expr $nn % 1 + 1`
+ gn=`echo chr14 | cut -d' ' -f $xx`
+ nn=`expr $nn / 1`
+
+ xx=`expr $nn % 8 + 1`
+ fg=`echo f1 f2 f3 f4 f5 f6 f7 f8 | cut -d' ' -f $xx`
+ nn=`expr $nn / 8`
+
+ xx=`expr $nn % 1 + 1`
+ ms=`echo 28 | cut -d' ' -f $xx`
+ nn=`expr $nn / 1`
+
+ xx=`expr $nn % 19 + 1`
+ mk=`echo 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 | cut -d' ' -f $xx`
+ nn=`expr $nn / 19`
+
+ xx=`expr $nn % 14 + 1`
+ ig=`echo 0000 0001 0002 0004 0008 0016 0032 0064 0128 0256 0512 1024 2048 4096 | cut -d' ' -f $xx`
+ nn=`expr $nn / 14`
+
+ name=`echo $gn.$fg.ms$ms.mk$mk.ig$ig`
+}
+
+
+
+if [ x$1 = "xcheck" ] ; then
+ shift
+ mm=1
+ jj=`expr 1 \* 8 \* 1 \* 19 \* 14`
+ echo "Checking $jj jobs."
+ while [ $mm -lt $jj ] ; do
+ decodeJob $mm
+ if [ ! -e $workd/$name.stats ] ; then
+ echo qsub -t $mm domap14.sh
+ fi
+ mm=`expr $mm + 1`
+ done
+ exit
+fi
+
+
+
+# Grid mode: SGE task IDs start at 1, job indices start at 0.
+nn=$(expr $SGE_TASK_ID - 1)
+decodeJob $nn
+
+# Run the job unless its stats file is already in the work directory.
+# Results are written to /scratch and moved into the work directory only
+# if every preceding step succeeded; the stats file is moved last.
+if [ ! -e $workd/$name.stats ] ; then
+  $workd/src/genomics/snapper/snapper2 \
+    -verbose \
+    -queries $workd/$fg.fasta \
+    -genomic $workd/$gn.fasta \
+    -mersize $ms \
+    -merskip $mk \
+    -ignore $ig \
+    -minmatchidentity 95 \
+    -minmatchcoverage 50 \
+    -numthreads 2 \
+    -noaligns \
+    -output /scratch/$name.sim4db \
+    -validate /scratch/$name.validate \
+    -stats /scratch/$name.stats &&
+  bzip2 -9v /scratch/$name.sim4db &&
+  mv /scratch/$name.sim4db.bz2 $workd/$name.sim4db.bz2 &&
+  mv /scratch/$name.validate $workd/$name.validate &&
+  mv /scratch/$name.stats $workd/$name.stats
+fi
diff --git a/snapper/eval/filter-validate.pl b/snapper/eval/filter-validate.pl
new file mode 100644
index 0000000..2d6c00c
--- /dev/null
+++ b/snapper/eval/filter-validate.pl
@@ -0,0 +1,60 @@
+#!/bin/perl
+
+#
+# Read in the output of a snapper2 validation run, print the highest
+# specificity for each distinct sensitivity.
+#
+
+# Number of distinct sensitivities to report per file.  If the first
+# argument is not an existing file, it is taken to be this number.
+my $numToShow = 10;
+if (! -e $ARGV[0]) {
+  $numToShow = shift @ARGV;
+}
+
+# The column header is printed once, taken from the first file read.
+my $hdrshown = 0;
+
+foreach my $file (@ARGV) {
+  my %spec;   # sensitivity -> highest specificity seen for it
+  my %line;   # sensitivity -> the input line that had that specificity
+
+  # Report unreadable files instead of silently printing nothing for them.
+  if (!open(F, "< $file")) {
+    print STDERR "Couldn't open '$file': $!\n";
+    next;
+  }
+
+  my $hdr = <F>;
+  $hdr = "" if (!defined($hdr));
+  chomp $hdr;
+
+  while (<F>) {
+    chomp;
+
+    my @vals = split '\s+', $_;
+
+    #  3 -> sensitivity
+    #  4 -> specificity
+    my ($sens, $spc) = ($vals[3], $vals[4]);
+
+    # Record the first row seen for a sensitivity explicitly; comparing
+    # against a missing entry would drop rows whose specificity is 0.
+    if (!exists($spec{$sens}) || ($spec{$sens} < $spc)) {
+      $spec{$sens} = $spc;
+      $line{$sens} = $_;
+    }
+  }
+
+  close(F);
+
+  if ($hdrshown == 0) {
+    print " $hdr\n";
+    $hdrshown = 1;
+  }
+
+  my $label = substr("$file ", 0, 40);
+
+  # Highest sensitivity first.  Only truncate the list; assigning to $#
+  # unconditionally would pad a short list with undef entries and print
+  # empty rows.
+  my @sortedK = sort { $b <=> $a } keys %spec;
+  $#sortedK = $numToShow - 1 if (scalar(@sortedK) > $numToShow);
+
+  # Plain print: the data must not be interpreted as a printf format.
+  foreach my $k (@sortedK) {
+    print "$label$k $spec{$k} -- $line{$k}\n";
+  }
+}
diff --git a/snapper/hitMatrix-sort.C b/snapper/hitMatrix-sort.C
new file mode 100644
index 0000000..48483a4
--- /dev/null
+++ b/snapper/hitMatrix-sort.C
@@ -0,0 +1,57 @@
+#include "snapper2.H"
+
+// Sort by dsPos
+
+// Sift-down step of the heapsort on dPos.
+//
+// The element at index p of the n-element array L is moved down until
+// neither child has a larger val.dPos, which restores the max-heap
+// ordering below p when both child subtrees are already heaps.
+//
+inline
+void
+adjustHeap_dsPos(diagonalLine *L, uint32 p, uint32 n) {
+  const uint64  saved    = L[p].all;        //  The element being placed...
+  const uint64  savedPos = L[p].val.dPos;   //  ...and its sort key.
+
+  for (uint32 child = 2 * p + 1; child < n; child = 2 * p + 1) {
+
+    //  Use whichever child has the larger key.
+    if ((child + 1 < n) && (L[child].val.dPos < L[child+1].val.dPos))
+      child++;
+
+    //  Done once the saved element is no smaller than that child.
+    if (savedPos >= L[child].val.dPos)
+      break;
+
+    //  Otherwise pull the child up into the hole and descend.
+    L[p].all = L[child].all;
+    p        = child;
+  }
+
+  L[p].all = saved;
+}
+
+// Heapsort the _hitsLen entries of _hits into ascending val.dPos order.
+//
+void
+hitMatrix::sort_dsPos(void) {
+
+  //  Zero or one hit needs no sorting.
+  if (_hitsLen <= 1)
+    return;
+
+  //  Phase 1: build the heap, sifting down every internal node from the
+  //  last one back to the root.
+  //
+  for (uint32 node = _hitsLen / 2; node > 0; )
+    adjustHeap_dsPos(_hits, --node, _hitsLen);
+
+  //  Phase 2: repeatedly swap the maximum (the root) with the last
+  //  element of the shrinking heap, then repair the heap.
+  //
+  for (uint32 last = _hitsLen - 1; last > 0; last--) {
+    const uint64 top = _hits[0].all;
+    _hits[0].all     = _hits[last].all;
+    _hits[last].all  = top;
+
+    adjustHeap_dsPos(_hits, 0, last);
+  }
+}
diff --git a/snapper/hitMatrix.C b/snapper/hitMatrix.C
new file mode 100644
index 0000000..933c652
--- /dev/null
+++ b/snapper/hitMatrix.C
@@ -0,0 +1,426 @@
+#include "snapper2.H"
+
+#define MINCOUNT 3
+
+// Create an empty hit matrix.  The four arguments are stored unchanged in
+// _qsLen, _qsMers, _qsIdx and _theLog; the hit array starts with room for
+// eight entries and the match list starts empty.
+//
+hitMatrix::hitMatrix(uint32 qsLen, uint32 qsMers, uint32 qsIdx, logMsg *theLog) {
+
+  //  What we were told by the caller.
+  _qsLen   = qsLen;
+  _qsMers  = qsMers;
+  _qsIdx   = qsIdx;
+  _theLog  = theLog;
+
+  //  No matches yet.
+  _matches = 0L;
+
+  //  No hits yet, but space for a few.
+  _hitsMax = 8;
+  _hitsLen = 0;
+  _hits    = new diagonalLine [_hitsMax];
+}
+
+// Release the hit array allocated by the constructor.
+// NOTE(review): the _matches list is not freed here; presumably filter()
+// has always drained it before destruction -- confirm.
+hitMatrix::~hitMatrix() {
+ delete [] _hits;
+}
+
+
+// Extend a chained region on the genomic side and insert it into the
+// _matches list.
+//
+//   isunique   -- passed through to the new trapMatch
+//   qsLo, qsHi -- query extent of the region
+//   dsLo, dsHi -- genomic extent of the region; extended here
+//   IL, ML     -- passed through to the new trapMatch
+//
+// The low end is extended by config._extendWeight * qsLo and the high end
+// by config._extendWeight * (_qsLen - qsHi); each extension is clamped to
+// [config._extendMinimum, config._extendMaximum].  The list is kept in
+// descending _dsHi order.
+//
+void
+hitMatrix::addMatch(uint32         isunique,
+                    uint32         qsLo,
+                    uint32         qsHi,
+                    uint32         dsLo,
+                    uint32         dsHi,
+                    merCovering   *IL,
+                    merList       *ML) {
+  uint32 offset = 0;
+
+  //  Extend the low end, stopping at zero instead of wrapping around.
+  //
+  offset = (uint32)(config._extendWeight * qsLo);
+  if (offset < config._extendMinimum)
+    offset = config._extendMinimum;
+  if (offset > config._extendMaximum)
+    offset = config._extendMaximum;
+  if (dsLo < offset)
+    dsLo = 0;
+  else
+    dsLo -= offset;
+
+  //  Extend the high end.
+  //
+  offset = (uint32)(config._extendWeight * (_qsLen - qsHi));
+  if (offset < config._extendMinimum)
+    offset = config._extendMinimum;
+  if (offset > config._extendMaximum)
+    offset = config._extendMaximum;
+  dsHi += offset;
+
+
+  //  Create a new match
+  //
+  //  n = new match
+  //  m = current match
+  //  l = last match
+  //
+  trapMatch *n = new trapMatch(isunique, qsLo, qsHi, dsLo, dsHi, IL, ML);
+
+  //  The spaces around uint32FMT are required: without them C++11 and
+  //  later read the macro name as a user-defined-literal suffix.
+  //
+#ifdef SHOW_HITMATRIX
+  _theLog->add("chained: Q::" uint32FMT "-" uint32FMT "(" uint32FMT ") G::" uint32FMT "-" uint32FMT "(" uint32FMT ")\n",
+               qsLo, qsHi, qsHi - qsLo,
+               dsLo, dsHi, dsHi - dsLo);
+#endif
+
+  //  And find a home for it in the list.  No merging of matches is done here.  It's
+  //  too hard.
+  //
+  if ((_matches == 0L) || (n->_dsHi > _matches->_dsHi)) {
+    //  New head of the list.
+    n->_next = _matches;
+    _matches = n;
+  } else {
+    trapMatch *l = _matches;
+    trapMatch *m = _matches->_next;
+
+    //  Skip every match that ends after the new one.
+    while ((m) && (n->_dsHi < m->_dsHi)) {
+      l = m;
+      m = m->_next;
+    }
+
+    n->_next = m;
+    l->_next = n;
+  }
+}
+
+
+// Utility for sorting the diagonal lines in the hitMatrix
+//
+// The two comparison functions return true if the first line
+// is less than the second line.
+
+// Line-vs-line form: nonzero if A orders before B.  Lines are ordered by
+// diagonal (qsLen - qPos - 1 + dPos), ties broken by the smaller qPos.
+//
+inline
+int
+compareLines(diagonalLine *A, diagonalLine *B, uint32 qsLen) {
+  const uint32 diagA = qsLen - A->val.qPos - 1 + A->val.dPos;
+  const uint32 diagB = qsLen - B->val.qPos - 1 + B->val.dPos;
+
+  if (diagA != diagB)
+    return(diagA < diagB);
+
+  return(A->val.qPos < B->val.qPos);
+}
+
+// Precomputed form: the first line is given as its diagonal l and query
+// position q.  Nonzero if it orders before B (smaller diagonal, or the
+// same diagonal and smaller qPos).
+//
+inline
+int
+compareLines(uint32 l, uint32 q, diagonalLine *B, uint32 qsLen) {
+  const uint32 diagB = qsLen - B->val.qPos - 1 + B->val.dPos;
+
+  if (l != diagB)
+    return(l < diagB);
+
+  return(q < B->val.qPos);
+}
+
+// Sift-down step of the heapsort on (diagonal, qPos).
+//
+// The element at index p of the n-element array L is moved down until
+// compareLines() no longer orders it before its larger child.
+//
+inline
+void
+adjustHeap(diagonalLine *L, int32 p, int32 n, uint32 qsLen) {
+  const uint64  saved     = L[p].all;                             //  The element being placed,
+  const uint32  savedQ    = L[p].val.qPos;                        //  its query position,
+  const uint32  savedDiag = qsLen - savedQ - 1 + L[p].val.dPos;   //  and its diagonal.
+
+  for (int32 child = 2 * p + 1; child < n; child = 2 * p + 1) {
+
+    //  Use whichever child orders later.
+    if ((child + 1 < n) && compareLines(L + child, L + child + 1, qsLen))
+      child++;
+
+    //  Done once the saved element does not order before that child.
+    if (!compareLines(savedDiag, savedQ, L + child, qsLen))
+      break;
+
+    //  Otherwise pull the child up into the hole and descend.
+    L[p].all = L[child].all;
+    p        = child;
+  }
+
+  L[p].all = saved;
+}
+
+
+
+
+// Convert the mer hits stored in _hits into output hits.
+//
+//   direction      -- 'f' sets the low bit of each output hit's _status
+//   minHitCoverage -- fraction of _qsLen that a cluster must cover
+//   minHitLength   -- absolute covered length that a cluster must reach;
+//                     the larger of the two limits is used
+//   theHits        -- output array; replaced by a larger one when full
+//   theHitsPos     -- number of entries used in theHits; advanced here
+//   theHitsMax     -- capacity of theHits; doubled whenever it is full
+//
+// The hits are sorted by genomic position and split per genomic sequence
+// using genomeMap.  Within a sequence:
+//
+//   Step 1 sorts the hits by diagonal and clusters them: a hit joins the
+//   current cluster when its diagonal is within config._maxDiagonal of the
+//   previous hit's.  A finished cluster becomes a match (addMatch) if its
+//   smallest 'uniq' value is <= MINCOUNT or it covers at least the minimum
+//   length.
+//
+//   Step 2 merges matches that lie within 5000 positions of each other and
+//   appends one aHit per merged group.  The merList is handed to the aHit
+//   (a->_ML); the merCovering is deleted here.
+//
+void
+hitMatrix::filter(char      direction,
+                  double    minHitCoverage,
+                  uint32    minHitLength,
+                  aHit    *&theHits,
+                  uint32   &theHitsPos,
+                  uint32   &theHitsMax) {
+
+  if (_hitsLen == 0)
+    return;
+
+  //  Decide on the minimum quality values; we pick the larger of
+  //  the fixed lengths, and the sequence length * coverage.
+  //
+  uint32 minLength = (uint32)(minHitCoverage * _qsLen);
+  if (minLength < minHitLength)
+    minLength = minHitLength;
+
+  //  First, sort by the dsPos.  This is done so that we can find all the hits for
+  //  a specific scaffold.
+  //
+  sort_dsPos();
+
+  //  Now, while there are hits left....
+  //
+  uint32  firstHit   = 0;
+  uint32  lastHit    = 0;
+  uint32  currentSeq = 0;
+
+  //
+  //  Step 1:  Sort the mer-hits, chain, promote decent ones to matches
+  //
+
+  while (firstHit < _hitsLen) {
+
+    //  Move the currentSeq until the firstHit is below it.  After
+    //  this loop, currentSeq is the sequence AFTER the one that we
+    //  want hits in.
+    //
+    while ((currentSeq < genomeMap->numberOfSequences()) &&
+           (genomeMap->startOf(currentSeq) <= _hits[firstHit].val.dPos))
+      currentSeq++;
+
+    //  Find the first hit that is in currentSeq.  If this is the last sequence,
+    //  then, of course, all remaining hits are in it.
+    //
+    if (currentSeq < genomeMap->numberOfSequences()) {
+      lastHit = firstHit + 1;
+      while ((lastHit < _hitsLen) &&
+             (_hits[lastHit].val.dPos < genomeMap->startOf(currentSeq)))
+        lastHit++;
+    } else {
+      lastHit = _hitsLen;
+    }
+
+    //  Drop back one sequence; this is the sequence the hits are in.
+    //
+    currentSeq--;
+
+    //  Adjust the hits to be relative to the start of this sequence
+    //
+    for (uint32 i=firstHit; i<lastHit; i++)
+      _hits[i].val.dPos -= genomeMap->startOf(currentSeq);
+
+    //  Sort them, if needed.
+    //
+    if (lastHit - firstHit > 1) {
+
+      //  We cheat; heapsort isn't too friendly to sorting the middle of
+      //  an array, so we make a new array in the middle!
+      //
+      diagonalLine  *hitsToSort = _hits + firstHit;
+
+      //  Build the heap.  I initially thought this could be done at the
+      //  same time as the scan for the last hit, but it can't (easily)
+      //
+      for (int32 i=(lastHit - firstHit)/2 - 1; i>=0; i--)
+        adjustHeap(hitsToSort, i, lastHit - firstHit, _qsLen);
+
+      //  Sort the hits by diagonal.  This is the second part of
+      //  heap sort -- Interchange the new maximum with the element
+      //  at the end of the tree
+      //
+      for (uint32 i=lastHit - firstHit - 1; i>0; i--) {
+        uint64 v            = hitsToSort[i].all;
+        hitsToSort[i].all   = hitsToSort[0].all;
+        hitsToSort[0].all   = v;
+
+        adjustHeap(hitsToSort, 0, i, _qsLen);
+      }
+    }
+
+    //  Filter them
+    //
+    uint32  frstDiagonal = _qsLen - _hits[firstHit].val.qPos - 1 + _hits[firstHit].val.dPos;
+    uint32  lastDiagonal = frstDiagonal;
+    uint32  unique       = uint32ZERO;
+    uint32  qsLow        = _hits[firstHit].val.qPos;
+    uint32  qsHigh       = _hits[firstHit].val.qPos;
+    uint32  dsLow        = _hits[firstHit].val.dPos;
+    uint32  dsHigh       = _hits[firstHit].val.dPos;
+    uint32  minCount     = ~uint32ZERO;
+
+    merCovering *IL = new merCovering(config._KBmerSize);
+    merList     *ML = new merList();
+
+    for (uint32 i=firstHit; i<lastHit; i++) {
+      uint32 thisDiagonalID = _qsLen - _hits[i].val.qPos - 1 + _hits[i].val.dPos;
+
+      //  Unconditionally extend if the diagonal difference is small.
+      //
+      if (lastDiagonal + config._maxDiagonal >= thisDiagonalID) {
+        lastDiagonal = thisDiagonalID;
+        if (qsLow    > _hits[i].val.qPos)   qsLow    = _hits[i].val.qPos;
+        if (qsHigh   < _hits[i].val.qPos)   qsHigh   = _hits[i].val.qPos;
+        if (dsLow    > _hits[i].val.dPos)   dsLow    = _hits[i].val.dPos;
+        if (dsHigh   < _hits[i].val.dPos)   dsHigh   = _hits[i].val.dPos;
+        if (minCount > _hits[i].val.uniq)   minCount = _hits[i].val.uniq;
+        IL->addMer(_hits[i].val.qPos);
+        ML->addMer(_hits[i].val.qPos, _hits[i].val.dPos);
+        continue;
+      }
+
+      //  Doesn't look like these hits belong together.  Promote the hit
+      //  to a match if it's decent.
+
+      IL->merge();
+
+      if ((minCount <= MINCOUNT) || (minLength <= IL->sumOfLengths())) {
+        addMatch(minCount <= MINCOUNT,
+                 qsLow,
+                 qsHigh + config._KBmerSize,
+                 dsLow,
+                 dsHigh + config._KBmerSize,
+                 IL,
+                 ML);
+
+        //  The match now holds IL and ML; start fresh ones.
+        IL = new merCovering(config._KBmerSize);
+        ML = new merList();
+      } else {
+        //  Cluster rejected; reuse the containers.
+        IL->clear();
+        ML->clear();
+      }
+
+      //  Start the next cluster with this hit.
+      frstDiagonal = thisDiagonalID;
+      lastDiagonal = thisDiagonalID;
+      qsLow        = _hits[i].val.qPos;
+      qsHigh       = _hits[i].val.qPos;
+      dsLow        = _hits[i].val.dPos;
+      dsHigh       = _hits[i].val.dPos;
+      minCount     = _hits[i].val.uniq;
+
+      IL->addMer(_hits[i].val.qPos);
+      ML->addMer(_hits[i].val.qPos, _hits[i].val.dPos);
+    }
+
+    //  Save the final cluster?
+
+    IL->merge();
+
+    if ((minCount <= MINCOUNT) || (minLength <= IL->sumOfLengths())) {
+      addMatch(minCount <= MINCOUNT,
+               qsLow,
+               qsHigh + config._KBmerSize,
+               dsLow,
+               dsHigh + config._KBmerSize,
+               IL,
+               ML);
+    } else {
+      delete IL;
+      delete ML;
+    }
+
+
+    //
+    //  Step 2:  Merge matches into, sigh, hits, stuff them into the output
+    //
+
+
+    while (_matches) {
+
+      //  Save the current match, then delete it.
+      //
+      unique     = _matches->_unique;
+      dsLow      = _matches->_dsLo;
+      dsHigh     = _matches->_dsHi;
+      IL         = _matches->_IL;
+      ML         = _matches->_ML;
+
+      {
+        trapMatch *n = _matches;
+        _matches = _matches->_next;
+        delete n;
+      }
+
+      //  Assimilate as many of the remaining matches as possible.
+      //
+      //  Think of this as first reversing the list, then merging as
+      //  long as (dsHigh + 5000 > _matches->_dsLo).  But since we
+      //  don't reverse the list, we can map:
+      //    dsHigh           --> _matches->dsHi
+      //    _matches->_dsLo  --> dsLow
+      //  where dsHigh and dsLow are the values for the extended match.
+      //
+      while (_matches && (dsLow < _matches->_dsHi + 5000)) {
+
+        //  Combine the two merCoverings
+        //
+        IL->merge(_matches->_IL);
+        ML->merge(_matches->_ML);
+
+        delete _matches->_IL;
+        delete _matches->_ML;
+
+        //  The start of the new match might be after the start of the
+        //  merged region.  (Only rarely is it before)
+        //
+        //  The end of current match is always greater than the end of the
+        //  new match!
+        //
+        if (dsLow > _matches->_dsLo)
+          dsLow = _matches->_dsLo;
+
+        unique |= _matches->_unique;
+
+        {
+          trapMatch *n = _matches;
+          _matches = _matches->_next;
+          delete n;
+        }
+      }
+
+      //  Grow the output array if it is full.
+      //
+      if (theHitsPos >= theHitsMax) {
+        theHitsMax <<= 1;
+        aHit *o = 0L;
+        try {
+          o = new aHit [theHitsMax];
+        } catch (const std::bad_alloc &) {
+          fprintf(stderr, "hitMatrix::filter()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+          fprintf(stderr, "hitMatrix::filter()-- tried to extend output string from " uint32FMT " to " uint32FMT ".\n", theHitsPos, theHitsMax);
+          exit(1);
+        }
+        memcpy(o, theHits, theHitsPos * sizeof(aHit));
+        delete [] theHits;
+        theHits = o;
+      }
+
+      IL->merge();
+
+      aHit *a = theHits + theHitsPos++;
+
+      a->_status    = (direction == 'f');
+      a->_status   |= (unique ? AHIT_HAS_UNIQUE : 0);
+      a->_qsIdx     = _qsIdx;
+      a->_dsIdx     = genomeMap->IIDOf(currentSeq);
+      a->_dsLo      = dsLow;
+      a->_dsHi      = dsHigh;
+      a->_covered   = IL->sumOfLengths();
+      a->_matched   = IL->numberOfPieces();  //numberOfIntervals();
+      a->_numMers   = _qsMers;
+      a->_ML        = ML;
+
+      assert(a->_dsLo < a->_dsHi);
+
+#ifdef SHOW_HITMATRIX
+      _theLog->add("merged: G::" uint32FMT "-" uint32FMT "(" uint32FMT ") q:" uint32FMT " g:" uint32FMT " cov:" uint32FMT " mat:" uint32FMT " mer:" uint32FMT "\n",
+                   a->_dsLo, a->_dsHi, a->_dsHi - a->_dsLo,
+                   a->_qsIdx,
+                   a->_dsIdx,
+                   a->_covered, a->_matched, a->_numMers);
+#endif
+
+      delete IL;
+    }
+
+    //  All done with these hits.  Move to the next set.
+    //
+    firstHit = lastHit;
+  }
+}
+
diff --git a/snapper/snapper2-sge.pl b/snapper/snapper2-sge.pl
new file mode 100644
index 0000000..7c2f7e7
--- /dev/null
+++ b/snapper/snapper2-sge.pl
@@ -0,0 +1,171 @@
+#!/usr/bin/perl
+
+# Runs snapper2 on SGE, splitting both the genome and query sequences.
+
+use FindBin;
+
+# Settings, with their defaults.
+my ($genome, $query, $dir) = ("", "", "");
+my ($mask, $gseg, $qseg)   = (1000, 32, 32);
+my $check;
+
+# Directory holding this script; leaff and snapper2 are run from here.
+my $bin = $FindBin::Bin;
+
+# Options are matched by prefix; unknown ones are reported and skipped.
+while (@ARGV) {
+  $arg = shift @ARGV;
+
+  if    ($arg =~ m/^-genome/) { $genome = shift @ARGV; }
+  elsif ($arg =~ m/^-query/)  { $query  = shift @ARGV; }
+  elsif ($arg =~ m/^-dir/)    { $dir    = shift @ARGV; }
+  elsif ($arg =~ m/^-mask/)   { $mask   = shift @ARGV; }
+  elsif ($arg =~ m/^-gseg/)   { $gseg   = shift @ARGV; }
+  elsif ($arg =~ m/^-qseg/)   { $qseg   = shift @ARGV; }
+  elsif ($arg =~ m/^-check/)  { $check  = 1; }
+  else {
+    print STDERR "unknown option '$arg'\n";
+  }
+}
+
+# If we're checking the results, assume we are in the correct dir,
+# that the gen and qry dirs exist.
+#
+# NOTE(review): this check mode does no checking yet.  It counts the
+# partitions and loops over every genome/query pair, but the only action in
+# the loop is guarded by 'if (0)'.  Execution then continues into the normal
+# setup below.
+if (defined($check)) {
+ my ($gen, $qry) = countSequences($dir);
+
+ for (my $g=1; $g<=$gen; $g++) {
+ for (my $q=1; $q<=$qry; $q++) {
+ # Zero-pad the piece numbers to three digits.  This overwrites the loop
+ # variables themselves.
+ $g = substr("000$g", -3);
+ $q = substr("000$q", -3);
+
+ # Disabled placeholder; never executed.
+ if (0) {
+ print STDERR "Why am I doing this? I'm supposed to be checking overlap, not snapper....\n";
+ exit(1);
+ }
+ }
+ }
+}
+
+
+
+
+# Print usage and stop if any required argument is missing.  The three
+# variables start out as empty strings, so test for emptiness as well as
+# for undef (undef happens when an option is given without a value).
+if (!defined($genome) || ($genome eq "") ||
+    !defined($query)  || ($query  eq "") ||
+    !defined($dir)    || ($dir    eq "")) {
+  print STDERR "usage: $0 [arg]\n";
+  print STDERR " -genome x.fasta\n";
+  print STDERR " -query q.fasta\n";
+  print STDERR " -dir /path/to/work\n";
+  print STDERR " -mask kmer-limit (def: 1000)\n";
+  print STDERR " -gseg gseg (def: 32 segs, see leaff for format\n";
+  print STDERR " -qseg qseg (def: 32 segs, see leaff for format\n";
+  print STDERR " -check (check a run, assume we are in the /path/to/work\n";
+  exit(1);
+}
+
+# Both inputs must exist.
+(-e $genome) or die "Can't find genome '$genome'\n";
+(-e $query)  or die "Can't find queries '$query'\n";
+
+# Create the work directory if needed, then insist that it exists.
+(-d $dir) or system("mkdir $dir");
+(-d $dir) or die "Can't find '$dir'\n";
+
+# Partition the genome and the queries with leaff.  Each step is skipped
+# if its marker file exists; the marker is an empty file created after
+# leaff is run.
+unless (-e "$dir/gen/gen.partitioned") {
+  system("mkdir $dir/gen") unless (-d "$dir/gen");
+  system("$bin/leaff -F $genome --partition $dir/gen/gen $gseg");
+  open(F, "> $dir/gen/gen.partitioned");
+  close(F);
+}
+
+unless (-e "$dir/qry/qry.partitioned") {
+  system("mkdir $dir/qry") unless (-d "$dir/qry");
+  system("$bin/leaff -F $query --partition $dir/qry/qry $qseg");
+  open(F, "> $dir/qry/qry.partitioned");
+  close(F);
+}
+
+# Build indexes for everyone -- this prevents the grid jobs from
+# racing to build them. And it lets us count how many jobs to
+# submit.
+#
+my ($gen, $qry) = countSequences($dir);
+
+# Write the per-task job script.  One SGE task handles one (genome piece,
+# query piece) pair, decoded from SGE_TASK_ID.
+#
+# Every '$' that must reach the shell is escaped.  An unescaped $GPIECE or
+# $QPIECE would be interpolated by Perl (as an empty string), which made
+# the "already done" test look for a file that is never created.
+#
+open(F, "> $dir/run.sh") or die "Can't open '$dir/run.sh' for writing: $!\n";
+print F "#!/bin/sh\n";
+print F "PIECE=`expr \$SGE_TASK_ID - 1`\n";
+print F "GPIECE=`expr \$PIECE % $gen + 1`\n";
+print F "QPIECE=`expr \$PIECE / $gen + 1`\n";
+print F "GPIECE=`printf %03d \$GPIECE`\n";
+print F "QPIECE=`printf %03d \$QPIECE`\n";
+print F "scratchname=/scratch/\$\$-\$GPIECE-\$QPIECE\n";
+print F "\n";
+print F "ulimit -c 0\n";
+print F "#rm /scratch/[0-9]*-[0-9]*-[0-9]*\n";
+print F "#echo \$GPIECE \$QPIECE \$PIECE\n";
+print F "\n";
+print F "if [ -e $dir/map-gen\$GPIECE-qlt\$QPIECE.success ] ; then\n";
+print F " echo map-gen\$GPIECE-qlt\$QPIECE already done\n";
+print F " exit\n";
+print F "fi\n";
+print F "\n";
+print F "$bin/snapper2 -verbose \\\n";
+print F " -mersize 22 -merskip 0 \\\n";
+print F " -minhitlength 22 -minhitcoverage 0.0 \\\n";
+#print F " -setfilter 0.1500 0.1500 0.2500 \\\n";
+print F " -validate $dir/map-gen\$GPIECE-qlt\$QPIECE.validate \\\n";
+print F " -genomic $dir/gen/gen-\$GPIECE.fasta \\\n";
+print F " -queries $dir/qry/qry-\$QPIECE.fasta \\\n";
+print F " -ignore $mask \\\n";
+#print F " -output \$scratchname \\\n";
+print F " -noaligns \\\n";
+print F " -numthreads 2 \\\n";
+print F " -minmatchidentity 90 \\\n";
+print F " -minmatchcoverage 4 \\\n";
+print F " -loaderhighwatermark 1024 \\\n";
+print F "| \\\n";
+print F "bzip2 -9vc > \$scratchname.bz2 \\\n";
+print F "&& \\\n";
+print F "mv \$scratchname.bz2 $dir/map-gen\$GPIECE-qlt\$QPIECE.sim4db.bz2 \\\n";
+print F "&& \\\n";
+print F "touch $dir/map-gen\$GPIECE-qlt\$QPIECE.success\n";
+close(F);
+
+# One task per (genome piece, query piece) pair.
+my $numJobs = $gen * $qry;
+
+# The submit command is only printed, not run.
+print STDOUT "qsub -pe thread 2 -t 1-$numJobs -p -50 -j y -o $dir/map-\\\$TASK_ID $dir/run.sh\n";
+
+
+
+
+
+# countSequences(dir) -- return (number of genome pieces, number of query
+# pieces) found in dir/gen and dir/qry.  For every fasta file listed, leaff
+# is run on it if the file "<name>idx" does not exist.
+sub countSequences {
+  my ($dir)       = @_;
+  my ($gen, $qry) = (0, 0);
+
+  open(F, "ls $dir/gen/gen-*.fasta $dir/qry/qry-*.fasta |");
+  while (my $fasta = <F>) {
+    chomp $fasta;
+
+    system("$bin/leaff -F $fasta") unless (-e "${fasta}idx");
+
+    if ($fasta =~ m/\/gen-\d\d\d.fasta$/) {
+      $gen++;
+    } elsif ($fasta =~ m/\/qry-\d\d\d.fasta$/) {
+      $qry++;
+    } else {
+      print STDERR "ERROR: Unknown file '$fasta'\n";
+    }
+  }
+  close(F);
+
+  return($gen, $qry);
+}
diff --git a/snapper/snapper2.C b/snapper/snapper2.C
new file mode 100644
index 0000000..3b51a56
--- /dev/null
+++ b/snapper/snapper2.C
@@ -0,0 +1,490 @@
+#include "snapper2.H"
+
+
+// The (private) structure for testing various filters.
+//
+// One candidate filter and its running confusion-matrix counts.
+struct filterStats {
+ // The three filter parameters; passed to configureFilter() in this order.
+ double L;
+ double H;
+ double V;
+ // Hit counts accumulated in writerThread().
+ uint32 tp; // kept by the filter, AHIT_VERIFIED set
+ uint32 tn; // discarded by the filter, AHIT_VERIFIED not set
+ uint32 fp; // kept by the filter, AHIT_VERIFIED not set
+ uint32 fn; // discarded by the filter, AHIT_VERIFIED set
+};
+
+
+
+
+// Shared data
+//
+configuration config;
+sim4parameters sim4params;
+seqCache *genome;
+seqStream *genomeMap;
+seqCache *qsFASTA; // query sequences; read by loaderThread()
+existDB *maskDB;
+existDB *onlyDB;
+positionDB *positions;
+volatile uint32 numberOfQueries;
+
+// resultFILE is handed to write() in writerThread(), so it is a file
+// descriptor.  NOTE(review): logmsgFILE is presumably one too -- confirm.
+int resultFILE;
+int logmsgFILE;
+
+// Filters under validation: theFilters has room for maxFilters entries, of
+// which the first numFilters are in use.
+uint32 numFilters;
+uint32 maxFilters;
+filterStats *theFilters;
+
+
+// Write the current filter validation statistics to the file 'name',
+// replacing any previous contents.
+//
+// The file has one header line, then one line per filter giving L, H, V,
+// the sensitivity tp/(tp+fn), the specificity tn/(tn+fp), and the four raw
+// counts.  A ratio whose denominator is zero is reported as 0.
+//
+// If the file cannot be opened a warning is printed and nothing is written;
+// this is not fatal.
+//
+void
+writeValidationFile(char *name) {
+
+  FILE *F = fopen(name, "wb");
+
+  if (F == 0L) {
+    fprintf(stderr, "WARNING: Couldn't open validation file '%s' for writing: %s\n",
+            name, strerror(errno));
+    return;
+  }
+
+  fprintf(F, "%6s %6s %6s %6s %6s %8s %8s %8s %8s\n",
+          "L", "H", "V",
+          "sens", "spec",
+          "tp", "fp", "tn", "fn");
+
+  for (uint32 f=0; f<numFilters; f++) {
+    double sens = 0.0;
+    double spec = 0.0;
+
+    if (theFilters[f].tp + theFilters[f].fn > 0)
+      sens = (double)theFilters[f].tp / (theFilters[f].tp + theFilters[f].fn);
+
+    if (theFilters[f].tn + theFilters[f].fp > 0)
+      spec = (double)theFilters[f].tn / (theFilters[f].tn + theFilters[f].fp);
+
+    //  The spaces around uint32FMTW() are required: without them C++11 and
+    //  later read the macro name as a user-defined-literal suffix.
+    //
+    fprintf(F, "%6.4f %6.4f %6.4f %6.4f %6.4f " uint32FMTW(8) " " uint32FMTW(8) " " uint32FMTW(8) " " uint32FMTW(8) "\n",
+            theFilters[f].L,
+            theFilters[f].H,
+            theFilters[f].V,
+            sens, spec,
+            theFilters[f].tp,
+            theFilters[f].fp,
+            theFilters[f].tn,
+            theFilters[f].fn);
+  }
+
+  fclose(F);
+}
+
+
+
+// Loader callback: allocate a query and load the next sequence from
+// qsFASTA into it.  Returns the new query, or null (after deleting the
+// query) if loadSequence() reports failure.  'global' is unused.
+//
+void*
+loaderThread(void *global) {
+  query *next = new query;
+
+  if (next->loadSequence(qsFASTA))
+    return(next);
+
+  delete next;
+  return(0L);
+}
+
+
+
+// Worker callback: search, filter and polish one query, then free the
+// parts of it that are no longer needed.
+//
+//   thread -- this worker's searcherState
+//   thing  -- the query to process
+//
+// 'global' is unused.
+//
+void
+searchThread(void *global, void *thread, void *thing) {
+  searcherState  *worker = (searcherState *)thread;
+  query          *q      = (query *)thing;
+
+  //  Search each enabled strand.
+  //
+  if (config._doForward)   doSearch(worker, q, true);
+  if (config._doReverse)   doSearch(worker, q, false);
+
+  //  Filter what was found.
+  //
+  doFilter(worker, q);
+
+  //  Polish what survived, using whichever polisher is configured.
+  //
+  if (config._polishOptimally)
+    doPolishDP(worker, q);
+  else
+    doPolishS4(worker, q);
+
+  //  The sequence and the per-hit mer lists are not needed any more.
+  //
+  delete q->seq;
+  q->seq = 0L;
+
+  uint32 h = 0;
+  while (h < q->theHitsLen) {
+    delete q->theHits[h]._ML;
+    q->theHits[h]._ML = 0L;
+    h++;
+  }
+
+  //  The hits are kept only for validation, and the log only if there is
+  //  a log file; otherwise free them now.
+  //
+  if (!config._doValidation) {
+    delete [] q->theHits;
+    q->theHitsLen = 0;
+    q->theHits    = 0L;
+  }
+
+  if (!config._logmsgFileName) {
+    delete q->theLog;
+    q->theLog = 0L;
+  }
+}
+
+
+
+// Writer callback: emit the results for one finished query, update the
+// filter validation counts, and delete the query.
+//
+//   thing -- the query to write; deleted before returning
+//
+// 'global' is unused.  A failure to write the output is fatal.
+//
+void
+writerThread(void *global, void *thing) {
+  query *qry = (query *)thing;
+
+
+  //  Write the output, if there is any (zero length just means that
+  //  there was no match found).
+  //
+  //  write() may write fewer bytes than asked for, so loop until the
+  //  whole buffer is out, and use its return value (not errno alone) to
+  //  detect failure.
+  //
+  if ((qry->theOutput != 0L) &&
+      (qry->theOutputLen > 0)) {
+    const char  *cursor    = (const char *)qry->theOutput;
+    size_t       remaining = sizeof(char) * qry->theOutputLen;
+
+    while (remaining > 0) {
+      errno = 0;
+      ssize_t written = write(resultFILE, cursor, remaining);
+
+      //  Interrupted before anything was written; just try again.
+      if ((written < 0) && (errno == EINTR))
+        continue;
+
+      if (written <= 0)
+        fprintf(stderr, "Couldn't write to the output file '%s': %s\n",
+                config._outputFileName, strerror(errno)), exit(1);
+
+      cursor    += written;
+      remaining -= written;
+    }
+  }
+
+
+  //  Write the log messages, if any, and if there is a log file
+  //
+  if ((logmsgFILE) && (qry->theLog))
+    qry->theLog->write(logmsgFILE, config._logmsgFileName);
+
+
+  //  If we are supposed to be doing validation, test a bunch of
+  //  filters here.
+  //
+  if (config._doValidation &&
+      (qry->theHitsLen > 0)) {
+    for (uint32 f=0; f<numFilters; f++) {
+      uint32 cutL = configureFilter(theFilters[f].L,
+                                    theFilters[f].H,
+                                    theFilters[f].V,
+                                    qry->theHits,
+                                    qry->theHitsLen);
+
+      for (uint32 a=0; a<qry->theHitsLen; a++) {
+        if (qry->theHits[a]._covered < cutL) {
+          //  These hits would have been discarded by the filter.
+          //
+          if (qry->theHits[a]._status & AHIT_VERIFIED) {
+            //  Oops.  We found a high-quality match.
+            theFilters[f].fn++;
+          } else {
+            //  Good call.  Nothing there!
+            theFilters[f].tn++;
+          }
+        } else {
+          //  These hits would have been kept by the filter.
+          //
+          if (qry->theHits[a]._status & AHIT_VERIFIED) {
+            //  Allright!  Got a high-quality match!
+            theFilters[f].tp++;
+          } else {
+            //  Dang.  Nothing there.
+            theFilters[f].fp++;
+          }
+        }
+      }
+    }
+
+    //  Dump a snapshot of the filter testing for every 50th query.
+    //
+    //  searchThread() deletes qry->seq and sets it to null, so it must
+    //  not be dereferenced blindly here.
+    //  NOTE(review): the fallback assumes the _qsIdx stored in each hit is
+    //  the same number as the query's IID -- confirm.
+    //
+    bool snapshot = false;
+
+    if (qry->seq != 0L)
+      snapshot = ((qry->seq->getIID() % 50) == 0);
+    else
+      snapshot = ((qry->theHits[0]._qsIdx % 50) == 0);
+
+    if (snapshot)
+      writeValidationFile(config._doValidationFileName);
+  }  //  doing validation
+
+  delete qry;
+}
+
+
+
+
+
+int
+main(int argc, char **argv) {
+
+ config.read(argc, argv);
+
+ if (config._beVerbose)
+ fprintf(stderr, "Opening the cDNA sequences.\n");
+
+ qsFASTA = new seqCache(config._qsFileName);
+
+ numberOfQueries = qsFASTA->getNumberOfSequences();
+
+
+ // We can save some time and warn of too short and too long
+ // sequences before the table is built.
+ //
+ {
+ uint32 numTooShortQueries = 0;
+ uint32 numTooLongQueries = 0;
+ uint32 numOKQueries = 0;
+ for (uint32 i=0; i<numberOfQueries; i++) {
+ if (qsFASTA->getSequenceLength(i) < config._discardExonLength)
+ numTooShortQueries++;
+ else if (qsFASTA->getSequenceLength(i) >= (uint64ONE << 22))
+ numTooLongQueries++;
+ else
+ numOKQueries++;
+ }
+ if (numTooShortQueries > 0) {
+ fprintf(stderr, "WARNING:\n");
+ fprintf(stderr, "WARNING: Found "uint32FMT" queries shorter than minimum reportable size (-discardexonlength = "uint32FMT")\n",
+ numTooShortQueries, config._discardExonLength);
+ fprintf(stderr, "WARNING:\n");
+ }
+ if (numTooLongQueries > 0) {
+ fprintf(stderr, "WARNING:\n");
+ fprintf(stderr, "WARNING: Found "uint32FMT" queries longer than maximum size ("uint32FMT")\n",
+ numTooLongQueries, uint32ONE << 22);
+ fprintf(stderr, "WARNING:\n");
+ }
+ if (numOKQueries == 0) {
+ fprintf(stderr, "ERROR: Found no queries in acceptable size range!\n");
+ exit(1);
+ }
+ }
+
+
+ // Allocate some structures for doing a validation run. This is
+ // done pretty early, just in case it needs to abort.
+ //
+ numFilters = 0;
+ maxFilters = 21 * 22 / 2 * 20;
+ theFilters = 0L;
+
+ if (config._doValidation) {
+ theFilters = new filterStats [maxFilters];
+
+ for (uint32 h=0; h<=100; h+=5) {
+ for (uint32 l=0; l<=h; l+=5) {
+ for (uint32 v=5; v<=100; v+=5) {
+ if (numFilters >= maxFilters) {
+ fprintf(stderr, "ERROR: Ran out of filterStats structures while configuring the filters!\n");
+ exit(1);
+ }
+
+ theFilters[numFilters].L = l / 100.0;
+ theFilters[numFilters].H = h / 100.0;
+ theFilters[numFilters].V = v / 100.0;
+ theFilters[numFilters].tp = 0;
+ theFilters[numFilters].tn = 0;
+ theFilters[numFilters].fp = 0;
+ theFilters[numFilters].fn = 0;
+ numFilters++;
+ }
+ }
+ }
+
+ fprintf(stderr, "Created "uint32FMT" filters (out of "uint32FMT" available) to test/validate.\n",
+ numFilters, maxFilters);
+ }
+
+
+ // Read in the positionDB if it's already built, or build a new one.
+ //
+ if ((config._psFileName) && (fileExists(config._psFileName))) {
+ if (config._buildOnly) {
+ fprintf(stderr, "All done. Table '%s' already built.\n", config._psFileName);
+ exit(1);
+ } else {
+ fprintf(stderr, "Loading positionDB state from '%s'\n", config._psFileName);
+ positions = new positionDB(config._psFileName, config._KBmerSize, config._merSkip, 0);
+ }
+ } else {
+
+
+
+ // The masking databases
+ //
+ maskDB = 0L;
+#if 0
+ if (config._maskFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building maskDB from fasta file '%s'\n", config._maskFileName);
+ maskDB = new existDB(config._maskFileName, config._KBmerSize, existDBnoFlags, 0, ~uint32ZERO);
+ }
+ if (config._maskPrefix) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building maskDB from meryl prefix '%s'\n", config._maskPrefix);
+ maskDB = new existDB(config._maskPrefix, config._KBmerSize, existDBnoFlags, config._maskThreshold, ~uint32ZERO);
+ }
+#endif
+
+ onlyDB = 0L;
+#if 0
+ if (config._onlyFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building onlyDB from fasta file '%s'\n", config._onlyFileName);
+ onlyDB = new existDB(config._onlyFileName, config._KBmerSize, existDBnoFlags, 0, ~uint32ZERO);
+ }
+ if (config._onlyPrefix) {
+ if (config._beVerbose)
+ fprintf(stderr, "Building onlyDB from meryl prefix '%s'\n", config._onlyPrefix);
+ onlyDB = new existDB(config._onlyPrefix, config._KBmerSize, existDBnoFlags, 0, config._onlyThreshold);
+ }
+#endif
+
+ if ((config._maskFileName) ||
+ (config._maskPrefix) ||
+ (config._onlyFileName) ||
+ (config._onlyPrefix)) {
+ fprintf(stderr, "maskDB/onlyDB not currently supported.\n");
+ exit(1);
+ }
+
+ merStream *MS = new merStream(new kMerBuilder(config._KBmerSize, config._KBcompression, config._KBspacingTemplate),
+ new seqStream(config._dbFileName),
+ true, true);
+
+ positions = new positionDB(MS,
+ config._KBmerSize,
+ config._merSkip,
+ maskDB,
+ onlyDB,
+ 0L,
+ 0,
+ config._ignoreThreshold,
+ 0,
+ 0,
+ config._beVerbose);
+
+ delete MS;
+
+ delete maskDB;
+ delete onlyDB;
+
+ maskDB = 0L;
+ onlyDB = 0L;
+
+ if (config._psFileName) {
+ if (config._beVerbose)
+ fprintf(stderr, "Dumping positions table to '%s'\n", config._psFileName);
+
+ positions->saveState(config._psFileName);
+
+ if (config._buildOnly)
+ exit(0);
+
+ delete positions;
+ positions = new positionDB(config._psFileName, config._KBmerSize, config._merSkip, 0);
+ }
+ }
+
+
+ // Open and init the genomic sequences.
+ //
+ if (config._beVerbose)
+ fprintf(stderr, "Opening the genomic database.\n");
+
+ genome = new seqCache(config._dbFileName, false);
+ genome->loadAllSequences();
+
+ genomeMap = new seqStream(config._dbFileName);
+
+
+
+ //
+ // Configure sim4
+ //
+ sim4params.setPrintAlignments(config._doAlignments);
+ sim4params.setFindAllExons();
+ sim4params.setMinCoverage(MAX(0.0, config._minMatchCoverage / 100.0 - 0.1));
+ sim4params.setMinPercentExonIdentity(config._minMatchIdentity - 5);
+ sim4params.setIgnorePolyTails(false);
+ //sim4params.setSlideIntrons(false); // see sim4b1.C for why this is disabled
+
+ //sim4params.setWordSize(14);
+ //sim4params.setWordSizeInt(14);
+ //sim4params.setWordSizeExt(14);
+
+ //
+ // Open output files
+ //
+ resultFILE = fileno(stdout);
+ logmsgFILE = 0;
+
+ if (config._outputFileName) {
+ errno = 0;
+ resultFILE = open(config._outputFileName,
+ O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+ if (errno)
+ fprintf(stderr, "Couldn't open the output file '%s': %s\n", config._outputFileName, strerror(errno)), exit(1);
+ }
+
+ if (config._logmsgFileName) {
+ errno = 0;
+ logmsgFILE = open(config._logmsgFileName,
+ O_WRONLY | O_LARGEFILE | O_CREAT | O_TRUNC,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
+ if (errno)
+ fprintf(stderr, "Couldn't open the log message file '%s': %s\n", config._logmsgFileName, strerror(errno)), exit(1);
+ }
+
+ //
+ // Initialize threads
+ //
+
+ sweatShop *ss = new sweatShop(loaderThread,
+ searchThread,
+ writerThread);
+
+ ss->setNumberOfWorkers(config._numSearchThreads);
+
+ ss->setWriterQueueSize(16384);
+
+ for (uint32 i=0; i<config._numSearchThreads; i++)
+ ss->setThreadData(i, new searcherState(i));
+
+ ss->run(0L, config._beVerbose);
+
+ delete ss;
+
+ if (resultFILE != fileno(stdout))
+ close(resultFILE);
+
+ if (logmsgFILE != 0)
+ close(logmsgFILE);
+
+
+ // Summarize the filter test results
+ //
+ if (config._doValidation)
+ writeValidationFile(config._doValidationFileName);
+
+
+ // Clean up
+ //
+ delete genome;
+ delete genomeMap;
+
+ if (config._doValidation)
+ delete [] theFilters;
+
+ delete qsFASTA;
+
+ delete maskDB;
+ delete onlyDB;
+
+ delete positions;
+
+ return(0);
+}
+
diff --git a/snapper/snapper2.H b/snapper/snapper2.H
new file mode 100644
index 0000000..697c2a6
--- /dev/null
+++ b/snapper/snapper2.H
@@ -0,0 +1,468 @@
+#include <pthread.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <math.h>
+
+#include <new>
+
+#include "bio++.H"
+
+#include "positionDB.H"
+#include "existDB.H"
+#include "sim4.H"
+
+#include "logMsg.H"
+
+
+#define MAX_THREADS 64
+
+
+// A meta-option. Enable all the good stuff. Or not, I guess.
+//
+#if 0
+
+// Define this to print a message for each search, showing times and
+// stats, unless it finished quicker than some minimum time.
+//
+#define VERBOSE_SEARCH
+#define VERBOSE_SEARCH_MINIMUM_TIME 1.0
+
+// Define this to print the number of hits (raw and filtered) for each
+// sequence, unless it has fewer than some minimum number of raw hits.
+//
+#define VERBOSE_FILTER
+#define VERBOSE_FILTER_MINIMUM 10
+
+// Define this to show the hitMatrix
+#define SHOW_HITMATRIX
+
+// Define this to print a message whenever a polish starts.
+//
+#define SHOW_POLISHING
+
+// Define these to show polishes that take a long time -- individual
+// polishes, not all polishes for a single sequence. The time is in
+// seconds.
+//
+#define SHOW_POLISHING_EXPENSIVE 0.5
+
+// Define this to show the exon discarding and match splitting.
+//
+#define SHOW_MATCH_SPLITTING
+
+// Define this to show the "hit discarding" results. Any hits
+// that look like they are repeats are re-searched using nearly
+// unique mers.
+//
+#define SHOW_HIT_DISCARDING
+
+// Define this to show the kmers being added to the sim4command from
+// thr-polish.C. This generates a lot of output!
+//
+#define SHOW_HITS_ADDED
+//#define SHOW_HITS_ADDED_AFTER_QUERY 0
+
+// END OF GOOD STUFF!
+#endif
+
+
+
+// Run-time settings for snapper2. Every member is public; the rest of
+// the program reads them through the global 'config' object.
+//
+class configuration {
+public:
+ configuration();
+ ~configuration();
+
+ // Fills in the members below from the arguments given.
+ // NOTE(review): presumably parses the command line; the implementation
+ // is not in this header -- confirm.
+ void read(int argc, char **argv);
+
+ // Convert t, a time in seconds, into a timespec: whole seconds go to
+ // tv_sec, the fractional remainder goes to tv_nsec as nanoseconds.
+ void setTime(struct timespec *ts, double t) {
+ ts->tv_sec = (time_t)floor(t);
+ ts->tv_nsec = (long)((t - ts->tv_sec) * 1e9);
+ };
+
+public:
+ bool _beVerbose;
+
+ // These are private to the kMerBuilder -- in particular, we cannot
+ // use _merSize as a surrogate for the length of the seed,
+ // compressed seeds can be much longer.
+ //
+ uint32 _KBmerSize;
+ uint32 _KBcompression;
+ char *_KBspacingTemplate;
+
+ uint32 _merSkip;
+
+ // Number of search worker threads started by main().
+ uint32 _numSearchThreads;
+
+ bool _doReverse;
+ bool _doForward;
+ bool _doValidation;
+ char *_doValidationFileName;
+
+ // If false, alignment strings are dropped from the polished output.
+ bool _doAlignments;
+
+ // Passed to configureFilter() as L, H and V.
+ double _Lo;
+ double _Hi;
+ double _Va;
+
+ uint32 _maxDiagonal;
+
+ double _extendWeight;
+ uint32 _extendMinimum;
+ uint32 _extendMaximum;
+
+ uint32 _repeatThreshold;
+
+ // Minimums for hits
+ double _minHitCoverage;
+ uint32 _minHitLength;
+
+ // Minimums for matches
+ uint32 _minMatchIdentity;
+ uint32 _minMatchCoverage;
+
+ // Filtering of hits (the "autofilter" used while polishing)
+ bool _afEnabled;
+ double _afThreshold;
+ uint32 _afLength;
+ uint32 _afInit;
+
+ // Filtering and/or cleanup of matches
+ uint32 _discardExonLength;
+ uint32 _discardExonQuality;
+ bool _splitMatches;
+ bool _polishOptimally;
+
+ // _dbFileName is the genomic sequence file; _psFileName is where the
+ // positionDB state is saved to and loaded from.
+ // NOTE(review): _qsFileName is presumably the query sequence file -- confirm.
+ char *_dbFileName;
+ char *_psFileName;
+ char *_qsFileName;
+
+ char *_maskFileName;
+ char *_onlyFileName;
+
+ // If set, main() exits once the positionDB has been built and saved.
+ bool _buildOnly;
+
+ uint32 _ignoreThreshold;
+ char *_maskPrefix;
+ uint32 _maskThreshold;
+ char *_onlyPrefix;
+ uint32 _onlyThreshold;
+
+ // Result and log message files opened by main(); results go to stdout
+ // when no output file name is given.
+ char *_outputFileName;
+ char *_logmsgFileName;
+ char *_statsFileName;
+};
+
+
+
+
+
+
+// Shared data
+//
+extern configuration config;
+extern sim4parameters sim4params;
+
+extern seqCache *genome;
+extern seqStream *genomeMap;
+
+extern seqCache *qsFASTA; // Used exclusively by thr-loader.C
+
+extern existDB *maskDB; // thr-search.C
+extern existDB *onlyDB; // thr-search.C
+
+extern positionDB *positions;
+
+extern volatile uint32 numberOfQueries;
+
+
+
+// aHit -- storing the internal hits
+//
+// _status
+// & 0x00000001 -- direction, forward if set, otherwise reverse
+// 0x00000002 -- not filtered, if set, polish the hit
+// 0x00000004 -- not filtered, if set, polish the hit because it has something unique-ish
+// 0x00000008 -- match verified via polishing
+// 0x00000010 -- match discarded via hit refinement
+// 0x0000ffe0 -- unused
+// 0x00ff0000 -- percent identity of match
+// 0xff000000 -- percent coverage of match
+//
+#define AHIT_DIRECTION_MASK 0x00000001
+#define AHIT_POLISHABLE 0x00000002
+#define AHIT_HAS_UNIQUE 0x00000004
+#define AHIT_VERIFIED 0x00000008
+#define AHIT_DISCARDED 0x00000010
+
+
+// One candidate region of a genomic sequence that a query hit.
+//
+struct aHit {
+ // Bit flags (the AHIT_* masks); after polishing, the percent identity
+ // and percent coverage are OR'd into bits 16-23 and 24-31.
+ uint32 _status;
+ // NOTE(review): presumably the index of the query sequence -- confirm.
+ uint32 _qsIdx;
+ // Index of the genomic sequence, as passed to genome->getSequenceInCore().
+ uint32 _dsIdx;
+ // Low and high ends of the region within that genomic sequence.
+ uint32 _dsLo;
+ uint32 _dsHi;
+ // Coverage measure; the filters rank and cut hits on this value.
+ uint32 _covered;
+ uint32 _matched;
+ // The same for every hit of a query (see the note in configureFilter()).
+ uint32 _numMers;
+ merList *_ML;
+};
+
+
+
+// One query sequence together with the data collected for it: the
+// list of hits, an optional log, and the text of the output.
+//
+// The object owns everything it points to; the destructor releases the
+// sequence, the hits, the log and the output buffer.
+//
+class query {
+public:
+  query() {
+    seq          = 0L;
+
+    //  Start with room for a few hits.
+    theHitsLen   = 0;
+    theHitsMax   = 4;
+    theHits      = new aHit [theHitsMax];
+
+    //  A log is kept only when a log message file was requested.
+    theLog       = 0L;
+    if (config._logmsgFileName)
+      theLog = new logMsg(true);
+
+    theOutputLen = 0;
+    theOutputMax = 0;
+    theOutput    = 0L;
+  };
+
+  ~query() {
+    delete    seq;
+    delete [] theHits;
+    delete    theLog;     //  A single object from scalar new -- must not be released with delete [].
+    delete [] theOutput;
+  };
+
+  //  Fetch the next sequence from qs into 'seq'.  Returns true if a
+  //  sequence was obtained, false if getSequenceInCore() returned null.
+  bool   loadSequence(seqCache *qs) {
+    seq = qs->getSequenceInCore();
+    return(seq != 0L);
+  };
+
+  seqInCore  *seq;
+
+  uint32      theHitsLen;     //  Hits in use
+  uint32      theHitsMax;     //  Hits allocated
+  aHit       *theHits;
+
+  logMsg     *theLog;         //  Null unless config._logmsgFileName is set
+
+  uint32      theOutputLen;   //  Characters in use
+  uint32      theOutputMax;   //  Characters allocated
+  char       *theOutput;
+};
+
+
+
+// Scratch state private to one search thread; main() creates one of
+// these per worker and hands it to the sweatShop.
+//
+// The destructor releases 'posn' and 'KB'.  'DP' is an untyped pointer
+// and is not released here.
+//
+class searcherState {
+public:
+  uint64        threadID;
+
+  uint64        posnMax;
+  uint64        posnLen;
+  uint64       *posn;
+
+  kMerBuilder  *KB;
+  void         *DP;
+
+  //  Everything except the thread id starts out empty.
+  searcherState(uint64 U) :
+    threadID(U),
+    posnMax(0),
+    posnLen(0),
+    posn(0L),
+    KB(0L),
+    DP(0L) {
+  };
+
+  ~searcherState() {
+    delete [] posn;
+    delete    KB;
+  };
+};
+
+
+
+
+// One seed hit packed into 64 bits. The anonymous union overlays the
+// three bit fields with 'all', so a hit can also be handled as a single
+// uint64.
+// NOTE(review): where each field lands inside 'all' depends on the
+// compiler's bit-field layout -- confirm before relying on the ordering.
+//
+struct diagonalLine {
+ union {
+ uint64 all;
+ struct {
+ uint64 uniq : 10; // uniqueness score for the mer here
+ uint64 qPos : 22; // position in the query, 0 to 4M
+ uint64 dPos : 32; // position in the genome, 0 to 4G
+ } val;
+ };
+};
+
+
+
+// One node of the singly linked list of matched regions kept by
+// hitMatrix.  A node records the query and database ranges of the
+// region plus the two mer structures handed to the constructor; the
+// link to the next node starts out null.
+//
+class trapMatch {
+public:
+  uint32        _unique;
+  uint32        _qsLo;
+  uint32        _qsHi;
+  uint32        _dsLo;
+  uint32        _dsHi;
+
+  merCovering  *_IL;
+  merList      *_ML;
+
+  trapMatch    *_next;
+
+  trapMatch(uint32        isunique,
+            uint32        qsLo,
+            uint32        qsHi,
+            uint32        dsLo,
+            uint32        dsHi,
+            merCovering  *IL,
+            merList      *ML) :
+    _unique(isunique),
+    _qsLo(qsLo),
+    _qsHi(qsHi),
+    _dsLo(dsLo),
+    _dsHi(dsHi),
+    _IL(IL),
+    _ML(ML),
+    _next(0L) {
+  };
+};
+
+
+
+
+// Collects the raw seed hits for one query and turns them into aHit
+// records. Hits are appended with addHits(); filter() writes the
+// resulting aHit entries into a caller-supplied array.
+//
+class hitMatrix {
+public:
+ hitMatrix(uint32 qsLen, uint32 q, uint32 qsIdx, logMsg *theLog);
+ ~hitMatrix();
+
+ // Append cn hits at query position qi; ps holds the cn database
+ // positions. A non-zero ad replaces cn as the uniqueness score.
+ void addHits(uint32 qi,
+ uint64 *ps,
+ uint64 cn,
+ uint64 ad=0);
+
+ void sort_diagonal(void);
+ void sort_dsPos(void);
+
+ // theOutput, theOutputPos and theOutputMax are passed by reference.
+ // NOTE(review): presumably so the array can be grown and its length
+ // updated -- confirm against the implementation.
+ void filter(char direction,
+ double minHitCoverage,
+ uint32 minHitLength,
+ aHit *&theOutput,
+ uint32 &theOutputPos,
+ uint32 &theOutputMax);
+private:
+ uint32 _qsLen; // Seq Len of Q
+ uint32 _qsMers; // Valid mers in Q
+ uint32 _qsIdx; // Index of Q in the FastA
+
+ // Instead of building the lines during add(), we store
+ // the information used to build lines, and then build them
+ // in chain(). This was done to reduce simultaneous memory
+ // usage, as the lineArrayMap and etc take up considerable space.
+ //
+ uint32 _hitsLen;
+ uint32 _hitsMax;
+ diagonalLine *_hits;
+
+ logMsg *_theLog;
+
+ // Making sense of the raw output from the search is not a trivial
+ // task for perl. SMALL searches (dbEST vs 0.5MB sequence) used more
+ // than 4GB of memory in perl.
+ //
+ // So, we bite the bullet and do it here.
+ //
+ // _matches is a sorted linked list of the regions we have found.
+ // The list is kept in REVERSE order, as we usually add regions
+ // in the correct order (correct reverse order), occasionally
+ // we need to swap the last two.
+ //
+ // The list is deleted in filter()
+ //
+ trapMatch *_matches;
+
+ void addMatch(uint32 isunique,
+ uint32 qsLo,
+ uint32 qsHi,
+ uint32 dsLo,
+ uint32 dsHi,
+ merCovering *IL,
+ merList *ML);
+
+};
+
+
+// Append cn hits, all at query position qi, to the list of hits.
+//
+//   qi -- position in the query (stored in a 22-bit field)
+//   ps -- array of cn database positions (each stored in a 32-bit field)
+//   cn -- number of entries in ps
+//   ad -- if non-zero, used in place of cn as the uniqueness score
+//
+// The uniqueness score is saturated at 0x3ff, the largest value its
+// 10-bit field can hold.  The hit array is grown when needed; if that
+// allocation fails a message is printed and the process exits.
+//
+inline
+void
+hitMatrix::addHits(uint32  qi,
+                   uint64 *ps,
+                   uint64  cn,
+                   uint64  ad) {
+
+  if ((_hitsLen + cn) >= _hitsMax) {
+    _hitsMax = _hitsMax + _hitsMax + (uint32)cn;
+
+    diagonalLine *h = 0L;
+    try {
+      h = new diagonalLine [_hitsMax];
+    } catch (const std::bad_alloc &) {
+      fprintf(stderr, "hitMatrix::addHits()-- caught std::bad_alloc in %s at line %d\n", __FILE__, __LINE__);
+      fprintf(stderr, "hitMatrix::addHits()-- have "uint32FMT" hits, tried to add "uint64FMT" more\n", _hitsLen, cn);
+      exit(1);
+    }
+
+    //  Copy whole elements, and skip the copy when there is nothing
+    //  to move (the old array may not exist yet).
+    if (_hitsLen > 0)
+      memcpy(h, _hits, sizeof(diagonalLine) * _hitsLen);
+
+    delete [] _hits;
+    _hits = h;
+  }
+
+  //  Uniqueness score: the count, unless overridden, clamped to 10 bits.
+  uint64 uniq = cn;
+  if (ad > 0)
+    uniq = ad;
+  if (uniq > 0x000003ff)
+    uniq = 0x000003ff;
+
+  for (uint64 i=0; i<cn; i++) {
+    _hits[_hitsLen].val.uniq = uniq;
+    _hits[_hitsLen].val.qPos = qi;
+    _hits[_hitsLen].val.dPos = ps[i];
+    _hitsLen++;
+  }
+}
+
+
+
+void
+doSearch(searcherState *state,
+ query *qry,
+ bool rc);
+
+void
+doFilter(searcherState *state,
+ query *qry);
+
+void
+doPolishS4(searcherState *state,
+ query *qry);
+
+void
+doPolishDP(searcherState *state,
+ query *qry);
+
+
+uint32 configureFilter(double L,
+ double H,
+ double V,
+ aHit *theHits,
+ uint32 theHitsLen);
diff --git a/snapper/test/Makefile b/snapper/test/Makefile
new file mode 100644
index 0000000..96a3996
--- /dev/null
+++ b/snapper/test/Makefile
@@ -0,0 +1,21 @@
+
+
+
+# Smoke test for snapper2. Three sequences (1.f, 2.f, 3.f) are made with
+# leaff, then joined without their '>' lines into a genome file g.f
+# holding g1 = 1+2+3, g2 = 3+2+1 and g3 = 1+2+3. The middle piece, 2.f,
+# is then mapped against g.f and the output is printed so the ranges can
+# be checked by eye. All temporary files are removed at the end.
+# NOTE(review): 'leaff -G' presumably generates random sequence of the
+# given length range -- confirm against README.leaff.
+all:
+ @../../leaff/leaff -G 1 2000 2000 > 1.f
+ @../../leaff/leaff -G 1 700 700 > 2.f
+ @../../leaff/leaff -G 1 3000 3000 > 3.f
+ @echo ">g1" > g.f
+ @cat 1.f 2.f 3.f | grep -v '>' >> g.f
+ @echo ">g2" >> g.f
+ @cat 3.f 2.f 1.f | grep -v '>' >> g.f
+ @echo ">g3" >> g.f
+ @cat 1.f 2.f 3.f | grep -v '>' >> g.f
+ @rm -f 1.f 3.f
+ ../snapper2 -mersize 23 -queries 2.f -genomic g.f -output x.out
+ @cat x.out
+ @echo ""
+ @echo "Check that the ranges are 1900-2800, 2900-3800 and 1900-2800"
+ @echo "Check that ddefs change"
+ @echo "Check that exons are the same"
+ @rm -f 1.f* 2.f* 3.f* g.f* x.tmp* x.out*
diff --git a/snapper/thr-filter.C b/snapper/thr-filter.C
new file mode 100644
index 0000000..ed21f6b
--- /dev/null
+++ b/snapper/thr-filter.C
@@ -0,0 +1,126 @@
+#include "snapper2.H"
+
+
+
+// Compute the coverage cutoff for a set of hits.
+//
+// The spread between the best and worst '_covered' value, as a fraction
+// h of _numMers (capped at 1.0), selects a scaling factor p:
+//   h <= L      -- p is 1.0
+//   h >= H      -- p is V
+//   otherwise   -- p falls linearly from 1.0 at L to V at H
+//
+// Returns floor(best - p * h * _numMers).  The caller treats hits with
+// coverage at or above this value as worth polishing.
+//
+// theHits[0] is read unconditionally, so the caller must supply at
+// least one hit.
+//
+uint32
+configureFilter(double L,
+                double H,
+                double V,
+                aHit  *theHits,
+                uint32 theHitsLen) {
+
+  //  Best and worst coverage over all the hits.
+  //
+  uint32  best  = theHits[0]._covered;
+  uint32  worst = theHits[0]._covered;
+
+  for (uint32 k=1; k<theHitsLen; k++) {
+    uint32  cov = theHits[k]._covered;
+
+    if (cov > best)
+      best = cov;
+    if (cov < worst)
+      worst = cov;
+  }
+
+  //  _numMers is not the number covered, so the ratio can exceed one;
+  //  cap it.  _numMers is the same for every hit, so the first will do.
+  //
+  double h = (double)(best - worst) / (double)theHits[0]._numMers;
+  if (h > 1.0)
+    h = 1.0;
+
+  double p = (h <= L) ? 1.0 :
+             (h >= H) ? V   :
+                        1.0 - (1.0 - V) * (h - L) / (H - L);
+
+  //  Neither of these should happen; complain and clamp if they do.
+  //
+  if (p > 1.0) {
+    fprintf(stderr, "error in p; p=%f > 1.0!  h=%f (L=%f H=%f V=%f)\n", p, h, L, H, V);
+    p = 1.0;
+  }
+
+  if (V - p > 1e-10) {
+    fprintf(stderr, "error in p; p=%f < V!  h=%f (L=%f H=%f V=%f)\n", p, h, L, H, V);
+    p = V;
+  }
+
+  return((uint32)floor(best - p * h * theHits[0]._numMers));
+}
+
+
+
+// qsort() comparison function for aHit records.
+//
+// Discarded hits sort after all others; two discarded hits compare
+// equal.  The remaining hits are ordered by decreasing '_covered'.
+//
+// The result is consistent in both directions -- cmp(a,b) and cmp(b,a)
+// never both claim "greater" -- as qsort() requires.
+//
+int
+aHitAutoFilterSort(const void *a, const void *b) {
+  const aHit *A = (const aHit *)a;
+  const aHit *B = (const aHit *)b;
+
+  bool  aDiscarded = (A->_status & AHIT_DISCARDED) != 0;
+  bool  bDiscarded = (B->_status & AHIT_DISCARDED) != 0;
+
+  //  Exactly one discarded:  it goes to the end of the array.
+  if (aDiscarded != bDiscarded)
+    return((aDiscarded) ? 1 : -1);
+
+  //  Both discarded:  we don't care about the order.
+  if (aDiscarded)
+    return(0);
+
+  //  Otherwise, snapper filters simply on coverage, best first.
+  //
+  if (A->_covered > B->_covered)
+    return(-1);
+  if (A->_covered < B->_covered)
+    return(1);
+  return(0);
+}
+
+
+
+// Decide which hits of a query get polished by setting
+// AHIT_POLISHABLE in their status.
+//
+// With the autofilter enabled, the hits are sorted (best coverage
+// first, discarded hits last) and all are marked; the polisher then
+// stops on its own once its success rate drops.  Otherwise a coverage
+// cutoff is computed with configureFilter() and every non-discarded
+// hit at or above it is marked.
+//
+// 'state' is not used.  Nothing is done for a query with no hits.
+//
+void
+doFilter(searcherState *state,
+         query         *qry) {
+
+  if (qry->theHitsLen == 0)
+    return;
+
+  uint32 numF = 0;
+
+  //  Auto filter -- keep polishing until a running average of
+  //  polishes falls below some threshold.
+  //
+  if (config._afEnabled) {
+    qsort(qry->theHits, qry->theHitsLen, sizeof(aHit), aHitAutoFilterSort);
+
+    for (uint32 i=0; i < qry->theHitsLen; i++)
+      qry->theHits[i]._status |= AHIT_POLISHABLE;
+
+    numF = qry->theHitsLen;
+
+  } else {
+    uint32 cutL = configureFilter(config._Lo,
+                                  config._Hi,
+                                  config._Va, qry->theHits, qry->theHitsLen);
+
+    //  If the coverage of the hit is more than the minimum, mark the
+    //  hit as polishable.  Unless the hit was discarded.
+
+    for (uint32 i=0; i < qry->theHitsLen; i++) {
+      if (!(qry->theHits[i]._status & AHIT_DISCARDED) &&
+          (qry->theHits[i]._covered >= cutL)) {
+        qry->theHits[i]._status |= AHIT_POLISHABLE;
+        numF++;
+      }
+    }
+  }
+
+#ifdef VERBOSE_FILTER
+  //  The log belongs to the query and exists only when a log file was
+  //  requested; the query is identified by the IID of its sequence.
+  if ((qry->theLog != 0L) &&
+      (qry->theHitsLen >= VERBOSE_FILTER_MINIMUM))
+    qry->theLog->add("Query "uint32FMT" with "uint32FMT" good hits out of "uint32FMT" total hits.\n",
+                     qry->seq->getIID(), numF, qry->theHitsLen);
+#endif
+}
diff --git a/snapper/thr-polish-dp.C b/snapper/thr-polish-dp.C
new file mode 100644
index 0000000..ff2538e
--- /dev/null
+++ b/snapper/thr-polish-dp.C
@@ -0,0 +1,489 @@
+#include "snapper2.H"
+
+
+#define MATCH 0
+#define GAPA 1
+#define GAPB 2
+#define STOP 3
+
+#define MATCHSCORE 2
+#define GAPSCORE -3
+#define MISMATCHSCORE -1
+
+
+// Reverse, in place, the first len characters of both a and b.  The
+// two strings are walked in lock step, so they must both hold at
+// least len characters.
+//
+void
+reverse(char *a, char *b, int len) {
+  for (int lo=0, hi=len-1; lo < hi; lo++, hi--) {
+    char t;
+
+    t     = a[lo];
+    a[lo] = a[hi];
+    a[hi] = t;
+
+    t     = b[lo];
+    b[lo] = b[hi];
+    b[hi] = t;
+  }
+}
+
+
+
+// Result of one dpMatrix::dpAlign() call.
+//
+//   matches, alignLen -- number of identical columns, and total columns
+//   begI, endI        -- extent of the alignment in string A
+//   begJ, endJ        -- extent of the alignment in string B
+//   lenA, lenB        -- lengths of the two input strings
+//   alignA, alignB    -- the aligned strings; dpAlign() points these at
+//                        buffers owned by the dpMatrix, so they are NOT
+//                        owned by this object
+//
+class dpMatch {
+public:
+  dpMatch() {
+    matches  = 0;
+    alignLen = 0;
+
+    begI = begJ = 0;
+    endI = endJ = 0;
+    lenA = lenB = 0;
+
+    //  Null until dpAlign() fills them in, so an unused result never
+    //  carries indeterminate pointers.
+    alignA = 0L;
+    alignB = 0L;
+  };
+
+  int   matches;
+  int   alignLen;
+
+  int   begI, begJ;
+  int   endI, endJ;
+  int   lenA, lenB;
+
+  char *alignA;
+  char *alignB;
+};
+
+
+// The dynamic programming matrix used by dpAlign(), together with the
+// two buffers that receive the aligned strings.  Storage is kept from
+// one alignment to the next and is only ever grown.
+//
+// The matrix is stored row-major with bMax cells per row; cell (a,b)
+// corresponds to the first a characters of string A and the first b
+// characters of string B.
+//
+class dpMatrix {
+private:
+  //  One cell: the score so far, and how it was reached (MATCH, GAPA,
+  //  GAPB or STOP).
+  typedef struct {
+    unsigned int  score  : 30;
+    unsigned int  action : 2;
+  } dpCell;
+
+  //  Offset of cell (a,b).  Done in size_t so that a large matrix does
+  //  not overflow int arithmetic.
+  size_t  cellIndex(int a, int b) const {
+    return((size_t)a * (size_t)bMax + (size_t)b);
+  };
+
+public:
+  dpMatrix() {
+    aMax   = 0;
+    bMax   = 0;
+
+    alignA = 0L;
+    alignB = 0L;
+    matrix = 0L;
+  };
+
+  ~dpMatrix() {
+    delete [] alignA;
+    delete [] alignB;
+    delete [] matrix;
+  };
+
+  //  Make sure there is room to align strings of length lenA and lenB,
+  //  then reset the first column and first row to the base score with
+  //  the STOP action.
+  void   dpMatrixInit(int lenA, int lenB) {
+
+    if ((aMax <= lenA) || (bMax <= lenB)) {
+      delete [] alignA;
+      delete [] alignB;
+      delete [] matrix;
+
+      aMax = MAX(aMax, lenA) + 1000;
+      bMax = MAX(bMax, lenB) + 1000;
+
+      fprintf(stderr, "dpMatrix-- reallocate to "uint32FMT" x "uint32FMT"\n", aMax, bMax);
+
+      //  An alignment can have as many as lenA + lenB columns, and both
+      //  strings are written for every column, so BOTH buffers need
+      //  aMax + bMax characters plus the terminator.
+      alignA = new char   [aMax + bMax + 1];
+      alignB = new char   [aMax + bMax + 1];
+      matrix = new dpCell [(size_t)aMax * (size_t)bMax];
+    }
+
+    for (int i=0; i<lenA+1; i++) {
+      size_t  p = cellIndex(i, 0);
+
+      matrix[p].score  = 1 << 29;
+      matrix[p].action = STOP;
+    }
+
+    for (int j=0; j<lenB+1; j++) {
+      size_t  p = cellIndex(0, j);
+
+      matrix[p].score  = 1 << 29;
+      matrix[p].action = STOP;
+    }
+  };
+
+  int    dpMatrixCellGetScore(int a, int b) {
+    return(matrix[cellIndex(a, b)].score);
+  };
+
+  int    dpMatrixCellGetAction(int a, int b) {
+    return(matrix[cellIndex(a, b)].action);
+  };
+
+  void   dpMatrixCellSet(int a, int b, int score, int action) {
+    dpCell x;
+    x.score  = score;
+    x.action = action;
+    matrix[cellIndex(a, b)] = x;
+  };
+
+  dpMatch *dpAlign(char *stringA, int lenA,
+                   char *stringB, int lenB,
+                   dpMatch *match);
+private:
+  int      aMax;      //  Rows allocated
+  int      bMax;      //  Columns allocated (cells per row)
+
+  char    *alignA;
+  char    *alignB;
+  dpCell  *matrix;
+};
+
+
+// Align stringA (lenA characters) to stringB (lenB characters) and
+// describe the best-scoring local alignment in *match, which is also
+// the return value.
+//
+// Every cell starts from the base score 1 << 29 with action STOP ("an
+// alignment may begin here"); it is replaced by a diagonal, left or up
+// predecessor only if that scores strictly higher. The traceback
+// starts at the highest scoring cell and follows the actions back to
+// the first STOP.
+//
+// On return, begI/begJ and endI/endJ bound the aligned region in A and
+// B, 'matches' counts identical columns, and alignA/alignB hold the
+// aligned strings ('-' for a gap, matching columns in lower case,
+// mismatching columns in upper case). Those two strings live in
+// buffers owned by this dpMatrix, so the next call overwrites them.
+//
+dpMatch *
+dpMatrix::dpAlign(char *stringA, int lenA,
+ char *stringB, int lenB,
+ dpMatch *match) {
+
+ int i, j;
+
+ dpMatrixInit(lenA, lenB);
+
+ int scoreMax = 0;
+
+ int begI=0, endI=0, curI=0;
+ int begJ=0, endJ=0, curJ=0;
+
+ for (i=1; i<=lenA; i++){
+ for (j=1; j<=lenB; j++){
+
+ // Pick the max of these
+
+ int ul = dpMatrixCellGetScore(i-1, j-1) + ((stringA[i-1] == stringB[j-1]) ? MATCHSCORE : MISMATCHSCORE);
+ int lf = dpMatrixCellGetScore(i-1, j) + GAPSCORE;
+ int up = dpMatrixCellGetScore(i, j-1) + GAPSCORE;
+
+ // (i,j) is the beginning of a subsequence, our default behavior
+ int sc = 1 << 29;
+ int ac = STOP;
+
+ if (sc < ul) {
+ sc = ul;
+ ac = MATCH;
+ }
+
+ if (sc < lf) {
+ sc = lf;
+ ac = GAPB;
+ }
+
+ if (sc < up) {
+ sc = up;
+ ac = GAPA;
+ }
+
+ dpMatrixCellSet(i, j, sc, ac);
+
+ // Remember the first cell to reach the highest score; the
+ // traceback starts there.
+ if (scoreMax < sc) {
+ scoreMax = sc;
+ endI = curI = i;
+ endJ = curJ = j;
+ }
+ }
+ }
+
+ //fprintf(stdout, "SCORE %d at %d,%d\n", scoreMax - (1 << 29), endI, endJ);
+
+ int alignLen = 0;
+ int matches = 0;
+ int terminate = 0;
+
+ // Trace back from the best cell. The aligned strings are built
+ // back to front and reversed afterwards.
+ while (terminate == 0) {
+ switch (dpMatrixCellGetAction(curI, curJ)) {
+ case STOP:
+ terminate = 1;
+ break;
+ case MATCH:
+ // Diagonal step: consumes one character of each string.
+ alignA[alignLen] = stringA[curI-1];
+ alignB[alignLen] = stringB[curJ-1];
+
+ if (alignA[alignLen] == alignB[alignLen]) {
+ alignA[alignLen] = tolower(alignA[alignLen]);
+ alignB[alignLen] = tolower(alignB[alignLen]);
+ matches++;
+ } else {
+ //fprintf(stdout, "MIS at %d\n", alignLen);
+ alignA[alignLen] = toupper(alignA[alignLen]);
+ alignB[alignLen] = toupper(alignB[alignLen]);
+ }
+
+ curI--;
+ curJ--;
+ alignLen++;
+ break;
+ case GAPA:
+ // Gap in A: consumes one character of B only.
+ //fprintf(stdout, "GAPA at %d\n", alignLen);
+ alignA[alignLen] = '-';
+ alignB[alignLen] = stringB[curJ-1];
+ curJ--;
+ alignLen++;
+ break;
+ case GAPB:
+ // Gap in B: consumes one character of A only.
+ //fprintf(stdout, "GAPB at %d\n", alignLen);
+ alignA[alignLen] = stringA[curI-1];
+ alignB[alignLen] = '-';
+ curI--;
+ alignLen++;
+ break;
+ }
+ }
+
+ begI = curI;
+ begJ = curJ;
+
+ alignA[alignLen] = 0;
+ alignB[alignLen] = 0;
+
+ reverse(alignA, alignB, alignLen);
+
+ match->matches = matches;
+ match->alignLen = alignLen;
+ match->begI = begI;
+ match->begJ = begJ;
+ match->endI = endI;
+ match->endJ = endJ;
+ match->lenA = lenA;
+ match->lenB = lenB;
+
+//warning alignA and alignB aliases to dpMatrix
+ match->alignA = alignA;
+ match->alignB = alignB;
+
+ return(match);
+}
+
+
+
+
+
+
+
+
+// Polish the hits of one query with the dynamic programming aligner
+// and append every match that meets the identity and coverage minimums
+// to qry->theOutput.
+//
+// Discarded hits are skipped, as are hits that are neither polishable
+// nor unique-ish (unless validation is on). With the autofilter
+// enabled, hits without a unique mer are also skipped once the recent
+// success rate falls below config._afThreshold.
+//
+// qry->theOutput is allocated here and released by the query
+// destructor. The dpMatrix is created on first use and kept in
+// state->DP for later calls.
+//
+void
+doPolishDP(searcherState *state,
+ query *qry) {
+
+ // For the autofilter
+ uint64 successes = uint64ZERO;
+ uint64 successMask = uint64MASK(config._afLength);
+ uint32 attempts = 0;
+
+ if (qry->theHitsLen == 0)
+ return;
+
+ qry->theOutputLen = 0;
+ qry->theOutputMax = 2 * 1024 * qry->theHitsLen;
+ qry->theOutput = new char [qry->theOutputMax];
+
+ qry->theOutput[0] = 0;
+
+ // Move these to searcherState!
+
+ // NOTE(review): state->DP is a void pointer that ~searcherState()
+ // does not release, so this matrix appears to live until exit.
+ if (state->DP == 0L)
+ state->DP = new dpMatrix;
+
+ dpMatch match;
+ dpMatrix *matrix = (dpMatrix *)state->DP;
+
+ for (uint32 h=0; h<qry->theHitsLen; h++) {
+
+ // If the hit was discarded, move along.
+ //
+ if (qry->theHits[h]._status & AHIT_DISCARDED)
+ continue;
+
+ // If the hit was filtered out, move along.
+ //
+ if ((config._doValidation == false) &&
+ ((qry->theHits[h]._status & AHIT_POLISHABLE) == 0) &&
+ ((qry->theHits[h]._status & AHIT_HAS_UNIQUE) == 0))
+ continue;
+
+ // If our recent success rate is pretty terrible, continue.
+ //
+ if (config._afEnabled) {
+ if (attempts > config._afInit) {
+ double rat = countNumberOfSetBits64(successes) / (double)((attempts < config._afLength) ? attempts : config._afLength);
+
+ // If we've hit the end of the good polishes, give up. But
+ // still do all the stuff with unique mers in them.
+ //
+ if (((qry->theHits[h]._status & AHIT_HAS_UNIQUE) == 0) &&
+ (rat < config._afThreshold))
+ continue;
+ }
+
+ attempts++;
+ }
+
+ //
+ // Polish it up!
+ //
+
+ seqInCore *QRYseq = qry->seq;
+ seqInCore *GENseq = genome->getSequenceInCore(qry->theHits[h]._dsIdx);
+ uint32 GENlo = qry->theHits[h]._dsLo;
+ uint32 GENhi = qry->theHits[h]._dsHi;
+
+ char *q = QRYseq->sequence();
+ char *g = GENseq->sequence() + GENlo;
+
+ // Keep the region inside the genomic sequence.
+ if (GENhi > GENseq->sequenceLength())
+ GENhi = GENseq->sequenceLength();
+
+ uint32 qlen = qry->seq->sequenceLength();
+ uint32 glen = GENhi - GENlo;
+
+ bool doForward = qry->theHits[h]._status & AHIT_DIRECTION_MASK;
+ bool doReverse = !doForward;
+
+ // A reverse hit is aligned by reverse-complementing the query in
+ // place; it is flipped back right after the alignment.
+ if (doReverse) {
+ reverseComplementSequence(q, qlen);
+ }
+
+#if 0
+ fprintf(stderr, "align QRYlen="uint32FMT" GEN="uint32FMT"-"uint32FMT" GENlen="uint32FMT"\n",
+ qlen, GENlo, GENhi, glen);
+#endif
+
+ //if ((qlen * 3 > glen) && ((qlen / 1024) * (glen / 1024) < 4 * 1024))
+
+ matrix->dpAlign(q, qlen, g, glen, &match);
+
+ if (doReverse) {
+ reverseComplementSequence(q, qlen);
+
+ // Convert the query coordinates back to the original orientation.
+ uint32 x = match.begI;
+ match.begI = qlen - match.endI;
+ match.endI = qlen - x;
+ }
+
+
+ // Build the proper match if it's even remotely good
+ //
+ if (match.matches > 0) {
+ sim4polish p;
+ sim4polishExon e;
+
+ qry->theHits[h]._status |= AHIT_VERIFIED;
+
+ p._estID = QRYseq->getIID();
+ p._estLen = QRYseq->sequenceLength();
+ p._estPolyA = 0;
+ p._estPolyT = 0;
+
+ p._genID = GENseq->getIID();
+ p._genRegionOffset = GENlo;
+ p._genRegionLength = GENhi - GENlo;
+
+ p._numMatches = match.matches;
+ p._numMatchesN = 0;
+ p._numCovered = match.endI - match.begI;
+
+ p._percentIdentity = 0;
+ p._querySeqIdentity = 0;
+
+ p._matchOrientation = (doReverse) ? SIM4_MATCH_COMPLEMENT : SIM4_MATCH_FORWARD;
+ p._strandOrientation = SIM4_STRAND_UNKNOWN;
+
+ p._comment = NULL;
+ p._estDefLine = QRYseq->header();
+ p._genDefLine = GENseq->header();
+
+ // The whole alignment is reported as a single exon. Coordinates
+ // are converted from 0-based begin to 1-based begin here.
+ p._numExons = 1;
+ p._exons = &e;
+
+ e._estFrom = match.begI + 1;
+ e._estTo = match.endI;
+ e._genFrom = match.begJ + GENlo + 1;
+ e._genTo = match.endJ + GENlo;
+ e._numMatches = match.matches;
+ e._numMatchesN = 0;
+ e._percentIdentity = 0;
+ e._intronOrientation = SIM4_INTRON_NONE;
+
+ // The alignments are needed for updateAlignmentScores().
+
+ e._estAlignment = match.alignA; // 'e' DOES NOT own this, must reset the pointer later.
+ e._genAlignment = match.alignB;
+
+ p.s4p_updateAlignmentScores();
+
+ // Since we're not using sim4, the normal method of ignoring aligns doesn't work.
+ // Do it explicitly.
+
+ if (config._doAlignments == false) {
+ e._estAlignment = NULL;
+ e._genAlignment = NULL;
+ }
+
+
+ // Save it if it is truly good.
+ if ((p._percentIdentity >= config._minMatchIdentity) &&
+ (p._querySeqIdentity >= config._minMatchCoverage)) {
+ char *pstr = p.s4p_polishToString(sim4polishStyleDefault);
+
+ uint32 l = (uint32)strlen(pstr);
+
+ // Grow the output buffer if this match (plus terminator) won't fit.
+ if (qry->theOutputLen + l + 1 >= qry->theOutputMax) {
+ qry->theOutputMax = qry->theOutputMax + qry->theOutputMax + l;
+ char *o = 0L;
+ try {
+ o = new char [qry->theOutputMax];
+ } catch (...) {
+ fprintf(stderr, "doPolish()-- Can't reallocate space for the output string ("uint32FMT" bytes) in thread "uint64FMT"\n", qry->theOutputMax, state->threadID);
+ abort();
+ }
+ memcpy(o, qry->theOutput, sizeof(char) * qry->theOutputLen);
+ delete [] qry->theOutput;
+ qry->theOutput = o;
+ }
+
+ memcpy(qry->theOutput + qry->theOutputLen, pstr, sizeof(char) * l);
+ qry->theOutputLen += l;
+
+ qry->theOutput[qry->theOutputLen] = 0;
+
+ delete [] pstr;
+
+ // Save the best scores
+ //
+ uint32 pi = p._percentIdentity;
+ uint32 pc = p._querySeqIdentity;
+
+ qry->theHits[h]._status |= pi << 16;
+ qry->theHits[h]._status |= pc << 24;
+
+ // NOTE(review): this history update sits inside the "truly good"
+ // test above, which already checked the same two thresholds, so
+ // the else branch below looks unreachable and a failed polish
+ // never shifts a zero into 'successes'. Confirm whether the
+ // update was meant to run for every attempted polish.
+ successes <<= 1;
+ if ((pi >= config._minMatchIdentity) &&
+ (pc >= config._minMatchCoverage)) {
+ //fprintf(stderr, "GOOD "uint32FMT" "uint32FMT"\n", pi, pc);
+ successes |= uint64ONE;
+ } else {
+ //fprintf(stderr, "BAD "uint32FMT" "uint32FMT"\n", pi, pc);
+ successes |= uint64ZERO;
+ }
+ successes &= successMask;
+ }
+
+ // Before sim4polish and sim4polishExon go out of scope, reset the pointers. Ugly, but needs
+ // to be done else the destructors try to delete things it doesn't own (alignments) or
+ // allocated on the stack (sim4polishExon).
+ p._estDefLine = 0L;
+ p._genDefLine = 0L;
+ p._exons = 0L;
+ e._estAlignment = 0L;
+ e._genAlignment = 0L;
+ }
+
+ delete GENseq;
+ } // over all hits
+}
+
diff --git a/snapper/thr-polish.C b/snapper/thr-polish.C
new file mode 100644
index 0000000..298fa98
--- /dev/null
+++ b/snapper/thr-polish.C
@@ -0,0 +1,360 @@
+#include "snapper2.H"
+
+
+
+// Polish every usable hit of the query with Sim4 and collect the results.
+//
+// For each hit that is neither discarded nor filtered out, the seeds in the
+// hit's mer list are handed to Sim4.  Every polish that comes back has its
+// short or low-quality exons removed and is then split into single-exon
+// matches.  Matches that reach config._minMatchIdentity and
+// config._minMatchCoverage are appended, as text, to qry->theOutput and the
+// hit is flagged AHIT_VERIFIED.  The best identity/coverage pair seen is
+// OR-ed into the hit status (shifted left by 16 and 24 bits).
+//
+//   state -- per-thread state; only threadID is read, for an error message.
+//   qry   -- the query; theOutput, theOutputLen and theOutputMax are reset
+//            here, and the _ML of every polished hit is deleted.
+//
+// Returns immediately, without touching qry, when the query has no hits.
+// Aborts if the output buffer cannot be grown.
+//
+void
+doPolishS4(searcherState       *state,
+           query               *qry) {
+
+  // For the autofilter: 'successes' is a bit history of the most recent
+  // polishes (1 == good), limited to config._afLength bits by the mask.
+  uint64   successes    = uint64ZERO;
+  uint64   successMask  = uint64MASK(config._afLength);
+  uint32   attempts     = 0;
+
+  if (qry->theHitsLen == 0)
+    return;
+
+  qry->theOutputLen = 0;
+  qry->theOutputMax = 2 * 1024 * qry->theHitsLen;
+  qry->theOutput    = new char [qry->theOutputMax];
+
+  qry->theOutput[0] = 0;
+
+  for (uint32 h=0; h<qry->theHitsLen; h++) {
+
+    // If the hit was discarded, move along.
+    //
+    if (qry->theHits[h]._status & AHIT_DISCARDED) {
+#ifdef SHOW_HIT_DISCARDING
+      qry->theLog->add("Hit %u out of %u (%u -> %u[%u-%u]) cov=%u matched=%u numMers=%u DISCARDED\n",
+                       h, qry->theHitsLen,
+                       qry->seq->getIID(),
+                       qry->theHits[h]._dsIdx,
+                       qry->theHits[h]._dsLo,
+                       qry->theHits[h]._dsHi,
+                       qry->theHits[h]._covered,
+                       qry->theHits[h]._matched,
+                       qry->theHits[h]._numMers);
+#endif
+      continue;
+    }
+
+
+    // If the hit was filtered out, move along.
+    //
+    if ((config._doValidation == false) &&
+        ((qry->theHits[h]._status & AHIT_POLISHABLE) == 0) &&
+        ((qry->theHits[h]._status & AHIT_HAS_UNIQUE) == 0))
+      continue;
+
+
+    // If our recent success rate is pretty terrible, continue.
+    //
+    if (config._afEnabled) {
+
+      if (attempts > config._afInit) {
+        double  rat = countNumberOfSetBits64(successes) / (double)((attempts < config._afLength) ? attempts : config._afLength);
+
+#if 0
+        fprintf(stderr, "autofilter: hit " uint32FMT " out of " uint32FMT " (attempts=" uint32FMT ") with rate %f\n",
+                h, qry->theHitsLen, attempts, rat);
+#endif
+
+        // If we've hit the end of the good polishes, give up.  But
+        // still do all the stuff with unique mers in them.
+        //
+        if (((qry->theHits[h]._status & AHIT_HAS_UNIQUE) == 0) &&
+            (rat < config._afThreshold))
+          continue;
+      }
+
+      attempts++;
+    }
+
+    //
+    // Polish it up!
+    //
+
+    seqInCore  *ESTseq = qry->seq;
+    seqInCore  *GENseq = genome->getSequenceInCore(qry->theHits[h]._dsIdx);
+    uint32      GENlo  = qry->theHits[h]._dsLo;
+    uint32      GENhi  = qry->theHits[h]._dsHi;
+
+    // Clamp the region to the end of the genomic sequence.
+    if (GENhi > GENseq->sequenceLength())
+      GENhi = GENseq->sequenceLength();
+
+    assert(GENlo < GENhi);
+
+    bool    doForward =  qry->theHits[h]._status & AHIT_DIRECTION_MASK;
+    bool    doReverse = !doForward;
+
+#ifdef SHOW_POLISHING
+    qry->theLog->add("Hit %u out of %u (%u -> %u[%u-%u]) dir=%c cov=%u matched=%u numMers=%u\n",
+                     h, qry->theHitsLen,
+                     ESTseq->getIID(),
+                     qry->theHits[h]._dsIdx,
+                     qry->theHits[h]._dsLo,
+                     qry->theHits[h]._dsHi,
+                     doForward ? 'F' : 'R',
+                     qry->theHits[h]._covered,
+                     qry->theHits[h]._matched,
+                     qry->theHits[h]._numMers);
+#endif
+
+
+#ifdef SHOW_POLISHING_EXPENSIVE
+    double startTime = getTime();
+#endif
+
+    sim4command     *P4 = new sim4command(ESTseq,
+                                          GENseq,
+                                          GENlo,
+                                          GENhi,
+                                          doForward,
+                                          doReverse);
+
+
+    ////////////////////////////////////////
+    //
+    //  Add hits to the command
+    //
+    //  addSeed() expects base-based, of the last position in
+    //  the seed.  We have space-based, first position.  Adding
+    //  the size of a mer fixes both.
+    //
+    if (doForward) {
+      for (uint32 i=0, x, y; qry->theHits[h]._ML->getMer(i, x, y); i++) {
+#ifdef SHOW_HITS_ADDED
+#ifdef SHOW_HITS_ADDED_AFTER_QUERY
+        if (ESTseq->getIID() > SHOW_HITS_ADDED_AFTER_QUERY)
+#endif
+          qry->theLog->add("FORWARDHIT GEN: hi:" uint32FMT "-lo:" uint32FMT " pos:" uint32FMT " EST: len:" uint32FMT " pos:" uint32FMT "\n",
+                           GENhi, GENlo, y, (uint32)ESTseq->sequenceLength(), x);
+#endif
+        assert(y + config._KBmerSize >= GENlo);
+
+        P4->addSeed(y - GENlo + config._KBmerSize,
+                    x         + config._KBmerSize,
+                    config._KBmerSize);
+      }
+    } else {
+      for (uint32 i=0, x, y; qry->theHits[h]._ML->getMer(i, x, y); i++) {
+#ifdef SHOW_HITS_ADDED
+#ifdef SHOW_HITS_ADDED_AFTER_QUERY
+        if (ESTseq->getIID() > SHOW_HITS_ADDED_AFTER_QUERY)
+#endif
+          qry->theLog->add("REVERSEHIT GEN: hi:" uint32FMT "-lo:" uint32FMT " pos:" uint32FMT " EST: len:" uint32FMT " pos:" uint32FMT "\n",
+                           GENhi, GENlo, y, (uint32)ESTseq->sequenceLength(), x);
+#endif
+        // Original form was (GENhi-GENlo) - (y-GENlo), which
+        // reduces to the below.  By reversing, we no longer need
+        // to add in the mersize, we're representing the end of
+        // the mer now!
+        //
+        assert(GENhi >= y);
+        assert(ESTseq->sequenceLength() >= x);
+
+        P4->addSeed(GENhi - y,
+                    ESTseq->sequenceLength() - x,
+                    config._KBmerSize);
+      }
+    }
+
+
+
+    // The main loop deletes the hits, but we take care of deleting _ML here.
+    // Maybe it should go in the destructor for the hits??
+    //
+    delete qry->theHits[h]._ML;
+    qry->theHits[h]._ML = 0L;
+
+
+    Sim4            *S4 = new Sim4(&sim4params);
+    sim4polishList  *l4 = S4->run(P4);
+    sim4polishList  &L4 = *l4;
+
+
+    // Clean up the matches -- remove small exons from the match,
+    // split things with big gaps into two matches.
+
+    for (uint32 i=0; L4[i]; i++) {
+
+#ifdef SHOW_MATCH_SPLITTING
+      qry->theLog->add(" match " uint32FMT " has " uint32FMT " exons.\n",
+                       i, L4[i]->_numExons);
+      for (uint32 j=L4[i]->_numExons; j--; )
+        qry->theLog->add(" exon " uint32FMT " query:" uint32FMT "-" uint32FMT " genome:" uint32FMT "-" uint32FMT " id:%d nm:%d\n",
+                         j,
+                         L4[i]->_exons[j]._estFrom,
+                         L4[i]->_exons[j]._estTo,
+                         L4[i]->_exons[j]._genFrom,
+                         L4[i]->_exons[j]._genTo,
+                         L4[i]->_exons[j]._percentIdentity,
+                         L4[i]->_exons[j]._numMatches);
+
+#endif
+
+      // Walk the exons from last to first so deleting one does not disturb
+      // the indices still to be visited.
+      for (uint32 j=L4[i]->_numExons; j--; ) {
+        if (((L4[i]->_exons[j]._estTo - L4[i]->_exons[j]._estFrom) < config._discardExonLength) ||
+            (L4[i]->_exons[j]._percentIdentity < config._discardExonQuality)) {
+#ifdef SHOW_MATCH_SPLITTING
+          qry->theLog->add(" Deleting exon " uint32FMT " from query:" uint32FMT "-" uint32FMT " genome:" uint32FMT "-" uint32FMT "\n",
+                           j,
+                           L4[i]->_exons[j]._estFrom,
+                           L4[i]->_exons[j]._estTo,
+                           L4[i]->_exons[j]._genFrom,
+                           L4[i]->_exons[j]._genTo);
+#endif
+          L4[i]->s4p_deleteExon(j);
+        }
+      }
+
+      // Copy each exon into a new match ("split things with big gaps").
+      // The new matches are pushed onto the list and so get visited by
+      // this same loop later on.
+
+      while (L4[i]->_numExons > 1) {
+#ifdef SHOW_MATCH_SPLITTING
+        qry->theLog->add(" Saving exon " uint32FMT " from query:" uint32FMT "-" uint32FMT " genome:" uint32FMT "-" uint32FMT "\n",
+                         L4[i]->_numExons-1,
+                         L4[i]->_exons[L4[i]->_numExons-1]._estFrom,
+                         L4[i]->_exons[L4[i]->_numExons-1]._estTo,
+                         L4[i]->_exons[L4[i]->_numExons-1]._genFrom,
+                         L4[i]->_exons[L4[i]->_numExons-1]._genTo);
+#endif
+
+        sim4polish *n = new sim4polish(L4[i], L4[i]->_numExons-1);
+        L4.push(n);
+        L4[i]->s4p_deleteExon(L4[i]->_numExons-1);
+      }
+
+      // Rebuild the stats on this guy -- we now have one exon, so just copy
+      // the exon stats to the global stats.
+
+      if (L4[i]->_numExons > 0) {
+#ifdef SHOW_MATCH_SPLITTING
+        qry->theLog->add(" Saving exon " uint32FMT " from query:" uint32FMT "-" uint32FMT " genome:" uint32FMT "-" uint32FMT "\n",
+                         0,
+                         L4[i]->_exons[0]._estFrom,
+                         L4[i]->_exons[0]._estTo,
+                         L4[i]->_exons[0]._genFrom,
+                         L4[i]->_exons[0]._genTo);
+#endif
+
+        L4[i]->_numMatches       = L4[i]->_exons[0]._numMatches;
+        L4[i]->_numMatchesN      = L4[i]->_exons[0]._numMatchesN;
+        L4[i]->_numCovered       = L4[i]->_exons[0]._genTo - L4[i]->_exons[0]._genFrom + 1;
+        L4[i]->_percentIdentity  = L4[i]->_exons[0]._percentIdentity;
+        L4[i]->_querySeqIdentity = L4[i]->s4p_percentCoverageApprox();
+      } else {
+#ifdef SHOW_MATCH_SPLITTING
+        qry->theLog->add(" All exons removed!\n");
+#endif
+        // Step back so the loop increment lands on the same index again;
+        // for i == 0 the unsigned wrap-around is undone by the i++.
+        L4.remove(i);
+        i--;
+      }
+    }
+
+
+    // Even though we don't expect multiple polishes, we still have to deal with
+    // them.  :-(
+
+    // Clear the 'match' flag and set qualities to zero.  XXX:
+    // Again, this should be already done, but we need to guarantee
+    // it.
+    //
+    //qry->theHits[h]._status &= 0x00000003;
+    // (I guess we don't _need_ to do it....)
+
+    uint32  pi = 0;
+    uint32  pc = 0;
+
+    for (uint32 i=0; L4[i]; i++) {
+
+      // We need to remember the best pair of percent
+      // identity/coverage.  These will be stored in the hit after
+      // we process all matches.
+      //
+      if ((L4[i]->_percentIdentity  >= pi) &&
+          (L4[i]->_querySeqIdentity >= pc)) {
+        pi = L4[i]->_percentIdentity;
+        pc = L4[i]->_querySeqIdentity;
+      }
+
+#ifdef SHOW_POLISHING
+      qry->theLog->add(" match[" uint32FMT "] query:" uint32FMT "-" uint32FMT " genome:" uint32FMT "-" uint32FMT " id=%u cv=%d nm=%u\n",
+                       i,
+                       L4[i]->_exons[0]._estFrom,
+                       L4[i]->_exons[0]._estTo,
+                       L4[i]->_exons[0]._genFrom,
+                       L4[i]->_exons[0]._genTo,
+                       L4[i]->_percentIdentity,
+                       L4[i]->_querySeqIdentity,
+                       L4[i]->_exons[0]._numMatches);
+#endif
+
+      // If we have a real hit, set the flag and save the output
+      //
+      if ((L4[i]->_percentIdentity  >= config._minMatchIdentity) &&
+          (L4[i]->_querySeqIdentity >= config._minMatchCoverage)) {
+
+        qry->theHits[h]._status |= AHIT_VERIFIED;
+
+        char *pstr = L4[i]->s4p_polishToString(sim4polishStyleDefault);
+
+        uint32 l = (uint32)strlen(pstr);
+
+        // Grow the output buffer (double it, plus the new string) when the
+        // string and its terminating NUL would not fit.
+        if (qry->theOutputLen + l + 1 >= qry->theOutputMax) {
+          qry->theOutputMax = qry->theOutputMax + qry->theOutputMax + l;
+          char *o = 0L;
+          try {
+            o = new char [qry->theOutputMax];
+          } catch (...) {
+            fprintf(stderr, "doPolish()-- Can't reallocate space for the output string (" uint32FMT " bytes) in thread " uint64FMT "\n", qry->theOutputMax, state->threadID);
+            abort();
+          }
+          memcpy(o, qry->theOutput, sizeof(char) * qry->theOutputLen);
+          delete [] qry->theOutput;
+          qry->theOutput = o;
+        }
+
+        memcpy(qry->theOutput + qry->theOutputLen, pstr, sizeof(char) * l);
+        qry->theOutputLen += l;
+
+        qry->theOutput[qry->theOutputLen] = 0;
+
+        delete [] pstr;
+      }
+    }
+
+    // Save the best scores
+    //
+    qry->theHits[h]._status |= pi << 16;
+    qry->theHits[h]._status |= pc << 24;
+
+    // Shift this attempt's outcome into the autofilter history.
+    successes <<= 1;
+    if ((pi >= config._minMatchIdentity) &&
+        (pc >= config._minMatchCoverage)) {
+      //fprintf(stderr, "GOOD " uint32FMT " " uint32FMT "\n", pi, pc);
+      successes |= uint64ONE;
+    } else {
+      //fprintf(stderr, "BAD " uint32FMT " " uint32FMT "\n", pi, pc);
+      successes |= uint64ZERO;
+    }
+    successes &= successMask;
+
+    delete l4;
+    delete S4;
+    delete P4;
+
+#ifdef SHOW_POLISHING_EXPENSIVE
+    double elapsedTime = getTime() - startTime;
+    if (elapsedTime >= SHOW_POLISHING_EXPENSIVE) {
+      qry->theLog->add("Hit %u out of %u (%u -> %u[%u-%u]) took %f seconds ().\n",
+                       h, qry->theHitsLen,
+                       ESTseq->getIID(), GENseq->getIID(), qry->theHits[h]._dsLo, qry->theHits[h]._dsHi,
+                       elapsedTime);
+    }
+#endif
+
+    delete GENseq;
+  }  // over all hits
+}
diff --git a/snapper/thr-search.C b/snapper/thr-search.C
new file mode 100644
index 0000000..671da37
--- /dev/null
+++ b/snapper/thr-search.C
@@ -0,0 +1,281 @@
+#include "snapper2.H"
+
+#if defined (__SVR4) && defined (__sun)
+// Solaris defines SS in sys/regset.h
+#undef SS
+#endif
+
+// The mers of one query sequence, in either forward or reverse-complement
+// form, with the position and span of each mer.
+//
+// Mers rejected by maskDB (present there) or onlyDB (absent there) are not
+// stored, but they are still counted in numberOfMersInQuery().
+//
+class encodedQuery {
+private:
+  uint64   *_mers;
+  uint32   *_posn;
+  uint32   *_span;
+  uint32    _mersActive;    //  Number of mers stored in the arrays.
+  uint32    _mersInQuery;   //  Number of mers seen, stored or not.
+
+  // Not copyable: the destructor frees the three arrays, so a copy would
+  // free them a second time.  Declared but deliberately not defined.
+  encodedQuery(const encodedQuery &);
+  encodedQuery &operator=(const encodedQuery &);
+
+public:
+  // Build the mer list for 'seq' using 'KB'.  When 'rc' is true the
+  // reverse-complement mers are stored, otherwise the forward mers.
+  encodedQuery(seqInCore           *seq,
+               kMerBuilder         *KB,
+               bool                 rc) {
+    _mers        = new uint64 [seq->sequenceLength()];
+    _posn        = new uint32 [seq->sequenceLength()];
+    _span        = new uint32 [seq->sequenceLength()];
+    _mersActive  = 0;
+    _mersInQuery = 0;
+
+    // Unfortunately, we need to use the slightly heavyweight merStream
+    // and kMerBuilder to get mers.  We used to build mers in a tight
+    // loop, but with the inclusion of spacing and compression, we
+    // cannot do that anymore.
+
+    seqStream  *SS = new seqStream(seq->sequence(), seq->sequenceLength());
+    merStream  *MS = new merStream(KB, SS);
+    uint64      mer;
+    uint32      val;
+
+    // The rc flag tells us if we should build for the forward or
+    // reverse strand.  If forward (rc == false) the mers are in the
+    // same order.  If reverse, the mers are both reverse-complemented,
+    // and appear in our _mers[], _posn[] and _span[] lists reversed.
+
+    if (rc == false) {
+      while (MS->nextMer()) {
+        mer = MS->theFMer();
+
+        if ((maskDB && (maskDB->exists(mer) == true)) ||
+            (onlyDB && (onlyDB->exists(mer) == false)))
+          ;  // Don't use it.
+        else {
+          _mers[_mersActive] = mer;
+          _posn[_mersActive] = MS->thePositionInSequence();
+          _span[_mersActive] = MS->theFMer().getMerSpan();
+          _mersActive++;
+        }
+
+        _mersInQuery++;
+      }
+    } else {
+      while (MS->nextMer()) {
+        mer = MS->theRMer();
+
+        if ((maskDB && (maskDB->exists(mer) == true)) ||
+            (onlyDB && (onlyDB->exists(mer) == false)))
+          ;  // Don't use it.
+        else {
+          // We die horribly unless we do the goofy math to get the
+          // _posn.  I'm sure that could be cleaned up, but it'd take
+          // more effort than I want now (being we'd have to figure
+          // out what the search/hitMatrix stuff is doing).
+          _mers[_mersActive] = mer;
+          _posn[_mersActive] = seq->sequenceLength() - MS->thePositionInSequence() - MS->theRMer().getMerSpan();
+          _span[_mersActive] = MS->theRMer().getMerSpan();
+          _mersActive++;
+        }
+
+        _mersInQuery++;
+      }
+
+      // Reverse the array -- this appears to be optional.
+#if 1
+      if (_mersActive > 0)
+        for (uint32 i=0, j=_mersActive-1; i<j; i++, j--) {
+          mer      = _mers[i];
+          _mers[i] = _mers[j];
+          _mers[j] = mer;
+
+          val      = _posn[i];
+          _posn[i] = _posn[j];
+          _posn[j] = val;
+
+          val      = _span[i];
+          _span[i] = _span[j];
+          _span[j] = val;
+        }
+#endif
+    }
+
+    delete MS;
+    delete SS;
+  };
+
+
+  ~encodedQuery() {
+    delete [] _mers;
+    delete [] _posn;
+    delete [] _span;
+  };
+
+  uint32  numberOfMersActive(void)  const { return(_mersActive);  };
+  uint32  numberOfMersInQuery(void) const { return(_mersInQuery); };
+
+  // No bounds checking; i must be below numberOfMersActive().
+  uint64  getMer(uint32 i)  const { return(_mers[i]); };
+  uint32  getPosn(uint32 i) const { return(_posn[i]); };
+  uint32  getSpan(uint32 i) const { return(_span[i]); };
+};
+
+
+
+
+
+// Find hits for the query on one strand and append them to qry->theHits.
+//
+// The query is encoded into mers, each mer is looked up in the global
+// 'positions' table, and the resulting positions are chained by a hitMatrix
+// filter.  Any hit made on this strand whose _matched count exceeds twice
+// its _numMers is then rebuilt: a positionDB is built over just that genomic
+// region, only mers whose count in the region is at or below an adaptively
+// chosen limit are used, the new hits are appended, and the original hit is
+// marked AHIT_DISCARDED.
+//
+//   state -- per-thread state; KB is created on first use, and posn,
+//            posnMax and posnLen are used as the lookup buffer.
+//   qry   -- the query; theHits, theHitsLen and theHitsMax are updated.
+//   rc    -- true to search with reverse-complement mers.
+//
+void
+doSearch(searcherState       *state,
+         query               *qry,
+         bool                 rc) {
+
+  if (state->KB == 0L)
+    state->KB = new kMerBuilder(config._KBmerSize,
+                                config._KBcompression,
+                                config._KBspacingTemplate);
+
+  encodedQuery  *encqry = new encodedQuery(qry->seq, state->KB, rc);
+
+  hitMatrix     *matrix = new hitMatrix(qry->seq->sequenceLength(),
+                                        encqry->numberOfMersInQuery(),
+                                        qry->seq->getIID(),
+                                        qry->theLog);
+
+  for (uint32 qidx=0; qidx<encqry->numberOfMersActive(); qidx++) {
+    uint64  count = 0;
+
+    if (positions->getExact(encqry->getMer(qidx), state->posn, state->posnMax, state->posnLen, count))
+      matrix->addHits(encqry->getPosn(qidx), state->posn, state->posnLen);
+  }
+
+  // Chain the hits
+  //
+  matrix->filter(rc ? 'r' : 'f', config._minHitCoverage, config._minHitLength, qry->theHits, qry->theHitsLen, qry->theHitsMax);
+
+
+  ////////////////////////////////////////
+  //
+  // Refine the hits -- if any hit looks like it contains a repeat,
+  // rebuild it using an adaptive mask threshold.
+  //
+  // We work backwards because we add on new hits to the end of our
+  // list.
+  //
+  for (uint32 h=qry->theHitsLen; h--; ) {
+
+    // The first test eliminates hits that were not generated for the
+    // complementarity used in this search (e.g., the first search
+    // does rc=forward, adds some hits, the second search does
+    // rc=reverse, and we should skip all the rc=forward hits.
+    //
+    if (((qry->theHits[h]._status & AHIT_DIRECTION_MASK) == !rc) &&
+        (qry->theHits[h]._matched > 2 * qry->theHits[h]._numMers)) {
+
+#ifdef SHOW_HIT_DISCARDING
+      qry->theLog->add("Seq " uint32FMT " Hit " uint32FMT " (%c) has " uint32FMT " matched, but only " uint32FMT " mers.\n",
+                       qry->seq->getIID(), h, rc ? 'r' : 'f', qry->theHits[h]._matched, qry->theHits[h]._numMers);
+#endif
+
+      // Grab the genomic sequence.
+      // Construct a merstream for the region.
+      // Build a positionDB of the region (both positions and counts).
+      // Fill out another hitMatrix using about 2*length mers.
+      //
+      seqInCore  *GENseq = genome->getSequenceInCore(qry->theHits[h]._dsIdx);
+      uint32      GENlo  = qry->theHits[h]._dsLo;
+      uint32      GENhi  = qry->theHits[h]._dsHi;
+
+      // NOTE(review): the trailing (false, true) presumably tells the
+      // merStream it does not own KB but does own the seqStream -- confirm
+      // against the merStream constructor.
+      merStream  *MS = new merStream(state->KB,
+                                     new seqStream(GENseq->sequence(), GENseq->sequenceLength()),
+                                     false, true);
+
+      MS->setBaseRange(GENlo, GENhi);
+
+      positionDB *PS = new positionDB(MS, config._KBmerSize, 0, 0L, 0L, 0L, 0, 0, 0, 0, false);
+      hitMatrix  *HM = new hitMatrix(qry->seq->sequenceLength(),
+                                     encqry->numberOfMersInQuery(),
+                                     qry->seq->getIID(),
+                                     qry->theLog);
+
+      // We find the number of hits we would get if we use a
+      // countLimit of i.
+      //
+#define COUNT_MAX 256
+
+      uint32 numHitsAtCount[COUNT_MAX] = { 0 };
+      uint32 countLimit                = 0;
+      uint64 count                     = 0;
+
+      uint32 numMers = 0;
+#ifdef SHOW_HIT_DISCARDING
+      uint32 numHits = 0;
+      uint32 minNum  = ~uint32ZERO;
+      uint32 maxNum  = 0;
+#endif
+
+      // First pass: histogram the hits by the number of positions each
+      // mer has in the region.  Mers with COUNT_MAX or more positions are
+      // counted in numMers but not in the histogram.
+      for (uint32 qidx=0; qidx<encqry->numberOfMersActive(); qidx++) {
+        if (PS->getExact(encqry->getMer(qidx), state->posn, state->posnMax, state->posnLen, count)) {
+          numMers++;
+
+          if (state->posnLen < COUNT_MAX)
+            numHitsAtCount[state->posnLen] += state->posnLen;
+
+#ifdef SHOW_HIT_DISCARDING
+          numHits += state->posnLen;
+          if (minNum > state->posnLen)  minNum = state->posnLen;
+          if (maxNum < state->posnLen)  maxNum = state->posnLen;
+#endif
+        }
+      }
+
+      // Scan the number of hits at count, pick the first highest
+      // count such that the number of hits is below our threshold.
+      // The histogram is turned into a running total as we go.
+      //
+      for (uint32 qidx=1; qidx<COUNT_MAX; qidx++) {
+        numHitsAtCount[qidx] = numHitsAtCount[qidx-1] + numHitsAtCount[qidx];
+
+        if (numHitsAtCount[qidx] <= numMers * config._repeatThreshold)
+          countLimit = qidx;
+      }
+
+#ifdef SHOW_HIT_DISCARDING
+      qry->theLog->add(" -- found " uint32FMT " hits in " uint32FMT " mers, min=" uint32FMT " max=" uint32FMT " avg=%.5f hits/mer.\n",
+                       numHits, numMers, minNum, maxNum, (double)numHits / (double)numMers);
+      qry->theLog->add(" -- using a countLimit of " uint32FMT " which gets us " uint32FMT " mers\n",
+                       countLimit, numHitsAtCount[countLimit]);
+#endif
+
+      // Second pass: add the hits of every mer at or below the limit.
+      for (uint32 qidx=0; qidx<encqry->numberOfMersActive(); qidx++) {
+        if (PS->getExact(encqry->getMer(qidx), state->posn, state->posnMax, state->posnLen, count)) {
+          if (state->posnLen <= countLimit) {
+            // Shift the positions by the start of this sequence in
+            // genomeMap.
+            for (uint32 x=0; x<state->posnLen; x++)
+              state->posn[x] += genomeMap->startOf(qry->theHits[h]._dsIdx);
+
+            // The kmer counts for these mers are relative to the
+            // sub-regions, not the global, so we want to disable any
+            // filtering by kmer counts.  We could add a flag to the filter
+            // to stop this, or we can reset the counts here to large
+            // values.  Or we could simply reset the counts to the global
+            // value.
+            //
+            HM->addHits(encqry->getPosn(qidx), state->posn, state->posnLen, positions->countExact(encqry->getMer(qidx)));
+          }
+        }
+      }
+
+      // Chain the hits
+      //
+      HM->filter(rc ? 'r' : 'f', 0.01, 0, qry->theHits, qry->theHitsLen, qry->theHitsMax);
+
+      // Mark this hit as dead
+      //
+      qry->theHits[h]._status |= AHIT_DISCARDED;
+
+      delete HM;
+      delete PS;
+      delete MS;
+
+      delete GENseq;
+    }
+  }
+
+  delete matrix;
+  delete encqry;
+}
+
+
+
+
diff --git a/tapper/Make.include b/tapper/Make.include
new file mode 100644
index 0000000..81f5c3b
--- /dev/null
+++ b/tapper/Make.include
@@ -0,0 +1,22 @@
+# -*- makefile -*-
+# Directories of the sibling libraries, resolved relative to $/.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+LIBSIM4/ :=$(realpath $/../libsim4/)/
+# C++ sources in this directory and the executables built from them.
+$/.CXX_SRCS := $/tagger.C $/tapper.C $/tapperconvert.C $/tappermerge.C $/tappersort.C $/tappererrorcorrect.C
+$/.CXX_EXES := $/tagger $/tapper $/tapperconvert $/tappermerge $/tappersort $/tappererrorcorrect
+# Files to remove when cleaning (presumably consumed by a clean rule defined elsewhere).
+$/.CLEAN :=$/*.o
+# Add every library directory to the include path for objects and dependency files under $/.
+$(eval $/%.d $/%.o: CXXFLAGS+= -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBMERYL/} -I${LIBKMER/} -I${LIBSIM4/})
+# Each executable depends on its own object file plus all six static libraries.
+$/tagger: $/tagger.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/tapper: $/tapper.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/tapperconvert: $/tapperconvert.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/tappermerge: $/tappermerge.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/tappersort: $/tappersort.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
+$/tappererrorcorrect: $/tappererrorcorrect.o ${LIBSIM4/}libsim4.a ${LIBKMER/}libkmer.a ${LIBMERYL/}libmeryl.a ${LIBSEQ/}libseq.a ${LIBBIO/}libbio.a ${LIBUTL/}libutil.a
diff --git a/tapper/compare.pl b/tapper/compare.pl
new file mode 100644
index 0000000..7ce35fd
--- /dev/null
+++ b/tapper/compare.pl
@@ -0,0 +1,266 @@
+#!/usr/bin/perl
+use strict;
+
+# Hack to compare tapperconvert to corona output.
+#
+# corona:
+# 926_28_374 T0233011320231302110223300 G2002221011312002112001121 1 0 1 1 1 -1809022 -1810770 AAA
+# 926_33_329 T3000003020011330112020200 G2200202231211010022113031 0 3 3 1 1 -152955 -154703 AAA
+# 926_34_440 T0121010212320132021200211 G0312100032003101301113001 2 1 3 1 1 -1618712 -1620852 AAA
+# 926_38_533 T0011012010023302310021321 G0200103332011100011013103 0 4 4 1 1 -251624 -253847 AAA
+# 926_42_329 T2331321031230210321102001 G2330012312202211022311020 1 3 4 1 1 -1093994 -1098892 AAA
+#
+# 0 beadId
+# 1 F3 sequence
+# 2 R3 sequence
+# 3 num F3 mismatches
+# 4 num R3 mismatches
+# 5 total mismatches
+# 6 F3 reference
+# 7 R3 reference
+# 8 F3 position
+# 9 R3 position
+# 10 category
+#
+#
+# tapperconvert:
+# M 12345_926_28_197 0 633481 f 0 0/0/3 54321_926_28_197 0 631038 f 0 0/0/2
+# M 12345_926_28_374 0 1808998 r 0 0/0/1 54321_926_28_374 0 1810746 r 0 0/0/0
+# M 12345_926_29_486 0 129939 f 0 0/0/2 54321_926_29_486 0 127944 f 0 0/0/4
+# M 12345_926_33_329 0 152931 r 0 0/0/0 54321_926_33_329 0 154679 r 0 0/0/3
+# M 12345_926_34_440 0 1618688 r 0 1/2/0 54321_926_34_440 0 1620828 r 0 0/0/1
+# M 12345_926_38_533 0 251600 r 0 0/0/0 54321_926_38_533 0 253823 r 0 0/0/4
+
+# Command line: the tapper input prefix and the tapper error limit.  The
+# corona mates file and the corona error limit are fixed.
+my $tinput  = shift @ARGV;
+my $terrors = shift @ARGV;
+my $cinput  = "pgingivali.F3_R3.mates";
+my $cerrors = 3;
+
+if (!defined($tinput) || !defined($terrors)) {
+  die "usage: $0 tapper-input-prefix num-errors\n";
+}
+
+# Count, per bead id, how many tangled records tapper reports.
+print STDERR "Reading tangles.\n";
+my %tangled;
+open(TT, "./tapperconvert -dumpt $tinput |") or die;
+while (<TT>) {
+  my @v = split '\s+', $_;
+  if ($v[1] =~ m/^\d+_(\d+_\d+_\d+)$/) {
+    $v[1] = $1;
+  }
+  $tangled{$v[1]}++;
+}
+close(TT);
+
+# Count, per bead id, how many mated records tapper reports.
+print STDERR "Reading tapper mate for counts.\n";
+my %tcounts;
+open(TT, "./tapperconvert -dumpm $tinput |") or die;
+while (<TT>) {
+  my @v = split '\s+', $_;
+  if ($v[1] =~ m/^\d+_(\d+_\d+_\d+)$/) {
+    $v[1] = $1;
+  }
+  $tcounts{$v[1]}++;
+}
+close(TT);
+
+print STDERR "Processing.\n";
+
+open(FC, "< $cinput") or die;
+open(FT, "./tapperconvert -dumpm $tinput |") or die;
+
+open(GC, "> compare.pl.corona.out") or die;
+open(GT, "> compare.pl.tapper.out") or die;
+
+# Records present in both inputs.
+my $same = 0;
+my $qual = 0;
+my $diffmultiple = 0;
+my $diff = 0;
+
+# Records present only in the corona input.
+my $onlyc = 0;
+my $onlycerror = 0;
+my $onlyctooshort = 0;
+my $onlyctoolong = 0;
+my $onlyctangled = 0;
+
+# Records present only in the tapper input.
+my $onlyt = 0;
+my $onlyterror = 0;
+my $onlyttooshort = 0;
+my $onlyttoolong = 0;
+my $onlytdupl = 0;
+
+# The current record from each input: a zero-padded id used for ordering,
+# the printable form, and the individual fields
+# (id, errors, errors, orientation, position, position, distance).
+my $cid = undef;
+my $cstr = undef;
+my @c;
+
+my $tid = undef;
+my $tstr = undef;
+my @t;
+
+# Merge the two inputs by id, in the manner of a sorted-list merge.
+while (!eof(FC) && !eof(FT)) {
+
+  # Load the next corona record if the previous one has been consumed.
+  if (!defined($cid)) {
+    $_ = <FC>;
+    while (m/^#/) {
+      $_ = <FC>;
+    }
+    my @v = split '\s+', $_;
+
+    # Only category AAA records are compared; skip the rest.
+    if ($v[10] ne "AAA") {
+      next;
+    }
+
+    # Negative positions mean reverse orientation.
+    my $ori = "f";
+    if ($v[8] < 0) {
+      $v[8] = -int($v[8]);
+      $v[9] = -int($v[9]);
+      $ori = "r";
+    }
+
+    my $dist = $v[9] - $v[8];
+    if ($dist < 0) {
+      $dist = -$dist;
+    }
+
+    $cid = $v[0];
+    $cstr = "$v[0] $v[3] $v[4] $ori $v[8] $v[9] $dist";
+    $c[0] = $v[0];
+    $c[1] = $v[3];
+    $c[2] = $v[4];
+    $c[3] = $ori;
+    $c[4] = $v[8];
+    $c[5] = $v[9];
+    $c[6] = $dist;
+
+    # Zero-pad each id component to five digits so that string
+    # comparison orders the ids.
+    {
+      my @xxx = split '_', $cid;
+      $xxx[0] = substr("00000$xxx[0]", -5);
+      $xxx[1] = substr("00000$xxx[1]", -5);
+      $xxx[2] = substr("00000$xxx[2]", -5);
+
+      $cid = "$xxx[0]$xxx[1]$xxx[2]";
+    }
+
+  }
+
+  # Load the next tapper record if the previous one has been consumed.
+  if (!defined($tid)) {
+    $_ = <FT>;
+    my @v = split '\s+', $_;
+
+    # Strip the leading numeric prefix from the id.
+    if ($v[1] =~ m/^\d+_(\d+_\d+_\d+)$/) {
+      $v[1] = $1;
+    }
+    # Reduce each a/b/c error triple to b + c.
+    if ($v[6] =~ m!\d+/(\d+)/(\d+)$!) {
+      $v[6] = $1 + $2;
+    }
+    if ($v[12] =~ m!\d+/(\d+)/(\d+)$!) {
+      $v[12] = $1 + $2;
+    }
+
+    # Correct for reverse?  Why?
+    if ($v[4] eq "r") {
+      $v[3] += 24;
+      $v[9] += 24;
+    }
+
+    my $dist = $v[9] - $v[3];
+    if ($dist < 0) {
+      $dist = -$dist;
+    }
+
+    $tid = $v[1];
+    $tstr = "$v[1] $v[6] $v[12] $v[4] $v[3] $v[9] $dist";
+    $t[0] = $v[1];
+    $t[1] = $v[6];
+    $t[2] = $v[12];
+    $t[3] = $v[4];
+    $t[4] = $v[3];
+    $t[5] = $v[9];
+    $t[6] = $dist;
+
+    {
+      my @xxx = split '\D+', $tid;
+      $xxx[0] = substr("00000$xxx[0]", -5);
+      $xxx[1] = substr("00000$xxx[1]", -5);
+      $xxx[2] = substr("00000$xxx[2]", -5);
+
+      $tid = "$xxx[0]$xxx[1]$xxx[2]";
+    }
+  }
+
+  if ($cid eq $tid) {
+    # Same bead in both inputs.
+    print GC "$cstr\n";
+    print GT "$tstr\n";
+
+    if ($cstr eq $tstr) {
+      $same++;
+    } elsif (($c[3] eq $t[3]) && ($c[4] == $t[4]) && ($c[5] == $t[5]) && ($c[6] == $t[6])) {
+      # Same orientation and placement; only the error counts differ.
+      # The orientation is a string ("f" or "r"), so it needs eq.
+      $qual++;
+    } else {
+      #print STDERR "DIFF $cstr == $tstr\n";
+      if ($tcounts{$t[0]} > 1) {
+        $diffmultiple++;
+      } else {
+        $diff++;
+      }
+    }
+
+    undef $cid;
+    undef $cstr;
+    undef @c;
+
+    undef $tid;
+    undef $tstr;
+    undef @t;
+  } elsif ($cid lt $tid) {
+    # Only corona has this bead; classify why tapper may have missed it.
+    print GC "$cstr\n";
+
+    if (($c[1] > $terrors) || ($c[2] > $terrors)) {
+      $onlycerror++;
+    } elsif ($c[6] < 1400) {
+      $onlyctooshort++;
+    } elsif ($c[6] > 2600) {
+      $onlyctoolong++;
+    } elsif (exists($tangled{$c[0]})) {
+      #print STDERR "TANGLED $cstr\n";
+      $onlyctangled++;
+    } else {
+      #print STDERR "MISSED $cstr\n";
+      $onlyc++;
+    }
+
+    undef $cid;
+    undef $cstr;
+    undef @c;
+  } else {
+    # Only tapper has this bead; classify why corona may have missed it.
+    print GT "$tstr\n";
+
+    if (($t[1] > $cerrors) || ($t[2] > $cerrors)) {
+      $onlyterror++;
+    } elsif ($t[6] < 1400) {
+      $onlyttooshort++;
+    } elsif ($t[6] > 2600) {
+      $onlyttoolong++;
+    } elsif ($tcounts{$t[0]} > 1) {
+      $onlytdupl++;
+    } else {
+      $onlyt++;
+    }
+
+    undef $tid;
+    undef $tstr;
+    undef @t;
+  }
+}
+
+print STDERR "same $same qual $qual diff $diff diffmultiple $diffmultiple\n";
+print STDERR "onlyc $onlyc err $onlycerror short $onlyctooshort long $onlyctoolong TANGLED $onlyctangled\n";
+print STDERR "onlyt $onlyt err $onlyterror short $onlyttooshort long $onlyttoolong DUPLICATE $onlytdupl\n";
+
+close(FC);
+close(FT);
+
+close(GC);
+close(GT);
diff --git a/tapper/tagger.C b/tapper/tagger.C
new file mode 100644
index 0000000..9a45ed9
--- /dev/null
+++ b/tapper/tagger.C
@@ -0,0 +1,505 @@
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+
+#include "seqCache.H"
+
+// Convert reads from ASCII to tapper binary.
+//
+// ASSUMPTIONS
+//
+// 1) User is smart enough to give the correct set of mated files.
+// Code doesn't check that an F tag goes with an R tag, just that the
+// tag coordinates agree. It is possible to mate an F to an F if the
+// wrong inputs are given.
+//
+// 2) Tag coords are 16-bit integers. File UIDs are 16-bit integers.
+//
+
+
+// Define this to test the encode/decode functionality.
+//#define TEST_ENCODING
+
+
+// qsort-style comparator ordering two tapperTag objects by tagID().
+// Returns -1 when a sorts before b, 0 when the IDs are equal, and 1
+// otherwise.
+//
+int
+tapperTagCompare(const void *a, const void *b) {
+  tapperTag const *lhs = (tapperTag const *)a;
+  tapperTag const *rhs = (tapperTag const *)b;
+
+  if (lhs->tagID() < rhs->tagID())
+    return(-1);
+
+  return((lhs->tagID() != rhs->tagID()) ? 1 : 0);
+}
+
+
+// Read the next tag from a pair of text files (sequence and quality)
+// and encode it into T.
+//
+//  fileUID -- stored as the first component of the tag id
+//  seq     -- open sequence file; header line followed by a sequence line
+//  qlt     -- open quality file; header line followed by a line of numbers
+//  T       -- tag to fill in via T->encode()
+//
+// Lines whose first character is '#' are skipped when looking for a
+// header.  Returns false if either file is at EOF or no header could
+// be read from one of them; returns true after encoding a tag.
+//
+// The static buffers make this function non-reentrant.
+//
+bool
+readTag(uint32 fileUID, FILE *seq, FILE *qlt, tapperTag *T) {
+  static uint16 id[4];
+  static char seqhdr[1024];
+  static char seqseq[1024];
+  static char qlthdr[1024];
+  static char qltseq[1024];
+  static uint64 qltnum[1024];
+  static splitToWords S;
+
+  seqhdr[0] = 0;
+  seqseq[0] = 0;
+  qlthdr[0] = 0;
+  qltseq[0] = 0;
+
+  if (feof(seq) || feof(qlt))
+    return(false);
+
+  // fgets() leaves the buffer untouched when it hits EOF, so a file
+  // that ends in a '#' comment line would spin forever if the return
+  // value were ignored.  On failure, clear the header so the
+  // empty-header test below reports "no more tags".
+
+  fgets(seqhdr, 1024, seq);
+  while (seqhdr[0] == '#') {
+    if (fgets(seqhdr, 1024, seq) == 0L) {
+      seqhdr[0] = 0;
+      break;
+    }
+  }
+  fgets(seqseq, 1024, seq);
+
+  fgets(qlthdr, 1024, qlt);
+  while (qlthdr[0] == '#') {
+    if (fgets(qlthdr, 1024, qlt) == 0L) {
+      qlthdr[0] = 0;
+      break;
+    }
+  }
+  fgets(qltseq, 1024, qlt);
+
+  if ((seqhdr[0] == 0) || (qlthdr[0] == 0))
+    return(false);
+
+  chomp(seqhdr);
+  chomp(seqseq);
+  chomp(qlthdr);
+  chomp(qltseq);
+
+  if (strcmp(seqhdr, qlthdr) != 0)
+    fprintf(stderr, "WARNING: Got unpaired seq '%s' and qlt '%s'\n", seqhdr, qlthdr);
+
+  //  Assumes the header is >461_28_1918_F3
+  //    -- copies it to the left by one to remove the >
+  //    -- the loop below doesn't move the zero-terminator
+  //    -- resulting string is "461 28 1918 F33"
+  //
+  for (uint32 i=1; seqhdr[i]; i++) {
+    if (seqhdr[i] == '_')
+      seqhdr[i] = ' ';
+    seqhdr[i-1] = seqhdr[i];
+  }
+
+  S.split(seqhdr);
+
+  //  The three numeric header fields are narrowed to 16 bits here.
+  id[0] = fileUID;
+  id[1] = strtouint32(S[0], 0L);
+  id[2] = strtouint32(S[1], 0L);
+  id[3] = strtouint32(S[2], 0L);
+
+  S.split(qltseq);
+
+  //  Not sure why there are negative numbers here, but there are.
+  //  They are stored as zero.
+  //
+  for (uint32 i=0; i<S.numWords(); i++) {
+    qltnum[i] = (S[i][0] == '-') ? 0 : strtouint64(S[i], 0L);
+
+#ifdef TEST_ENCODING
+    //  We need to fudge the QV's here, so our tests pass.
+    if (qltnum[i] > 31)
+      qltnum[i] = 31;
+#endif
+  }
+
+  T->encode(id, seqseq, qltnum);
+
+#ifdef TEST_ENCODING
+  //  Round-trip check: decode what was just encoded and compare.
+  {
+    uint16 it[4];
+    char seqtst[1024];
+    uint64 qlttst[1024];
+
+    T->decode(it, seqtst, qlttst);
+
+    uint32 len = strlen(seqtst);
+    uint32 fail = 0;
+    uint64 qltsum=0, tstsum=0;
+
+    for (uint32 l=0; l<len; l++) {
+      qltsum += qltnum[l];
+      tstsum += qlttst[l];
+      if ((seqseq[l] != seqtst[l]) || (qltnum[l] != qlttst[l]))
+        fail++;
+    }
+
+    if ((id[0] != it[0]) ||
+        (id[1] != it[1]) ||
+        (id[2] != it[2]) ||
+        (id[3] != it[3]) ||
+        (fail)) {
+      fprintf(stderr, "FAIL: ("uint32FMT"_"uint32FMT"_"uint32FMT"_"uint32FMT",%s,"uint64FMT") != ("uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT",%s,"uint64FMT")\n",
+              id[0], id[1], id[2], id[3], seqseq, qltsum,
+              it[0], it[1], it[2], it[3], seqtst, tstsum);
+      for (uint32 l=0; l<len; l++)
+        fprintf(stderr, " %2d -- "uint64FMT" "uint64FMT"\n", l, qltnum[l], qlttst[l]);
+    }
+  }
+#endif
+
+  return(true);
+}
+
+
+
+// Print tab-separated summary statistics for a tag file to stdout:
+// whether it holds mated or fragment tags, the tag size, the number
+// of entries and, for mated files, the insert mean and stddev.
+//
+void
+dumpTagFileStats(char *tagfile) {
+  tapperTagFile *TF = new tapperTagFile(tagfile, 'r');
+
+  if (TF->metaData()->isPairedTagFile()) {
+    fprintf(stdout, "%s\ttype\tmated tags\n", tagfile);
+    fprintf(stdout, "%s\tlength\t"uint32FMT"\n", tagfile, TF->metaData()->tagSize());
+    fprintf(stdout, "%s\tnumMates\t"uint64FMT"\n", tagfile, TF->numberOfMatePairs());
+    fprintf(stdout, "%s\tmean\t"uint32FMT"\n", tagfile, TF->metaData()->mean());
+    fprintf(stdout, "%s\tstddev\t"uint32FMT"\n", tagfile, TF->metaData()->stddev());
+  } else {
+    fprintf(stdout, "%s\ttype\tfragment tags\n", tagfile);
+    fprintf(stdout, "%s\tlength\t"uint32FMT"\n", tagfile, TF->metaData()->tagSize());
+    fprintf(stdout, "%s\tnumTags\t"uint64FMT"\n", tagfile, TF->numberOfFragmentTags());
+  }
+
+  //  Release the reader; it was previously leaked.
+  delete TF;
+}
+
+
+// Convert decoded quality values to a printable, NUL-terminated
+// string, one character ('0' + value) per quality value.  One fewer
+// character than strlen(seq) is produced, matching the loop bound
+// used by dumpTagFile().
+//
+static
+void
+dumpTagFile_qualityString(const char *seq, const uint64 *qvs, char *qua) {
+  uint32 i = 0;
+
+  //  Testing seq[i] first avoids reading past the terminator of an
+  //  empty sequence.
+  for (; seq[i] && seq[i+1]; i++)
+    qua[i] = qvs[i] + '0';
+
+  qua[i] = 0;
+}
+
+
+// Dump every tag (or mate pair of tags) in a tag file to stdout as
+// text: the four id fields joined by '_', then sequence/quality.
+//
+void
+dumpTagFile(char *tagfile) {
+  tapperTagFile *TF = new tapperTagFile(tagfile, 'r');
+  tapperTag a, b;
+  uint16 ida[4], idb[4];
+  char seqa[256], seqb[256];
+  char quaa[256], quab[256];
+  uint64 qvsa[256], qvsb[256];
+
+  if (TF->metaData()->isPairedTagFile()) {
+    while (TF->get(&a, &b)) {
+      a.decode(ida, seqa, qvsa);
+      b.decode(idb, seqb, qvsb);
+
+      //  The quality strings are printed with %s, so they must be
+      //  terminated for every tag.
+      dumpTagFile_qualityString(seqa, qvsa, quaa);
+      dumpTagFile_qualityString(seqb, qvsb, quab);
+
+      fprintf(stdout, ">"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t%s/%s\t>"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t%s/%s\n",
+              ida[0], ida[1], ida[2], ida[3], seqa, quaa,
+              idb[0], idb[1], idb[2], idb[3], seqb, quab);
+    }
+  } else {
+    while (TF->get(&a)) {
+      a.decode(ida, seqa, qvsa);
+
+      dumpTagFile_qualityString(seqa, qvsa, quaa);
+
+      fprintf(stdout, ">"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t%s/%s\n",
+              ida[0], ida[1], ida[2], ida[3], seqa, quaa);
+    }
+  }
+
+  delete TF;
+}
+
+
+
+// Command line driver.
+//
+//   -tagout prefix                     output prefix (required)
+//   -tags  fileUID x.csfasta x.qual    unmated input
+//   -ftags fileUID f.csfasta f.qual    first half of mated input
+//   -rtags fileUID r.csfasta r.qual    second half of mated input
+//   -insertsize mean stddev            stored in the mated output
+//   -sample n file                     generate n random tags from file
+//   -sampleerrors n                    max color errors per sampled tag
+//   -sampletagsize n                   length of sampled tags
+//   -stats file                        print tag file statistics, exit
+//   -dump  file                        print tag file contents, exit
+//
+// Tags are read, sorted by tagID, and merged; F/R tags whose low 48
+// ID bits agree are written as mates to 'prefix.mate.tapperTags',
+// everything else goes to 'prefix.frag.tapperTags'.
+//
+// An option that is missing its arguments is treated like an unknown
+// option and results in the usage message.
+//
+int
+main(int argc, char **argv) {
+  char *prefix = 0L;
+
+  uint32 sampleSize = 0;
+  char *sampleFile = 0L;
+  uint32 sampleErrors = 3;
+  uint32 sampleTagSize = 25;
+
+  uint32 tagfuid = 0, tagruid = 0;
+  char *tagfseq = 0L, *tagrseq = 0L;
+  char *tagfqlt = 0L, *tagrqlt = 0L;
+
+  uint32 mean=0, stddev=0;
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    //  Each branch also requires that its arguments exist, so that
+    //  argv[++arg] never reads past the end of argv.
+
+    if ((strncmp(argv[arg], "-tagout", 5) == 0) && (arg + 1 < argc)) {
+      prefix = argv[++arg];
+
+    } else if ((strncmp(argv[arg], "-tags", 5) == 0) && (arg + 3 < argc)) {
+      tagfuid = strtouint32(argv[++arg], 0L);
+      tagfseq = argv[++arg];
+      tagfqlt = argv[++arg];
+
+    } else if ((strncmp(argv[arg], "-ftags", 2) == 0) && (arg + 3 < argc)) {
+      tagfuid = strtouint32(argv[++arg], 0L);
+      tagfseq = argv[++arg];
+      tagfqlt = argv[++arg];
+    } else if ((strncmp(argv[arg], "-rtags", 2) == 0) && (arg + 3 < argc)) {
+      tagruid = strtouint32(argv[++arg], 0L);
+      tagrseq = argv[++arg];
+      tagrqlt = argv[++arg];
+
+    } else if ((strncmp(argv[arg], "-insertsize", 2) == 0) && (arg + 2 < argc)) {
+      mean = strtouint32(argv[++arg], 0L);
+      stddev = strtouint32(argv[++arg], 0L);
+
+      if (mean > MAX_INSERT_SIZE)
+        fprintf(stderr, "%s: insert size limited to at most %dbp.\n", argv[0], MAX_INSERT_SIZE), exit(1);
+      if (stddev > MAX_INSERT_DEVIATION)
+        fprintf(stderr, "%s: insert size limited to at most +- %dbp.\n", argv[0], MAX_INSERT_DEVIATION), exit(1);
+
+    } else if ((strcmp(argv[arg], "-sample") == 0) && (arg + 2 < argc)) {
+      sampleSize = strtouint32(argv[++arg], 0L);
+      sampleFile = argv[++arg];
+    } else if ((strcmp(argv[arg], "-sampleerrors") == 0) && (arg + 1 < argc)) {
+      sampleErrors = strtouint32(argv[++arg], 0L);
+    } else if ((strcmp(argv[arg], "-sampletagsize") == 0) && (arg + 1 < argc)) {
+      sampleTagSize = strtouint32(argv[++arg], 0L);
+
+    } else if ((strncmp(argv[arg], "-stats", 3) == 0) && (arg + 1 < argc)) {
+      dumpTagFileStats(argv[++arg]);
+      exit(0);
+
+    } else if ((strncmp(argv[arg], "-dump", 2) == 0) && (arg + 1 < argc)) {
+      dumpTagFile(argv[++arg]);
+      exit(0);
+
+    } else {
+      err++;
+    }
+    arg++;
+  }
+  if (sampleFile == 0L) {
+    if ((tagfseq == 0L) || (tagfqlt == 0L)) err++;
+    if ((tagfseq != 0L) && (tagfqlt == 0L)) err++;
+    if ((tagfseq == 0L) && (tagfqlt != 0L)) err++;
+  }
+  if ((err) || (prefix == 0L)) {
+    fprintf(stderr, "usage: %s -tagout prefix -tags fileUID xx.csfasta xx.qual\n", argv[0]);
+    fprintf(stderr, "usage: %s -tagout prefix -ftags fileUID ff.csfasta ff.qual -rtags fileUID rr.csfasta rr.qual\n", argv[0]);
+    fprintf(stderr, "usage: %s -dump file.tapperTags\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "unmated tags will be placed in 'prefix.frag.tapperTags'\n");
+    fprintf(stderr, " mated tags will be placed in 'prefix.mate.tapperTags'\n");
+    exit(1);
+  }
+
+  uint64 numTagsF = 0, maxTagsF = 0;
+  uint64 numTagsR = 0, maxTagsR = 0;
+  uint64 numTagsM = 0;
+
+  tapperTag *TF = 0L;
+  tapperTag *TR = 0L;
+
+  //  If given a sampleFile, generate some tags from there.
+  if (sampleFile) {
+    seqCache *F = new seqCache(sampleFile);
+    seqInCore *s = F->getSequenceInCore();
+
+    uint32 pos = 0;
+    uint32 len = s->sequenceLength();
+
+    uint16 id[4];
+    char cor[64] = {0};
+    char seq[64] = {0};
+    uint64 qlt[64] = {0};
+
+    char acgt[4] = {'A', 'C', 'G', 'T'};
+
+    //  cor[] and seq[] hold the reference base, sampleTagSize calls
+    //  and a terminator.  Error positions are drawn modulo
+    //  sampleTagSize-1, which must not be zero.
+    if ((sampleTagSize < 2) || (sampleTagSize > 62))
+      fprintf(stderr, "%s: -sampletagsize must be between 2 and 62.\n", argv[0]), exit(1);
+
+    //  Each error appends 8 characters to errors[256].
+    if (sampleErrors > 31)
+      fprintf(stderr, "%s: -sampleerrors must be at most 31.\n", argv[0]), exit(1);
+
+    //  Positions are drawn modulo (len - sampleTagSize).
+    if (len <= sampleTagSize)
+      fprintf(stderr, "%s: sequence in '%s' is too short to sample from.\n", argv[0], sampleFile), exit(1);
+
+    mt_s *mtctx = mtInit(time(0));
+
+    maxTagsF = sampleSize;
+    TF = new tapperTag [maxTagsF];
+
+    maxTagsR = sampleSize;
+    TR = new tapperTag [maxTagsR];
+
+    for (uint32 i=0; i<sampleSize; i++) {
+      pos = mtRandom32(mtctx) % (len - sampleTagSize);
+
+      char n = acgt[mtRandom32(mtctx) % 4];
+      char l = n;
+
+      cor[0] = n;
+      seq[0] = n;
+
+      bool doForward = (mtRandom32(mtctx) & 0x1000) == 0x1000;
+      //doForward = false;
+
+      if (doForward) {
+        uint32 sp = pos;
+        for (uint32 x=1; x<=sampleTagSize; x++) {
+          n = s->sequence()[sp++];
+          cor[x] = n;
+          seq[x] = baseToColor[l][n];
+          l = n;
+        }
+      } else {
+        uint32 sp = pos + sampleTagSize - 1;
+        for (uint32 x=1; x<=sampleTagSize; x++) {
+          n = complementSymbol[s->sequence()[sp--]];
+          cor[x] = n;
+          seq[x] = baseToColor[l][n];
+          l = n;
+        }
+      }
+
+      //  Insert errors.
+
+      char errors[256] = {0};
+      char errort[256] = {0};
+      uint32 nerrs = mtRandom32(mtctx) % (sampleErrors + 1);
+
+      for (uint32 xx=0; xx<nerrs; xx++) {
+        uint32 e = mtRandom32(mtctx) % (sampleTagSize-1) + 1;
+        char o = seq[e];
+        seq[e] = seq[e] + 1;
+        if (seq[e] > '3')
+          seq[e] = '0';
+        sprintf(errort, "\t%c->%c@%02d", o, seq[e], e);
+        strcat(errors, errort);
+      }
+
+      id[0] = i;
+      id[1] = 0;
+      id[2] = 0;
+      id[3] = 0;
+
+      fprintf(stdout, "F\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t0\t"uint32FMT"\t%c\t%s%s\t%s\n",
+              id[0], id[1], id[2], id[3],
+              pos,
+              (doForward) ? 'f' : 'r',
+              cor+1,
+              errors,
+              seq);
+
+      //  TF is NOT just storing the 'forward' reads, it's all the
+      //  reads from the first half of the mate.  Since we're not
+      //  mated, this is just all reads.
+
+      TF[numTagsF++].encode(id, seq, qlt);
+    }
+  }
+
+  //
+  //  Suck in all the F tags.
+  //
+  if (tagfseq) {
+    FILE *fseq = fopen(tagfseq, "r");
+    FILE *fqlt = fopen(tagfqlt, "r");
+
+    //  readTag() would dereference a NULL stream.
+    if (fseq == 0L)
+      fprintf(stderr, "%s: failed to open '%s' for reading.\n", argv[0], tagfseq), exit(1);
+    if (fqlt == 0L)
+      fprintf(stderr, "%s: failed to open '%s' for reading.\n", argv[0], tagfqlt), exit(1);
+
+    speedCounter *CT = new speedCounter(" reading F tags %7.0f sequences -- %5.0f sequences/second\r", 1.0, 0x1ffff, true);
+
+    //  Estimate the number of tags from the file size.
+    maxTagsF = sizeOfFile(tagfseq) / 44 + 1000000;
+    TF = new tapperTag [maxTagsF];
+
+    while (readTag(tagfuid, fseq, fqlt, TF + numTagsF)) {
+      numTagsF++;
+      if (numTagsF >= maxTagsF)
+        fprintf(stderr, "Too many F tags. Boom.\n"), exit(1);
+      CT->tick();
+    }
+    delete CT;
+
+    fclose(fseq);
+    fclose(fqlt);
+  }
+
+  //
+  //  Suck in all the R tags.
+  //
+  if (tagrseq) {
+    FILE *rseq = fopen(tagrseq, "r");
+    FILE *rqlt = fopen(tagrqlt, "r");
+
+    if (rseq == 0L)
+      fprintf(stderr, "%s: failed to open '%s' for reading.\n", argv[0], tagrseq), exit(1);
+    if (rqlt == 0L)
+      fprintf(stderr, "%s: failed to open '%s' for reading.\n", argv[0], tagrqlt), exit(1);
+
+    speedCounter *CT = new speedCounter(" reading R tags %7.0f sequences -- %5.0f sequences/second\r", 1.0, 0x1ffff, true);
+
+    maxTagsR = sizeOfFile(tagrseq) / 44 + 1000000;
+    TR = new tapperTag [maxTagsR];
+
+    while (readTag(tagruid, rseq, rqlt, TR + numTagsR)) {
+      numTagsR++;
+      if (numTagsR >= maxTagsR)
+        fprintf(stderr, "Too many R tags. Boom.\n"), exit(1);
+      CT->tick();
+    }
+    delete CT;
+
+    fclose(rseq);
+    fclose(rqlt);
+  }
+
+  //  From here on, max is the number of tags loaded and num is the
+  //  merge cursor.
+
+  maxTagsF = numTagsF;
+  numTagsF = 0;
+
+  maxTagsR = numTagsR;
+  numTagsR = 0;
+
+  //
+  //  Sort them.
+  //
+  qsort_mt(TF, maxTagsF, sizeof(tapperTag), tapperTagCompare, 4, 4 * 1024 * 1024);
+  qsort_mt(TR, maxTagsR, sizeof(tapperTag), tapperTagCompare, 4, 4 * 1024 * 1024);
+
+  //
+  //  Merge to find pairs, output.
+  //
+  char fragout[FILENAME_MAX];
+  char mateout[FILENAME_MAX];
+
+  sprintf(fragout, "%s.frag.tapperTags", prefix);
+  sprintf(mateout, "%s.mate.tapperTags", prefix);
+
+  //  Output files are created lazily, on the first tag written.
+  tapperTagFile *TOfrag = 0L;
+  tapperTagFile *TOmate = 0L;
+
+  speedCounter *CF = new speedCounter(" writing frag tags %7.0f sequences -- %5.0f sequences/second\r", 1.0, 0x1ffff, true);
+  speedCounter *CM = new speedCounter(" writing mate tags %7.0f sequences -- %5.0f sequences/second\r", 1.0, 0x1ffff, true);
+
+  while ((numTagsF < maxTagsF) && (numTagsR < maxTagsR)) {
+    //  Only the low 48 bits of the ID are compared when mating.
+    uint64 fID = TF[numTagsF].tagID() & uint64MASK(48);
+    uint64 rID = TR[numTagsR].tagID() & uint64MASK(48);
+
+    if (fID == rID) {
+      if (TOmate == 0L)
+        TOmate = new tapperTagFile(mateout, 'w');
+      TOmate->put(TF + numTagsF, TR + numTagsR);
+      numTagsF++;
+      numTagsR++;
+      numTagsM++;
+      CM->tick();
+    } else if (fID < rID) {
+      if (TOfrag == 0L)
+        TOfrag = new tapperTagFile(fragout, 'w');
+      TOfrag->put(TF + numTagsF);
+      numTagsF++;
+      CF->tick();
+    } else {
+      if (TOfrag == 0L)
+        TOfrag = new tapperTagFile(fragout, 'w');
+      TOfrag->put(TR + numTagsR);
+      numTagsR++;
+      CF->tick();
+    }
+  }
+  while (numTagsF < maxTagsF) {
+    if (TOfrag == 0L)
+      TOfrag = new tapperTagFile(fragout, 'w');
+    TOfrag->put(TF + numTagsF);
+    numTagsF++;
+    CF->tick();
+  }
+  while (numTagsR < maxTagsR) {
+    if (TOfrag == 0L)
+      TOfrag = new tapperTagFile(fragout, 'w');
+    TOfrag->put(TR + numTagsR);
+    numTagsR++;
+    CF->tick();
+  }
+
+  delete CF;
+  delete CM;
+
+  if (TOmate)
+    TOmate->metaData()->setMeanStdDev(mean, stddev);
+
+  delete TOmate;
+  delete TOfrag;
+
+  delete [] TR;
+  delete [] TF;
+
+  return(0);
+}
diff --git a/tapper/tapper.C b/tapper/tapper.C
new file mode 100644
index 0000000..ca5b44a
--- /dev/null
+++ b/tapper/tapper.C
@@ -0,0 +1,1157 @@
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+#include "tapperGlobalData.H"
+#include "tapperThreadData.H"
+#include "tapperComputation.H"
+
+#undef VERBOSEWORKER
+
+// Very expensive. Compare the obvious O(n^2) happy mate finding
+// algorithm against the O(n) algorithm.
+//
+#undef DEBUG_MATES
+
+
+// Reader callback: fetch the next tag (or mate pair of tags) from the
+// global tag file and wrap it in a new tapperComputation.  Returns
+// null when the tag file yields nothing more.
+//
+void*
+tapperReader(void *G) {
+  tapperGlobalData *global = (tapperGlobalData *)G;
+  tapperTag first, second;
+
+  if (global->TF->metaData()->isPairedTagFile()) {
+    if (global->TF->get(&first, &second))
+      return(new tapperComputation(&first, &second));
+    return(0L);
+  }
+
+  if (global->TF->get(&first))
+    return(new tapperComputation(&first, 0L));
+
+  return(0L);
+}
+
+
+
+// Writer callback: build the result index for a finished computation,
+// write it together with all of its result lists, then delete the
+// computation.
+//
+void
+tapperWriter(void *G, void *S) {
+  tapperGlobalData *g = (tapperGlobalData *)G;
+  tapperComputation *s = (tapperComputation *)S;
+  tapperResultIndex result;
+
+  //  Identity of the tags and the search parameters used.
+
+  result._tag1id = s->tag1id;
+  result._tag2id = s->tag2id;
+
+  result._maxColrMismatchMapped = g->maxColorError;
+  result._maxBaseMismatchMapped = g->maxBaseError;
+
+  result._mean = g->TF->metaData()->mean();
+  result._stddev = g->TF->metaData()->stddev();
+
+  //  Fragment hits above the repeat threshold are all counted as
+  //  discarded instead of as fragments.
+
+  bool overThreshold = (s->resultFragmentLen > g->repeatThreshold);
+
+  if (overThreshold == false) {
+    result._numFrag = s->resultFragmentLen;
+    result._numFragDiscarded = 0;
+  } else {
+    result._numFrag = 0;
+    result._numFragDiscarded = s->resultFragmentLen;
+  }
+
+  //  Sizes of the remaining result lists.
+
+  result._numFragSingleton = s->resultSingletonLen;
+  result._numFragTangled = s->resultTangledAlignmentLen;
+  result._numMated = s->resultMatedLen;
+  result._numTangled = s->resultTangledLen;
+
+  result._pad1 = 0;
+  result._pad2 = 0;
+
+  //  Emit the index and the lists, then release the computation.
+
+  g->TA->write(&result,
+               s->resultFragment,
+               s->resultSingleton,
+               s->resultTangledAlignment,
+               s->resultMated,
+               s->resultTangled,
+               s->alignQualHistogram);
+
+  delete s;
+}
+
+
+
+
+// Fold colors[beg..end-1] through the baseToColor table.  The
+// accumulator is seeded with colors[beg], and the fold itself also
+// starts at index beg.
+//
+inline
+char
+composeColors(char *colors, uint32 beg, uint32 end) {
+  char composed = colors[beg];
+  uint32 idx = beg;
+
+  while (idx < end) {
+    composed = baseToColor[composed][colors[idx]];
+    idx++;
+  }
+
+  return(composed);
+}
+
+
+
+// Returns true if the reference and the tag compose to the same
+// color over positions i up to (not including) j.  The distance
+// between errors is not examined here; callers test that separately.
+//
+inline
+bool
+isConsistent(char *ref, char *tag,
+             uint32 i, uint32 j) {
+  char refComposed = composeColors(ref, i, j);
+  char tagComposed = composeColors(tag, i, j);
+
+  return(refComposed == tagComposed);
+}
+
+
+
+// Analyze tag[] and ref[], correct differences, call base changes.
+// Return an ACGT sequence for the tag.
+//
+// Compose the colors together. At points where the compositions
+// disagree, the base at that point is different. The composition
+// tells us how to transform the reference letter to the base at
+// this position, in one step.
+//
+// If our final composed value is different, then either we end on
+// a SNP, or we have an error somewhere. The choice here is
+// arbitrary, and made depending on where that error is.
+//
+// Returns false if the number of color differences exceeds
+// g->maxColorError, or if the number of base differences exceeds
+// g->maxBaseError. Returns true otherwise, after storing the color
+// differences in _tagColorDiffs (confirmed mismatches first, then
+// the ones treated as errors).
+//
+bool
+tapperHit::alignToReference(tapperGlobalData *g,
+ uint32 so_in,
+ uint32 po_in,
+ char *tag_in, uint32 len_in) {
+
+ // This function is NOT a bottleneck. Don't bother optimizing.
+
+ // so_in and po_in are the sequence iid and position in that
+ // sequence where the tag maps.
+ //
+ // tag_in is the full tag with reference base at start or end,
+ // either T010203010331 or 01031031033G. len_in is the length of
+ // the COLOR CALLS in tag_in, NOT the strlen of it.
+ //
+
+ uint32 errs = 0; // number of errors
+ uint32 errp[TAG_LEN_MAX]; // location of the errors
+ uint32 errc[TAG_LEN_MAX]; // status of confirmed or error
+
+ char _tagCOREC[TAG_LEN_MAX]; // For holding corrected color calls, only to generate ACGT align
+
+ _seqIdx = so_in;
+ _seqPos = po_in;
+ _tagIdx = 0;
+
+ // _rev: Yeah, we assume ASCII and UNIX newlines all over the
+ // place. A forward read starts with a reference base; reverse
+ // reads have a number here.
+ //
+ // _len -- the length of the tag + reference base.
+ // -- number of color calls / ACGT + 1.
+ //
+ _pad = 0;
+ _len = len_in + 1;
+ _rev = (tag_in[0] < 'A') ? true : false;
+
+ _basesMismatch = len_in; // Set at end
+
+ _colorMismatch = 0; // Set when parsing errors
+ _colorInconsistent = 0; // Set when parsing errors
+
+ _tagCOLOR[0] = 0;
+ _tagCOREC[0] = 0;
+ _refCOLOR[0] = 0;
+
+ _tagACGT[0] = 0;
+ _refACGT[0] = 0;
+
+
+ // Copy the tag.
+ //
+ // A bit of devilish trickery to make a reverse read look like a
+ // forward read - we locally reverse the reference and read,
+ // process as if the reverse read is a forward read, then clean up
+ // at the end. See tapperComputation.H for what is in the reverse
+ // tag.
+ //
+ {
+ if (_rev) {
+ for (uint32 i=0, j=_len-1; i<_len; i++, j--)
+ _tagCOLOR[i] = _tagCOREC[i] = tag_in[j];
+ _tagCOLOR[0] = _tagCOREC[0] = complementSymbol[_tagCOLOR[0]];
+ } else {
+ for (uint32 i=0; i<_len; i++)
+ _tagCOLOR[i] = _tagCOREC[i] = tag_in[i];
+ }
+ _tagCOLOR[_len] = 0;
+ _tagCOREC[_len] = 0;
+ }
+
+
+ // Copy the reference and convert the genomic sequence to
+ // color space using the reference base of the read.
+ //
+ {
+ char *seq = g->GS->getSequenceInCore(so_in)->sequence();
+
+ // strncpy does not terminate when the source is at least _len-1
+ // long, so the terminator is written explicitly.
+ strncpy(_refACGT, seq + po_in, _len-1);
+ _refACGT[_len-1] = 0;
+
+ if (_rev)
+ reverseComplementSequence(_refACGT, _len-1);
+
+ _refCOLOR[0] = _tagCOLOR[0]; // ALWAYS the reference encoding base, as long as we copy the tag first.
+ _refCOLOR[1] = baseToColor[_refCOLOR[0]][_refACGT[0]];
+
+ for (uint32 ti=2; ti<_len; ti++)
+ _refCOLOR[ti] = baseToColor[_refACGT[ti-2]][_refACGT[ti-1]];
+
+ _refCOLOR[_len] = 0;
+ }
+
+ //fprintf(stderr, "tag: %s %s ref: %s %s\n", tag_in, _tagCOLOR, _refCOLOR, _refACGT);
+
+ // Count the number of color space errors
+ //
+ // Note that errp[] is actually 1-based; the first position is
+ // never an error; it's the reference base.
+
+ for (uint32 ti=1; ti<_len; ti++) {
+ if (_tagCOLOR[ti] != _refCOLOR[ti]) {
+ errp[errs] = ti;
+ errc[errs] = 0;
+ errs++;
+ }
+ }
+
+ //
+ // The following if blocks correct single color errors using very
+ // complicated rules.
+ //
+ // In every block, errc[x] == 1 marks a difference that is kept as a
+ // confirmed mismatch; differences left at errc[x] == 0 are, for up
+ // to four differences, overwritten in _tagCOREC with the reference
+ // color.
+ //
+
+
+ if (errs == 0) {
+ _colorMismatch = 0;
+ _colorInconsistent = 0;
+
+ } else if (errs == 1) {
+ // Always corrected, just to get an ACGT alignment. We can't
+ // tell if the color mismatch is an error, or if the error is
+ // adjacent to the mismatch, which would have resulted in a valid
+ // SNP.
+ _colorMismatch = 0;
+ _colorInconsistent = 1;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+
+ } else if (errs == 2) {
+ bool ok21 = isConsistent(_refCOLOR, _tagCOLOR, 1, _len) && (errp[1] - errp[0] < 4);
+
+ if (ok21) {
+ // MNP of size 4.
+ _colorMismatch = 2;
+ _colorInconsistent = 0;
+ errc[0] = 1;
+ errc[1] = 1;
+ } else {
+ // Correct 'em.
+ _colorMismatch = 0;
+ _colorInconsistent = 2;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ _tagCOREC[errp[1]] = _refCOLOR[errp[1]];
+ }
+
+ } else if (errs == 3) {
+ // okNM: a run of N differences starting at the M'th difference is
+ // consistent and close enough together.
+ bool ok21 = isConsistent(_refCOLOR, _tagCOLOR, 1, errp[2]) && (errp[1] - errp[0] < 4);
+ bool ok22 = isConsistent(_refCOLOR, _tagCOLOR, errp[0]+1, _len) && (errp[2] - errp[1] < 4);
+
+ bool ok31 = isConsistent(_refCOLOR, _tagCOLOR, 1, _len) && (errp[2] - errp[0] < 5);
+
+ if (ok31) {
+ // MNP of size 5
+ _colorMismatch = 3;
+ _colorInconsistent = 0;
+ errc[0] = 1;
+ errc[1] = 1;
+ errc[2] = 1;
+ } else if (ok21) {
+ // First two ok, fix the third.
+ _colorMismatch = 2;
+ _colorInconsistent = 1;
+ _tagCOREC[errp[2]] = _refCOLOR[errp[2]];
+ errc[0] = 1;
+ errc[1] = 1;
+ } else if (ok22) {
+ // Last two ok, fix the first.
+ _colorMismatch = 2;
+ _colorInconsistent = 1;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ errc[1] = 1;
+ errc[2] = 1;
+ } else {
+ // Nothing consistent, fix all of 'em.
+ _colorMismatch = 0;
+ _colorInconsistent = 3;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ _tagCOREC[errp[1]] = _refCOLOR[errp[1]];
+ _tagCOREC[errp[2]] = _refCOLOR[errp[2]];
+ }
+
+ } else if (errs == 4) {
+ bool ok21 = isConsistent(_refCOLOR, _tagCOLOR, 1, errp[2]) && (errp[1] - errp[0] < 4);
+ bool ok22 = isConsistent(_refCOLOR, _tagCOLOR, errp[0]+1, errp[2]) && (errp[2] - errp[1] < 4);
+ bool ok23 = isConsistent(_refCOLOR, _tagCOLOR, errp[1]+1, _len) && (errp[3] - errp[2] < 4);
+
+ bool ok31 = isConsistent(_refCOLOR, _tagCOLOR, 1, errp[3]) && (errp[2] - errp[0] < 5);
+ bool ok32 = isConsistent(_refCOLOR, _tagCOLOR, errp[0]+1, _len) && (errp[3] - errp[1] < 5);
+
+ bool ok41 = isConsistent(_refCOLOR, _tagCOLOR, 1, _len) && (errp[3] - errp[0] < 6);
+
+ // With two exceptions, exactly one of the ok's will be true.
+ // The exceptions are:
+ //
+ // a) ok21 and ok23 will imply ok41. However there is nothing to
+ // correct here. We just need to make sure that we stop
+ // processing rules on ok41.
+ //
+ // b) ok41 and ok22. Not sure if this can ever happen, but like
+ // case a, we're ok if we stop after ok41.
+ //
+
+ if (ok41) {
+ // MNP of size 6
+ _colorMismatch = 4;
+ _colorInconsistent = 0;
+ errc[0] = 1;
+ errc[1] = 1;
+ errc[2] = 1;
+ errc[3] = 1;
+ } else if (ok31) {
+ // First three ok, fix the last one.
+ _colorMismatch = 3;
+ _colorInconsistent = 1;
+ _tagCOREC[errp[3]] = _refCOLOR[errp[3]];
+ errc[0] = 1;
+ errc[1] = 1;
+ errc[2] = 1;
+ } else if (ok32) {
+ // Last three ok, fix the first one.
+ _colorMismatch = 3;
+ _colorInconsistent = 1;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ errc[1] = 1;
+ errc[2] = 1;
+ errc[3] = 1;
+ } else if (ok21) {
+ // First two ok, fix the last two.
+ _colorMismatch = 2;
+ _colorInconsistent = 2;
+ _tagCOREC[errp[2]] = _refCOLOR[errp[2]];
+ _tagCOREC[errp[3]] = _refCOLOR[errp[3]];
+ errc[0] = 1;
+ errc[1] = 1;
+ } else if (ok22) {
+ // Middle two ok, fix the outties.
+ _colorMismatch = 2;
+ _colorInconsistent = 2;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ _tagCOREC[errp[3]] = _refCOLOR[errp[3]];
+ errc[1] = 1;
+ errc[2] = 1;
+ } else if (ok23) {
+ // Last two ok, fix the first two.
+ _colorMismatch = 2;
+ _colorInconsistent = 2;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ _tagCOREC[errp[1]] = _refCOLOR[errp[1]];
+ errc[2] = 1;
+ errc[3] = 1;
+ } else {
+ // Nothing consistent, fix all of 'em.
+ _colorMismatch = 0;
+ _colorInconsistent = 4;
+ _tagCOREC[errp[0]] = _refCOLOR[errp[0]];
+ _tagCOREC[errp[1]] = _refCOLOR[errp[1]];
+ _tagCOREC[errp[2]] = _refCOLOR[errp[2]];
+ _tagCOREC[errp[3]] = _refCOLOR[errp[3]];
+ }
+ } else if (errs == 5) {
+ // Five or more differences: all are counted as inconsistent, and
+ // _tagCOREC is left uncorrected.
+ //fprintf(stderr, "Five errors detected. Code doesn't know what to do.\n");
+ _colorMismatch = 0;
+ _colorInconsistent = 5;
+ } else if (errs == 6) {
+ //fprintf(stderr, "Six errors detected. Code doesn't know what to do.\n");
+ _colorMismatch = 0;
+ _colorInconsistent = 6;
+ } else {
+ //fprintf(stderr, "Wow, you got a lot of errors. Code doesn't know what to do.\n");
+ _colorMismatch = 0;
+ _colorInconsistent = errs;
+ }
+
+ // Too many errors already? Fail.
+ //
+ if (_colorMismatch + _colorInconsistent > g->maxColorError)
+ return(false);
+
+ // Compute alignments of corrected color strings.
+ //
+ // The last loop iteration (ti == _len-1) reads index _len of the
+ // color arrays, which holds the terminator written earlier; the
+ // value it produces is then overwritten by the terminators below.
+
+ _basesMismatch = 0;
+
+ _tagACGT[0] = baseToColor[_tagCOREC[0]][_tagCOREC[1]];
+ _refACGT[0] = baseToColor[_refCOLOR[0]][_refCOLOR[1]];
+ for (uint32 ti=1; ti<_len; ti++) {
+ _tagACGT[ti] = baseToColor[_tagACGT[ti-1]][_tagCOREC[ti+1]];
+ _refACGT[ti] = baseToColor[_refACGT[ti-1]][_refCOLOR[ti+1]];
+ }
+ _tagACGT[_len-1] = 0;
+ _refACGT[_len-1] = 0;
+
+ // Count differing bases, passing both letters at each differing
+ // position through the toUpper table.
+ for (uint32 ti=0; ti<_len-1; ti++) {
+ if (_tagACGT[ti] != _refACGT[ti]) {
+ _basesMismatch++;
+
+ _tagACGT[ti] = toUpper[_tagACGT[ti]];
+ _refACGT[ti] = toUpper[_refACGT[ti]];
+ }
+ }
+
+ if (_rev) {
+ // Undo the tag and ref reversals.
+ _tagCOLOR[0] = complementSymbol[_tagCOLOR[0]];
+ reverseString(_tagCOLOR, _len);
+
+ _tagCOREC[0] = complementSymbol[_tagCOREC[0]];
+ reverseString(_tagCOREC, _len);
+
+ _refCOLOR[0] = complementSymbol[_refCOLOR[0]];
+ reverseString(_refCOLOR, _len);
+
+ // Reverse complement the alignments
+
+ reverseComplementSequence(_tagACGT, _len-1);
+ reverseComplementSequence(_refACGT, _len-1);
+
+ // Adjust the error positions...once we start caring about positions.
+
+ for (uint32 x=0; x<errs; x++)
+ errp[x] = _len - errp[x];
+ }
+
+ // Too much ACGT difference? Fail.
+ //
+ if (_basesMismatch > g->maxBaseError)
+ return(false);
+
+ //fprintf(stderr, "tag: %s %s ref: %s %s "uint32FMT" "uint32FMT" "uint32FMT"\n",
+ // tag_in, _tagCOLOR, _refCOLOR, _refACGT, _basesMismatch, _colorMismatch, _colorInconsistent);
+
+ // Stuff the errors into the hit.
+ //
+ // Each entry packs the bit encoding of the tag's color, shifted
+ // left by 6, with the position in the low bits. Confirmed
+ // mismatches are stored first, followed by the remaining
+ // differences.
+
+ uint32 nn = 0;
+
+ for (uint32 x=0; x<errs; x++)
+ if (errc[x] == 1)
+ _tagColorDiffs[nn++] = (letterToBits[ _tagCOLOR[ errp[x] ] ] << 6) | errp[x];
+
+ assert(nn == _colorMismatch);
+
+ for (uint32 x=0; x<errs; x++)
+ if (errc[x] == 0)
+ _tagColorDiffs[nn++] = (letterToBits[ _tagCOLOR[ errp[x] ] ] << 6) | errp[x];
+
+ assert(nn == _colorMismatch + _colorInconsistent);
+
+ return(true);
+}
+
+
+
+// Convert each chained position in posn[] to a (sequence IID,
+// position within that sequence) pair, align the selected tag to the
+// reference there, and add every alignment that succeeds to the
+// computation's hit list.
+//
+// rev and tag1 pick which of the four tag strings (tag1/tag2,
+// forward/reverse) is aligned.
+//
+inline
+void
+tapperWorker_addHits(uint64 *posn, uint64 posnLen,
+                     tapperGlobalData *g,
+                     tapperComputation *s,
+                     bool rev,
+                     bool tag1) {
+  tapperHit hit;
+  char *tagseq = 0L;
+  uint32 taglen = 0;
+
+  if (tag1) {
+    taglen = s->tag1size;
+    tagseq = s->tag1fseq;
+    if (rev)
+      tagseq = s->tag1rseq;
+  } else {
+    taglen = s->tag2size;
+    tagseq = s->tag2fseq;
+    if (rev)
+      tagseq = s->tag2rseq;
+  }
+
+  for (uint32 idx=0; idx<posnLen; idx++) {
+    uint64 where = posn[idx];
+    uint64 which = g->SS->sequenceNumberOfPosition(where);
+
+    where -= g->SS->startOf(which);
+    which = g->SS->IIDOf(which);
+
+    //  Search ignores first letter, align needs it.  This makes for a
+    //  very special case, 0, which isn't a full match.
+
+    if (where == 0)
+      continue;
+
+    where--;
+
+    if (hit.alignToReference(g, which, where, tagseq, taglen) == true)
+      s->addHit(g, hit, tag1);
+  }
+}
+
+
+void
+tapperWorker(void *G, void *T, void *S) {
+ tapperGlobalData *g = (tapperGlobalData *)G;
+ tapperThreadData *t = (tapperThreadData *)T;
+ tapperComputation *s = (tapperComputation *)S;
+
+ //
+ // Get the hits.
+ //
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, "GET HITS %s %s.\n", s->tag1fseq, s->tag2fseq);
+#endif
+
+ t->posn1fLen = t->posn1rLen = t->posn2fLen = t->posn2rLen = 0;
+
+ if (s->tag1size > 0) {
+ g->PS->getUpToNMismatches(s->tag1f, g->maxColorError, t->posn1f, t->posn1fMax, t->posn1fLen);
+ g->PS->getUpToNMismatches(s->tag1r, g->maxColorError, t->posn1r, t->posn1rMax, t->posn1rLen);
+ }
+
+ if (s->tag2size > 0) {
+ g->PS->getUpToNMismatches(s->tag2f, g->maxColorError, t->posn2f, t->posn2fMax, t->posn2fLen);
+ g->PS->getUpToNMismatches(s->tag2r, g->maxColorError, t->posn2r, t->posn2rMax, t->posn2rLen);
+ }
+
+ // Quit if nothing there.
+
+ if (t->posn1fLen + t->posn1rLen + t->posn2fLen + t->posn2rLen == 0)
+ return;
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, " raw hits: "uint64FMT" "uint64FMT" "uint64FMT" "uint64FMT"\n",
+ t->posn1fLen, t->posn1rLen, t->posn2fLen, t->posn2rLen);
+#endif
+
+ //
+ // Align to reference to get rid of the 3/4 false hits.
+ //
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, "ALIGN TO REFERENCE.\n");
+#endif
+
+ tapperWorker_addHits(t->posn1f, t->posn1fLen, g, s, false, true);
+ tapperWorker_addHits(t->posn1r, t->posn1rLen, g, s, true, true);
+
+ tapperWorker_addHits(t->posn2f, t->posn2fLen, g, s, false, false);
+ tapperWorker_addHits(t->posn2r, t->posn2rLen, g, s, true, false);
+
+ // Quit if nothing there.
+
+ if (s->tag1hitsLen + s->tag2hitsLen == 0)
+ return;
+
+ //
+ // If mated, tease out any valid mate relationships and build the
+ // results. If fragment, just build.
+ //
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, "REPORT.\n");
+#endif
+
+ // OUTPUT CASE 1 - nothing.
+ if ((s->tag1size == 0) && (s->tag2size == 0)) {
+ assert(0);
+
+ // OUTPUT CASE 2 - unmated fragments
+ } else if ((s->tag1size > 0) && (s->tag2size == 0)) {
+ s->resultFragment = new tapperResultFragment [s->tag1hitsLen];
+ s->resultFragmentLen = s->tag1hitsLen;
+
+ memset(s->resultFragment, 0, sizeof(tapperResultFragment) * s->tag1hitsLen);
+
+ for (uint32 i=0; i<s->tag1hitsLen; i++) {
+ s->resultFragment[i]._seq = s->tag1hits[i]._seqIdx;
+ s->resultFragment[i]._pos = s->tag1hits[i]._seqPos;
+
+ s->resultFragment[i]._qual._tag1valid = 1;
+ s->resultFragment[i]._qual._tag1basesMismatch = s->tag1hits[i]._basesMismatch;
+ s->resultFragment[i]._qual._tag1colorMismatch = s->tag1hits[i]._colorMismatch;
+ s->resultFragment[i]._qual._tag1colorInconsistent = s->tag1hits[i]._colorInconsistent;
+ s->resultFragment[i]._qual._tag1rev = s->tag1hits[i]._rev;
+
+ s->resultFragment[i]._qual._diffSize = MAX_COLOR_MISMATCH_MAPPED;
+
+ memcpy(s->resultFragment[i]._qual._tag1colorDiffs,
+ s->tag1hits[i]._tagColorDiffs,
+ sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+ }
+
+ // OUTPUT CASE 3 - unmated fragments (but wrong set, should always be in tag1)
+ } else if ((s->tag1size == 0) && (s->tag2size > 0)) {
+ assert(0);
+
+ // OUTPUT CASE 4 - mated fragments
+ } else if ((s->tag1size > 0) && (s->tag2size > 0)) {
+ if (t->tangle == 0L)
+ t->tangle = new intervalList<uint64> [g->GS->getNumberOfSequences()];
+
+ if ((t->numHappiesMax < s->tag1hitsLen) || (t->numHappiesMax < s->tag2hitsLen)) {
+ delete [] t->tag1happies;
+ delete [] t->tag1mate;
+ delete [] t->tag1tangled;
+
+ delete [] t->tag2happies;
+ delete [] t->tag2mate;
+ delete [] t->tag2tangled;
+
+ t->numHappiesMax = MAX(s->tag1hitsLen, s->tag2hitsLen) + 16 * 1024;
+
+ fprintf(stderr, "Reallocate t->numHappiesMax to "uint32FMT"\n", t->numHappiesMax);
+
+ t->tag1happies = new uint32 [t->numHappiesMax];
+ t->tag1mate = new uint32 [t->numHappiesMax];
+ t->tag1tangled = new uint32 [t->numHappiesMax];
+
+ t->tag2happies = new uint32 [t->numHappiesMax];
+ t->tag2mate = new uint32 [t->numHappiesMax];
+ t->tag2tangled = new uint32 [t->numHappiesMax];
+ }
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, " Found "uint32FMT" and "uint32FMT" hits.\n", s->tag1hitsLen, s->tag2hitsLen);
+#endif
+
+ // Sort by position.
+ s->sortHitsByPosition();
+
+ uint32 mean = g->TF->metaData()->mean();
+ uint32 stddev = g->TF->metaData()->stddev();
+
+ tapperHit *t1h = s->tag1hits;
+ tapperHit *t2h = s->tag2hits;
+
+ // Pass zero, clear. Tangles are cleared below.
+ //
+ memset(t->tag1happies, 0, sizeof(uint32) * s->tag1hitsLen);
+ memset(t->tag1tangled, 0, sizeof(uint32) * s->tag1hitsLen);
+ memset(t->tag2happies, 0, sizeof(uint32) * s->tag2hitsLen);
+ memset(t->tag2tangled, 0, sizeof(uint32) * s->tag2hitsLen);
+
+ // Pass one. Count the number of times each fragment is in a
+ // happy relationship.
+ //
+ {
+#ifdef DEBUG_MATES
+ uint32 debug_numHappies = 0;
+ uint64 debug_happyCheck = 0;
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ for (uint32 b=0; b<s->tag2hitsLen; b++) {
+ if (t1h[a].happy(t2h[b], mean, stddev)) {
+ debug_numHappies += 1;
+ debug_happyCheck += t1h[a]._seqPos ^ t2h[b]._seqPos;
+ }
+ }
+ }
+#endif
+
+ uint32 bbaserev = 0;
+ uint32 bbasefor = 0;
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+
+ // Both lists of hits are sorted by position. For each tag1 (a)
+ // hit, we first advance the bbase to the first hit that is
+ // within the proper distance before the a tag. Then scan forward
+ // until the b tag is too far away to be mated.
+
+ uint32 b = 0;
+
+ if (t1h[a]._rev == true) {
+ while ((bbaserev < s->tag2hitsLen) && (t1h[a].mateTooFarBefore(t2h[bbaserev], mean, stddev)))
+ bbaserev++;
+ b = bbaserev;
+ } else {
+ while ((bbasefor < s->tag2hitsLen) && (t1h[a].mateTooFarBefore(t2h[bbasefor], mean, stddev)))
+ bbasefor++;
+ b = bbasefor;
+ }
+
+ // Now, until the b read is too far away to be mated, check
+ // for happiness and do stuff.
+
+ for (; (b<s->tag2hitsLen) && (t1h[a].mateTooFarAfter(t2h[b], mean, stddev) == false); b++) {
+ if (t1h[a].happy(t2h[b], mean, stddev)) {
+
+#ifdef DEBUG_MATES
+ debug_numHappies -= 1;
+ debug_happyCheck -= t1h[a]._seqPos ^ t2h[b]._seqPos;
+#endif
+
+ // Count.
+ t->tag1happies[a]++;
+ t->tag2happies[b]++;
+
+ // Add the previous mate pair if we just became tangled.
+ // It is possible for both to be == 2, but in that case,
+ // we've already added the previous mate pair.
+ if ((t->tag1happies[a] == 2) && (t->tag2happies[b] == 1)) {
+ uint32 c = t->tag1mate[a];
+ uint32 mn = MIN(t1h[a]._seqPos, t2h[c]._seqPos);
+ uint32 mx = MAX(t1h[a]._seqPos + s->tag1size, t2h[c]._seqPos + s->tag2size);
+
+ t->tangle[t1h[a]._seqIdx].add(mn, mx-mn);
+ t->tag1tangled[a]++;
+ t->tag2tangled[c]++;
+ }
+
+ if ((t->tag1happies[a] == 1) && (t->tag2happies[b] == 2)) {
+ uint32 c = t->tag2mate[b];
+ uint32 mn = MIN(t1h[c]._seqPos, t2h[b]._seqPos);
+ uint32 mx = MAX(t1h[c]._seqPos + s->tag1size, t2h[b]._seqPos + s->tag2size);
+
+ t->tangle[t1h[c]._seqIdx].add(mn, mx-mn);
+ t->tag1tangled[c]++;
+ t->tag2tangled[b]++;
+ }
+
+ // Finally, add the current mate pair to the tangle.
+ if ((t->tag1happies[a] >= 2) || (t->tag2happies[b] >= 2)) {
+ uint32 mn = MIN(t1h[a]._seqPos, t2h[b]._seqPos);
+ uint32 mx = MAX(t1h[a]._seqPos + s->tag1size, t2h[b]._seqPos + s->tag2size);
+
+ t->tangle[t1h[a]._seqIdx].add(mn, mx-mn);
+ t->tag1tangled[a]++;
+ t->tag2tangled[b]++;
+ }
+
+ // Remember the mate; only valid if tag1happies[a] and
+ // tag2happies[b] both == 1.
+ t->tag1mate[a] = b;
+ t->tag2mate[b] = a;
+ }
+ }
+ }
+
+#ifdef DEBUG_MATES
+ if ((debug_numHappies != 0) || (debug_happyCheck != 0)) {
+ FILE *df = fopen("tapper.DEBUG_MATES.err", "w");
+
+ fprintf(df, "numHappies: "uint64FMT"\n", debug_numHappies);
+ fprintf(df, "happyCheck: "uint64FMT"\n", debug_happyCheck);
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++)
+ fprintf(df, "a="uint32FMT" ori=%c pos="uint32FMT","uint32FMT"\n",
+ a, t1h[a]._rev ? 'r' : 'f', t1h[a]._seqIdx, t1h[a]._seqPos);
+
+ for (uint32 b=0; b<s->tag2hitsLen; b++)
+ fprintf(df, "b="uint32FMT" ori=%c pos="uint32FMT","uint32FMT"\n",
+ b, t2h[b]._rev ? 'r' : 'f', t2h[b]._seqIdx, t2h[b]._seqPos);
+
+ uint32 bbaserev = 0;
+ uint32 bbasefor = 0;
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ uint32 b = 0;
+
+ if (t1h[a]._rev == true) {
+ while ((bbaserev < s->tag2hitsLen) && (t1h[a].mateTooFarBefore(t2h[bbaserev], mean, stddev))) {
+ fprintf(df, "rev bbaserev <- "uint32FMT" + 1\n", bbaserev);
+ bbaserev++;
+ }
+ b = bbaserev;
+ } else {
+ while ((bbasefor < s->tag2hitsLen) && (t1h[a].mateTooFarBefore(t2h[bbasefor], mean, stddev))) {
+ fprintf(df, "rev bbasefor <- "uint32FMT" + 1\n", bbasefor);
+ bbasefor++;
+ }
+ b = bbasefor;
+ }
+
+ for (; (b<s->tag2hitsLen) && (t1h[a].mateTooFarAfter(t2h[b], mean, stddev) == false); b++) {
+ fprintf(df, "test a="uint32FMT" b="uint32FMT"\n", a, b);
+ if (t1h[a].happy(t2h[b], mean, stddev)) {
+ fprintf(df, "HAPPY CLEVER a="uint32FMT" b="uint32FMT"\n", a, b);
+ }
+ }
+ }
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ for (uint32 b=0; b<s->tag2hitsLen; b++) {
+ if (t1h[a].happy(t2h[b], mean, stddev)) {
+ fprintf(df, "HAPPY EXHAUSTIVE a="uint32FMT" b="uint32FMT"\n", a, b);
+ }
+ }
+ }
+
+ fclose(df);
+ }
+ assert(debug_numHappies == 0);
+ assert(debug_happyCheck == 0);
+#endif
+
+
+#ifdef VERBOSEWORKER
+ fprintf(stderr, " Paired.\n");
+#endif
+ }
+
+ // Allocate space for the outputs.
+
+#if 0
+ // We can kind of guess how much to grab. Not perfect. Can do a
+ // lot better.
+ //
+ s->resultFragmentLen = s->tag1hitsLen + s->tag2hitsLen;
+ s->resultSingletonLen = s->tag1hitsLen + s->tag2hitsLen;
+ s->resultTangledAlignmentLen = s->tag1hitsLen + s->tag2hitsLen;
+ s->resultMatedLen = MIN(s->tag1hitsLen, s->tag2hitsLen);
+ s->resultTangledLen = MIN(s->tag1hitsLen, s->tag2hitsLen);
+#else
+ // Count exactly how much space is needed. The test for
+ // singleton vs fragment is somewhat expensive, so we skip it.
+ //
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ if (t->tag1tangled[a] != 0) {
+ s->resultTangledAlignmentLen++;
+ } else if (t->tag1happies[a] == 1) {
+ s->resultMatedLen++;
+ } else {
+ s->resultSingletonLen++;
+ s->resultFragmentLen++;
+ }
+ }
+
+ for (uint32 b=0; b<s->tag2hitsLen; b++) {
+ if (t->tag2tangled[b] != 0) {
+ s->resultTangledAlignmentLen++;
+ } else if (t->tag2happies[b] == 1) {
+ s->resultMatedLen++;
+ } else {
+ s->resultSingletonLen++;
+ s->resultFragmentLen++;
+ }
+ }
+
+ s->resultMatedLen /= 2;
+
+ //s->resultFragmentLen += 8;
+ //s->resultSingletonLen += 8;
+ //s->resultTangledAlignmentLen += 8;
+ //s->resultMatedLen += 8;
+ //s->resultTangledLen += 8;
+#endif
+
+ s->resultFragment = new tapperResultFragment [s->resultFragmentLen];
+ s->resultSingleton = new tapperResultFragment [s->resultSingletonLen];
+ s->resultTangledAlignment = new tapperResultFragment [s->resultTangledAlignmentLen];
+ s->resultMated = new tapperResultMated [s->resultMatedLen];
+ s->resultTangled = new tapperResultTangled [s->resultTangledLen];
+
+ s->resultFragmentLen = 0;
+ s->resultSingletonLen = 0;
+ s->resultTangledAlignmentLen = 0;
+ s->resultMatedLen = 0;
+ s->resultTangledLen = 0;
+
+ // For anything with zero happies, emit to the
+ // singleton file.
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ tapperResultFragment *f;
+
+ if (t->tag1tangled[a] != 0) {
+ f = s->resultTangledAlignment + s->resultTangledAlignmentLen++;
+
+ } else if (t->tag1happies[a] == 1) {
+ // Happy; do nothing. We'll do it later.
+ f = 0L;
+
+ } else if (s->tag1hits[a].happyNearEnd(true, mean, stddev, g->GS->getSequenceLength(s->tag1hits[a]._seqIdx))) {
+ f = s->resultSingleton + s->resultSingletonLen++;
+
+ } else {
+ f = s->resultFragment + s->resultFragmentLen++;
+ }
+
+ if (f) {
+ memset(f, 0, sizeof(tapperResultFragment));
+
+ f->_seq = s->tag1hits[a]._seqIdx;
+ f->_pos = s->tag1hits[a]._seqPos;
+
+ f->_qual._tag1valid = 1;
+ f->_qual._tag1basesMismatch = s->tag1hits[a]._basesMismatch;
+ f->_qual._tag1colorMismatch = s->tag1hits[a]._colorMismatch;
+ f->_qual._tag1colorInconsistent = s->tag1hits[a]._colorInconsistent;
+ f->_qual._tag1rev = s->tag1hits[a]._rev;
+
+ f->_qual._diffSize = MAX_COLOR_MISMATCH_MAPPED;
+
+ memcpy(f->_qual._tag1colorDiffs,
+ s->tag1hits[a]._tagColorDiffs,
+ sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+ }
+ }
+
+ for (uint32 b=0; b<s->tag2hitsLen; b++) {
+ tapperResultFragment *f;
+
+ if (t->tag2tangled[b] != 0) {
+ f = s->resultTangledAlignment + s->resultTangledAlignmentLen++;
+
+ } else if (t->tag2happies[b] == 1) {
+ // Happy; do nothing. We'll do it later.
+ f = 0L;
+
+ } else if (s->tag2hits[b].happyNearEnd(false, mean, stddev, g->GS->getSequenceLength(s->tag2hits[b]._seqIdx))) {
+ f = s->resultSingleton + s->resultSingletonLen++;
+
+ } else {
+ f = s->resultFragment + s->resultFragmentLen++;
+ }
+
+ if (f) {
+ memset(f, 0, sizeof(tapperResultFragment));
+
+ f->_seq = s->tag2hits[b]._seqIdx;
+ f->_pos = s->tag2hits[b]._seqPos;
+
+ f->_qual._tag2valid = 1;
+ f->_qual._tag2basesMismatch = s->tag2hits[b]._basesMismatch;
+ f->_qual._tag2colorMismatch = s->tag2hits[b]._colorMismatch;
+ f->_qual._tag2colorInconsistent = s->tag2hits[b]._colorInconsistent;
+ f->_qual._tag2rev = s->tag2hits[b]._rev;
+
+ f->_qual._diffSize = MAX_COLOR_MISMATCH_MAPPED;
+
+ memcpy(f->_qual._tag2colorDiffs,
+ s->tag2hits[b]._tagColorDiffs,
+ sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+ }
+ }
+
+ // For anything with a pair of single happies, emit to the happy
+ // mate file.
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ uint32 b = t->tag1mate[a];
+
+ if ((t->tag1happies[a] == 1) && (t->tag2happies[b] == 1)) {
+ tapperResultMated *m = s->resultMated + s->resultMatedLen++;
+
+ memset(m, 0, sizeof(tapperResultMated));
+
+ assert(t->tag1mate[a] == b);
+ assert(t->tag2mate[b] == a);
+
+ m->_seq = s->tag1hits[a]._seqIdx;
+ m->_pos1 = s->tag1hits[a]._seqPos;
+ m->_pos2 = s->tag2hits[b]._seqPos;
+
+ m->_qual._tag1valid = 1;
+ m->_qual._tag1basesMismatch = s->tag1hits[a]._basesMismatch;
+ m->_qual._tag1colorMismatch = s->tag1hits[a]._colorMismatch;
+ m->_qual._tag1colorInconsistent = s->tag1hits[a]._colorInconsistent;
+ m->_qual._tag1rev = s->tag1hits[a]._rev;
+
+ m->_qual._tag2valid = 1;
+ m->_qual._tag2basesMismatch = s->tag2hits[b]._basesMismatch;
+ m->_qual._tag2colorMismatch = s->tag2hits[b]._colorMismatch;
+ m->_qual._tag2colorInconsistent = s->tag2hits[b]._colorInconsistent;
+ m->_qual._tag2rev = s->tag2hits[b]._rev;
+
+ m->_qual._diffSize = MAX_COLOR_MISMATCH_MAPPED;
+
+ memcpy(m->_qual._tag1colorDiffs,
+ s->tag1hits[a]._tagColorDiffs,
+ sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+ memcpy(m->_qual._tag2colorDiffs,
+ s->tag2hits[b]._tagColorDiffs,
+ sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+ }
+ }
+
+ // Emit and then clear the tangles.
+
+ {
+ uint32 simax = g->GS->getNumberOfSequences();
+
+ for (uint32 si=0; si<simax; si++) {
+
+ if (t->tangle[si].numberOfIntervals() > 0) {
+ t->tangle[si].merge();
+
+ for (uint32 ti=0; ti<t->tangle[si].numberOfIntervals(); ti++) {
+ tapperResultTangled *x = s->resultTangled + s->resultTangledLen++;
+
+ x->_tag1count = 0;
+ x->_tag2count = 0;
+
+ x->_seq = si;
+
+ x->_bgn = t->tangle[si].lo(ti);
+ x->_end = t->tangle[si].hi(ti);
+
+ for (uint32 a=0; a<s->tag1hitsLen; a++) {
+ if ((t->tag1tangled[a] > 0) &&
+ (x->_seq == s->tag1hits[a]._seqIdx) &&
+ (x->_bgn <= s->tag1hits[a]._seqPos) && (s->tag1hits[a]._seqPos <= x->_end))
+ x->_tag1count++;
+ }
+ for (uint32 b=0; b<s->tag2hitsLen; b++) {
+ if ((t->tag2tangled[b] > 0) &&
+ (x->_seq == s->tag2hits[b]._seqIdx) &&
+ (x->_bgn <= s->tag2hits[b]._seqPos) && (s->tag2hits[b]._seqPos <= x->_end))
+ x->_tag2count++;
+ }
+ }
+
+ // This is persistent; clear it for the next mate pair.
+ t->tangle[si].clear();
+ }
+ }
+ }
+ }
+}
+
+
+
+// Returns true if the option at argv[arg] is followed by at least n more
+// words on the command line.  Otherwise reports the problem on stderr,
+// increments err, and returns false so the caller does not read past the
+// end of argv.
+//
+static
+bool
+tapperOptionHasValues(int argc, char **argv, int arg, int n, int &err) {
+  if (arg + n < argc)
+    return(true);
+
+  fprintf(stderr, "%s: option '%s' needs %d value%s\n",
+          argv[0], argv[arg], n, (n == 1) ? "" : "s");
+  err++;
+
+  return(false);
+}
+
+
+// Entry point.  Parses the command line into a tapperGlobalData, initializes
+// it, then runs the reader/worker/writer pipeline through a sweatShop with
+// one tapperThreadData per worker thread.
+//
+// Options are matched on a prefix only (the first 2 characters, or the first
+// 5 for the -max* options), exactly as before.  An option that is missing
+// its value(s) is now reported and leads to the usage message instead of
+// handing a pointer from beyond the argument list to the number parsers.
+//
+int
+main(int argc, char **argv) {
+  tapperGlobalData *g = new tapperGlobalData();
+
+  fprintf(stderr, "sizeof(tapperResultIndex) -- "sizetFMT"\n", sizeof(tapperResultIndex));
+  fprintf(stderr, "sizeof(tapperResultQV) -- "sizetFMT"\n", sizeof(tapperResultQV));
+  fprintf(stderr, "sizeof(tapperResultFragment) -- "sizetFMT"\n", sizeof(tapperResultFragment));
+  fprintf(stderr, "sizeof(tapperResultMated) -- "sizetFMT"\n", sizeof(tapperResultMated));
+  fprintf(stderr, "sizeof(tapperResultTangled) -- "sizetFMT"\n", sizeof(tapperResultTangled));
+  fprintf(stderr, "sizeof(tapperHit) -- "sizetFMT"\n", sizeof(tapperHit));
+  fprintf(stderr, "sizeof(tapperTag) -- "sizetFMT"\n", sizeof(tapperTag));
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-genomic", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->genName = argv[++arg];
+
+    } else if (strncmp(argv[arg], "-queries", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->qryName = argv[++arg];
+
+    } else if (strncmp(argv[arg], "-output", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->outName = argv[++arg];
+
+    } else if (strncmp(argv[arg], "-begin", 2) == 0) {
+      // An explicit range overrides any partitioning.
+      if (tapperOptionHasValues(argc, argv, arg, 1, err)) {
+        g->bgnRead = strtouint32(argv[++arg], 0L);
+        g->thisPartition = 0;
+        g->numPartitions = 1;
+      }
+
+    } else if (strncmp(argv[arg], "-end", 2) == 0) {
+      // An explicit range overrides any partitioning.
+      if (tapperOptionHasValues(argc, argv, arg, 1, err)) {
+        g->endRead = strtouint32(argv[++arg], 0L);
+        g->thisPartition = 0;
+        g->numPartitions = 1;
+      }
+
+    } else if (strncmp(argv[arg], "-partition", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 2, err)) {
+        g->thisPartition = strtouint32(argv[++arg], 0L);
+        g->numPartitions = strtouint32(argv[++arg], 0L);
+      }
+
+    } else if (strncmp(argv[arg], "-repeatthreshold", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->repeatThreshold = strtouint32(argv[++arg], 0L);
+
+    } else if (strncmp(argv[arg], "-maxcolorerror", 5) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->maxColorError = strtouint32(argv[++arg], 0L);
+
+    } else if (strncmp(argv[arg], "-maxbaseerror", 5) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->maxBaseError = strtouint32(argv[++arg], 0L);
+
+    } else if (strncmp(argv[arg], "-maxmemory", 5) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->maxMemory = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-threads", 2) == 0) {
+      if (tapperOptionHasValues(argc, argv, arg, 1, err))
+        g->numThreads = atoi(argv[++arg]);
+
+    } else if (strncmp(argv[arg], "-verbose", 2) == 0) {
+      g->beVerbose = true;
+
+    } else {
+      fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
+      err++;
+    }
+    arg++;
+  }
+
+  // The three file names are mandatory; anything else has a default set by
+  // the tapperGlobalData constructor.
+  if ((err > 0) || (g->genName == 0L) || (g->qryName == 0L) || (g->outName == 0L)) {
+    fprintf(stderr, "usage: %s [opts]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " MANDATORY\n");
+    fprintf(stderr, " -genomic genomic.fasta\n");
+    fprintf(stderr, " -queries tags.tapperTags\n");
+    fprintf(stderr, " -output tapperResultFile directory path\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " OPTIONAL\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -begin b Start aligning at read b (or mate pair b)\n");
+    fprintf(stderr, " -end e Stop aligning at read e (or mate pair e)\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -partition n m Run partition n out of m total partitions.\n");
+    fprintf(stderr, " This sets -b and -e so that the reads/mate pairs\n");
+    fprintf(stderr, " are in m partitions. Partitions start at 0 and\n");
+    fprintf(stderr, " end at m-1.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -repeatthreshold x Do not report fragment alignments for tags\n");
+    fprintf(stderr, " with more than x alignments. Singletons, mated\n");
+    fprintf(stderr, " tags and are still reported and computed using\n");
+    fprintf(stderr, " all alignments. The default is "uint32FMT".\n", g->repeatThreshold);
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -maxcolorerror n\n");
+    fprintf(stderr, " -maxbaseerror n\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, " -maxmemory m (MB)\n");
+    fprintf(stderr, " -threads n\n");
+    fprintf(stderr, " -verbose\n");
+
+    exit(1);
+  }
+
+  g->initialize();
+
+  sweatShop *ss = new sweatShop(tapperReader, tapperWorker, tapperWriter);
+
+  ss->setLoaderQueueSize(16384);
+  ss->setLoaderBatchSize(512);
+  ss->setWorkerBatchSize(1024);
+  ss->setWriterQueueSize(65536);
+
+  ss->setNumberOfWorkers(g->numThreads);
+
+  // Each worker gets its own scratch data.
+  for (uint32 w=0; w<g->numThreads; w++)
+    ss->setThreadData(w, new tapperThreadData(g));
+
+  ss->run(g, g->beVerbose);
+
+  delete g;
+  delete ss;
+
+  fprintf(stderr, "\nSuccess! Bye.\n");
+  return(0);
+}
diff --git a/tapper/tapperAlignment.H b/tapper/tapperAlignment.H
new file mode 100644
index 0000000..3aa194c
--- /dev/null
+++ b/tapper/tapperAlignment.H
@@ -0,0 +1,46 @@
+
+
+
+
+// One alignment record.  It is a plain data carrier with no methods; the
+// two comparator classes that follow order these records either by
+// (_seq, _pos) or by (_tagid, _confidence).
+//
+// The six bit-fields at the end add up to 32 bits (15+4+4+4+1+4).
+//
+class tapperAlignment {
+public:
+
+ // Except as noted, all this is the same stuff as from tapperResult.
+
+ // NOTE(review): presumably the encoded tag id, the index of the reference
+ // sequence, and the position within that sequence -- confirm against
+ // tapperResult.
+ uint64 _tagid;
+ uint32 _seq;
+ uint32 _pos;
+
+ uint8 _colorDiffs[MAX_COLOR_MISMATCH_MAPPED]; // OLD, list of errors in the align
+ uint8 _colorCorrections[MAX_COLOR_MISMATCH_MAPPED]; // NEW, list of errors in the align, after corrections
+
+ uint32 _confidence:15; // NEW, confidence score on the corrected read
+
+ uint32 _basesMismatch:4; // Number of mismatches in ACGT alignment
+ uint32 _colorMismatch:4; // Number of consistent color mismatches
+ uint32 _colorInconsistent:4; // Number of inconsistent color mismatches
+ uint32 _rev:1; // Is reverse complement
+
+ uint32 _diffSize:4; // Value of MAX_COLOR_MISMATCH_MAPPED.
+};
+
+
+// Strict weak ordering on alignments: by sequence first, then by position
+// within the sequence.
+class
+tapperAlignmentPositionCompare {
+public:
+  bool operator()(const tapperAlignment &a, const tapperAlignment &b) const {
+    // Different sequences decide the order on their own.
+    if (a._seq != b._seq)
+      return(a._seq < b._seq);
+
+    // Same sequence; the lower position comes first.
+    return(a._pos < b._pos);
+  };
+};
+
+
+// Strict weak ordering on alignments: by tag id first, then by increasing
+// confidence.
+class
+tapperAlignmentTagIDAndScoreCompare {
+public:
+  bool operator()(const tapperAlignment &a, const tapperAlignment &b) const {
+    // Different tags decide the order on their own.
+    if (a._tagid != b._tagid)
+      return(a._tagid < b._tagid);
+
+    // Same tag; the lower confidence comes first.
+    return(a._confidence < b._confidence);
+  };
+};
diff --git a/tapper/tapperComputation.H b/tapper/tapperComputation.H
new file mode 100644
index 0000000..6f86f41
--- /dev/null
+++ b/tapper/tapperComputation.H
@@ -0,0 +1,221 @@
+
+#include <algorithm>
+
+// All the state for aligning one fragment tag or one mate pair: the decoded
+// tags (forward and reversed), their mers, the growing lists of hits, and
+// the result arrays filled in by the worker.  Everything is public because
+// the reader, worker and writer all poke at it directly.
+//
+class tapperComputation {
+public:
+
+  // Builds the computation for tag a and/or tag b.  Either pointer may be
+  // null; the corresponding tag then has size 0, id 0, empty sequence and
+  // quality strings, and untouched (cleared) mers.
+  //
+  tapperComputation(tapperTag *a, tapperTag *b) {
+    tag1f.clear();
+    tag1r.clear();
+
+    tag2f.clear();
+    tag2r.clear();
+
+    // Give an absent tag well-defined values.  Previously the id and the
+    // sequence/quality buffers of a missing tag were left uninitialized.
+    // NOTE(review): 0 is used as the id of a missing tag -- confirm that
+    // downstream code does not treat 0 as a real tag id.
+    //
+    tag1size = 0;
+    tag2size = 0;
+
+    tag1id = 0;
+    tag2id = 0;
+
+    tag1fseq[0] = tag1rseq[0] = 0;
+    tag2fseq[0] = tag2rseq[0] = 0;
+
+    tag1fqlt[0] = tag1rqlt[0] = 0;
+    tag2fqlt[0] = tag2rqlt[0] = 0;
+
+    // Process the tags.
+    //
+    // It's not a trivial operation (probably not even possible) to
+    // reverse-complement a SOLiD read. To reverse complement a
+    // read, we would need to construct a new reference base, but to
+    // construct that base, we need to decode the read from color to
+    // acgt. Any errors in the read prevent precise decoding, and we
+    // end up building the new reverse-complemented read with all the
+    // errors at the start. By adding the anchor base to the end,
+    // we're fixing all the bases in error.
+    //
+    // So, we need to handle reverse reads specially. Reverse the
+    // read (color-space is self-complementing), and RC the anchor
+    // base. Any downstream processes need to know that a read has
+    // the anchor at the start OR at the end.
+    //
+    // When building the mers (the for loops in loadTag), yes, i=2. The
+    // first letter in the tag is the last in the adapter, and it's not in
+    // the tag. We need to skip it. The second letter (the first
+    // color) is biased by the adapter, and it will be an error 75%
+    // of the time. Skip it too.
+
+    if (a)
+      loadTag(a, tag1id, tag1size, tag1fseq, tag1rseq, tag1fqlt, tag1rqlt, tag1f, tag1r);
+
+    if (b)
+      loadTag(b, tag2id, tag2size, tag2fseq, tag2rseq, tag2fqlt, tag2rqlt, tag2f, tag2r);
+
+    //fprintf(stderr, "T1: %s/%s T2:%s/%s\n", tag1fseq, tag1rseq, tag2fseq, tag2rseq);
+
+    // Hit lists start small and are doubled by addHit() as needed.
+    tag1hitsLen = 0;
+    tag1hitsMax = 16;
+    tag1hits = new tapperHit [tag1hitsMax];
+
+    tag2hitsLen = 0;
+    tag2hitsMax = 16;
+    tag2hits = new tapperHit [tag2hitsMax];
+
+    mean = 0;
+    stddev = 0;
+
+    // The histogram is allocated lazily by the first addHit().
+    alignQualHistogram = 0L;
+    alignQualHistogramLen = 0;
+
+    // Result arrays are allocated by the worker once it knows the sizes.
+    resultFragment = 0L;
+    resultFragmentLen = 0;
+
+    resultSingleton = 0L;
+    resultSingletonLen = 0;
+
+    resultTangledAlignment = 0L;
+    resultTangledAlignmentLen = 0;
+
+    resultMated = 0L;
+    resultMatedLen = 0;
+
+    resultTangled = 0L;
+    resultTangledLen = 0;
+  };
+
+  // Frees the hit lists, the histogram and all result arrays.
+  ~tapperComputation() {
+    delete [] tag1hits;
+    delete [] tag2hits;
+
+    delete [] alignQualHistogram;
+
+    delete [] resultFragment;
+    delete [] resultSingleton;
+    delete [] resultTangledAlignment;
+    delete [] resultMated;
+    delete [] resultTangled;
+  };
+
+  // Records hit h for tag 1 (tag1 == true) or tag 2, and counts it in the
+  // alignment quality histogram.  The histogram is created on first use,
+  // sized by g->TA->AQIlength() for the configured maximum errors.
+  //
+  void addHit(tapperGlobalData *g, tapperHit& h, bool tag1) {
+
+    uint32 ii = g->TA->AQIindex(g->maxBaseError, g->maxColorError,
+                                h.numberOfBaseMismatches(),
+                                h.numberOfColorMismatches(),
+                                h.numberOfColorInconsistencies());
+
+    if (alignQualHistogram == 0L) {
+      alignQualHistogramLen = g->TA->AQIlength(g->maxBaseError, g->maxColorError);
+      alignQualHistogram = new uint32 [alignQualHistogramLen];
+
+      memset(alignQualHistogram, 0, sizeof(uint32) * alignQualHistogramLen);
+    }
+
+    alignQualHistogram[ii]++;
+
+    // Double the list when full, copying the existing hits over.
+    if (tag1) {
+      if (tag1hitsLen >= tag1hitsMax) {
+        tag1hitsMax *= 2;
+        tapperHit *nits = new tapperHit [tag1hitsMax];
+        memcpy(nits, tag1hits, sizeof(tapperHit) * tag1hitsLen);
+        delete [] tag1hits;
+        tag1hits = nits;
+      }
+      tag1hits[tag1hitsLen++] = h;
+    } else {
+      if (tag2hitsLen >= tag2hitsMax) {
+        tag2hitsMax *= 2;
+        tapperHit *nits = new tapperHit [tag2hitsMax];
+        memcpy(nits, tag2hits, sizeof(tapperHit) * tag2hitsLen);
+        delete [] tag2hits;
+        tag2hits = nits;
+      }
+      tag2hits[tag2hitsLen++] = h;
+    }
+  };
+
+  // Sorts both hit lists by (sequence, position).
+  void sortHitsByPosition(void) {
+    tapperHitPositionCompare pc;
+    std::sort(tag1hits, tag1hits+tag1hitsLen, pc);
+    std::sort(tag2hits, tag2hits+tag2hitsLen, pc);
+  };
+
+private:
+
+  // Decodes tag into fseq/fqlt, builds the reversed copy (with the anchor
+  // base complemented) in rseq/rqlt, and loads the forward and reverse
+  // mers.  On return, tagid holds the value returned by decode() and size
+  // holds the decoded string length minus one.
+  //
+  void loadTag(tapperTag *tag,
+               uint64 &tagid, uint32 &size,
+               char *fseq, char *rseq,
+               uint64 *fqlt, uint64 *rqlt,
+               kMer &merf, kMer &merr) {
+    uint16 id[4];
+
+    tagid = tag->decode(id, fseq, fqlt);
+    size = strlen(fseq);
+
+    merf.setMerSize(size-1); merf.setMerSpan(size-1);
+    merr.setMerSize(size-1); merr.setMerSpan(size-1);
+
+    // Reverse the read and its quality values.
+    for (uint32 i=0, j=size-1; i<size; i++, j--) {
+      rseq[i] = fseq[j];
+      rqlt[i] = fqlt[j];
+    }
+
+    // The anchor base is now last; complement it.
+    rseq[size-1] = complementSymbol[rseq[size-1]];
+    rseq[size] = 0;
+    rqlt[size] = 0;
+
+    // Forward mer skips the anchor and the first color.
+    for (uint32 i=2; i<size; i++)
+      merf += letterToBits[fseq[i]];
+
+    // Reverse mer skips the same two letters, which are now at the end.
+    for (uint32 i=0; i<size-2; i++)
+      merr += letterToBits[rseq[i]];
+
+    size--;
+  };
+
+public:
+  kMer tag1f, tag1r;
+  kMer tag2f, tag2r;
+
+  uint32 tag1size;
+  uint32 tag2size;
+
+  uint64 tag1id;
+  uint64 tag2id;
+
+  char tag1fseq[TAG_LEN_MAX], tag1rseq[TAG_LEN_MAX];
+  char tag2fseq[TAG_LEN_MAX], tag2rseq[TAG_LEN_MAX];
+
+  uint64 tag1fqlt[TAG_LEN_MAX], tag1rqlt[TAG_LEN_MAX];
+  uint64 tag2fqlt[TAG_LEN_MAX], tag2rqlt[TAG_LEN_MAX];
+
+  uint32 tag1hitsLen;
+  uint32 tag1hitsMax;
+  tapperHit *tag1hits;
+
+  uint32 tag2hitsLen;
+  uint32 tag2hitsMax;
+  tapperHit *tag2hits;
+
+  uint32 mean;
+  uint32 stddev;
+
+  uint32 *alignQualHistogram;
+  uint32 alignQualHistogramLen;
+
+  tapperResultFragment *resultFragment;
+  uint32 resultFragmentLen;
+
+  tapperResultFragment *resultSingleton;
+  uint32 resultSingletonLen;
+
+  tapperResultFragment *resultTangledAlignment;
+  uint32 resultTangledAlignmentLen;
+
+  tapperResultMated *resultMated;
+  uint32 resultMatedLen;
+
+  tapperResultTangled *resultTangled;
+  uint32 resultTangledLen;
+};
+
diff --git a/tapper/tapperGlobalData.H b/tapper/tapperGlobalData.H
new file mode 100644
index 0000000..12faab6
--- /dev/null
+++ b/tapper/tapperGlobalData.H
@@ -0,0 +1,217 @@
+#include "positionDB.H"
+#include "seqCache.H"
+
+#if defined (__SVR4) && defined (__sun)
+// Solaris defines SS and GS in sys/regset.h
+#undef GS
+#undef SS
+#endif
+
+// Run-wide configuration and shared objects.  main() fills in the option
+// fields from the command line, then calls initialize() to open the files
+// and build or load the position database.
+//
+class tapperGlobalData {
+public:
+ tapperGlobalData();
+ ~tapperGlobalData();
+
+ // Opens inputs and outputs and builds the search structures; exits the
+ // process on invalid configuration.
+ void initialize(void);
+
+private:
+ // Helpers used by initialize() to produce the color-space copy of the
+ // genome.
+ void convertACGTtoColor(char *color, char *acgt, uint32 len);
+ void rewriteFileAsColorACGT(char *acgtname, char *colorname);
+
+public:
+ // File names from -genomic, -queries and -output.  These point at the
+ // strings main() stores here and are not freed by the destructor.
+ char *genName;
+ char *qryName;
+ char *outName;
+
+ // Range of reads (or mate pairs) to process; from -begin / -end, or
+ // computed from the partition in initialize().
+ uint32 bgnRead;
+ uint32 endRead;
+
+ // From -partition n m.
+ uint32 thisPartition;
+ uint32 numPartitions;
+
+ // From -repeatthreshold.
+ uint32 repeatThreshold;
+
+ // From -maxmemory, -threads and -verbose.
+ uint32 maxMemory;
+ uint32 numThreads;
+ bool beVerbose;
+
+ // Set by initialize() to the tag file's tag size minus one.
+ uint32 tagSize;
+
+ // From -maxcolorerror and -maxbaseerror.
+ uint32 maxColorError;
+ uint32 maxBaseError;
+
+ // Input tag file (opened for reading) and output result file (opened
+ // for writing); both created by initialize().
+ tapperTagFile *TF;
+ tapperResultFile *TA;
+
+ // Stream over the color-space genome, the mer stream built on it, and
+ // the position database.  MS is deleted and reset to null by
+ // initialize() once the database is ready.
+ seqStream *SS;
+ merStream *MS;
+ positionDB *PS;
+
+ // The genome from genName, with all sequences loaded by initialize().
+ seqCache *GS;
+};
+
+
+
+
+
+// Sets every option to its default and every owned pointer to null, so the
+// destructor is safe even if initialize() is never called.
+//
+tapperGlobalData::tapperGlobalData() {
+  genName = 0L;
+  qryName = 0L;
+  outName = 0L;
+
+  // Default range is "everything".
+  bgnRead = uint32ZERO;
+  endRead = ~uint32ZERO;
+
+  thisPartition = 0;
+  numPartitions = 1;
+
+  repeatThreshold = 500;
+
+  maxMemory = 0;
+  numThreads = 2;
+  beVerbose = false;
+
+  // Was left uninitialized; the real value is computed by initialize().
+  tagSize = 0;
+
+  maxColorError = 3;
+  maxBaseError = 5;
+
+  TF = 0L;
+  TA = 0L;
+
+  SS = 0L;
+  MS = 0L;
+  PS = 0L;
+
+  GS = 0L;
+}
+
+// Releases the objects created by initialize().  The constructor sets all
+// of these pointers to null and initialize() resets MS to null after
+// deleting it, so every delete here is either valid or a no-op.  The file
+// name strings are not owned and are not freed.
+//
+// NOTE(review): the deletion order (database, then mer stream, then
+// sequence stream) is kept exactly as written in case the stream objects
+// depend on each other -- confirm before reordering.
+//
+tapperGlobalData::~tapperGlobalData() {
+ delete TF;
+ delete TA;
+ delete PS;
+ delete MS;
+ delete SS;
+ delete GS;
+}
+
+
+
+// Prepares everything the workers need:
+//   - writes (or reuses) the color-space copy of the genome,
+//   - opens the tag file and restricts it to the requested range or partition,
+//   - builds, or loads from a saved file, the position database,
+//   - loads the genome sequences and opens the result file.
+//
+// Exits the process if the partition is invalid, the tag size exceeds 32,
+// or a derived file name does not fit in FILENAME_MAX characters.
+//
+void
+tapperGlobalData::initialize(void) {
+  char colName[FILENAME_MAX];
+  int  colLen = 0;
+
+  // Bounded formatting; a user-supplied path must not overrun colName.
+  colLen = snprintf(colName, sizeof(colName), "%s.colorspace", genName);
+  if ((colLen < 0) || ((size_t)colLen >= sizeof(colName))) {
+    fprintf(stderr, "ERROR: file name '%s.colorspace' is too long.\n", genName);
+    exit(1);
+  }
+
+  rewriteFileAsColorACGT(genName, colName);
+
+  TF = new tapperTagFile(qryName, 'r');
+
+  if (numPartitions > 1) {
+    if (thisPartition >= numPartitions) {
+      fprintf(stderr, "ERROR: invalid partition n="uint32FMT" m="uint32FMT".\n", thisPartition, numPartitions);
+      exit(1);
+    }
+
+    // File has either fragment tags OR mate pairs, never both.
+
+    uint32 numTags = (TF->numberOfFragmentTags() + TF->numberOfMatePairs()) / numPartitions + 1;
+
+    bgnRead = numTags * thisPartition;
+    endRead = numTags * thisPartition + numTags;
+
+    fprintf(stderr, "Set partition for "uint64FMT" frags or "uint64FMT" mates: -begin "uint32FMT" -end "uint32FMT"\n",
+            TF->numberOfFragmentTags(), TF->numberOfMatePairs(), bgnRead, endRead);
+  }
+
+  // Set ranges that we want to compute.
+  TF->setBegin(bgnRead);
+  TF->setEnd(endRead);
+
+  // See the comments in the loader about the -1.
+  tagSize = TF->metaData()->tagSize() - 1;
+
+  if (tagSize > 32) {
+    fprintf(stderr, "tag size too big for this implementation.\n");
+    exit(1);
+  }
+
+  fprintf(stderr, "Building seqStream\n");
+  SS = new seqStream(colName);
+
+  fprintf(stderr, "Building merStream\n");
+  MS = new merStream(new kMerBuilder(tagSize),
+                     SS,
+                     true, false);
+
+  // colName is reused for the saved position database; the name encodes
+  // the mer size and color error limit it was built with.
+  colLen = snprintf(colName, sizeof(colName), "%s.ms"uint32FMT".ce"uint32FMT".posDB", genName, tagSize, maxColorError);
+  if ((colLen < 0) || ((size_t)colLen >= sizeof(colName))) {
+    fprintf(stderr, "ERROR: positionDB file name for '%s' is too long.\n", genName);
+    exit(1);
+  }
+
+  if (fileExists(colName)) {
+    fprintf(stderr, "Loading positionDB\n");
+    PS = new positionDB(colName, tagSize, 0, maxColorError);
+  } else {
+    fprintf(stderr, "Building positionDB\n");
+    PS = new positionDB(MS, tagSize, 0, 0L, 0L, 0L, 0, 0, maxColorError, maxMemory, beVerbose);
+
+    PS->saveState(colName);
+  }
+
+  // The mer stream is only needed to build the database.
+  delete MS;
+  MS = 0L;
+
+  GS = new seqCache(genName, 0, false);
+  GS->loadAllSequences();
+
+  TA = new tapperResultFile(outName, 'w');
+
+  // We get races unless we prebuild the AQI stuff. I don't want to
+  // make this a requirement of the constructor, since only
+  // multithreaded codes have this problem, and it is perfectly valid
+  // for a file to have alignments with different max error
+  // rates....while tapper will only write with these two maximums.
+  //
+  TA->AQIlength(maxBaseError, maxColorError);
+}
+
+
+
+// Translates len letters of acgt into color space, writing them to color.
+// Each color is looked up in baseToColor from the previous letter and the
+// current one.  The two buffers may be the same (the caller converts in
+// place), so each input letter is read before its slot is overwritten.
+void
+tapperGlobalData::convertACGTtoColor(char *color, char *acgt, uint32 len) {
+  char prev = 'n'; // We always start the color encoding assuming the -1 letter is a gap
+
+  for (uint32 pos=0; pos<len; pos++) {
+    char cur = acgt[pos];
+
+    color[pos] = baseToColor[prev][cur];
+    prev = cur;
+  }
+}
+
+
+
+// Writes a color-space copy of the sequence file acgtname to colorname, one
+// header line and one sequence line per sequence.  If colorname already
+// exists it is left alone.  Exits the process if the output cannot be
+// created or completely written.
+//
+void
+tapperGlobalData::rewriteFileAsColorACGT(char *acgtname, char *colorname) {
+  seqCache *F = new seqCache(acgtname);
+
+  if (fileExists(colorname)) {
+    fprintf(stderr, "ColorFastA '%s' exists. NOT recreating.\n", colorname);
+    delete F; // Was leaked on this path.
+    return;
+  }
+
+  fprintf(stderr, "Rewriting '%s' as ColorFastA '%s'.\n", acgtname, colorname);
+
+  // Test the returned stream, not errno: a successful fopen() is allowed
+  // to leave errno non-zero.
+  errno = 0;
+  FILE *CF = fopen(colorname, "w");
+  if (CF == 0L)
+    fprintf(stderr, "Failed to create '%s': %s\n", colorname, strerror(errno)), exit(1);
+
+  // Each sequence is converted in place and then written out.
+  seqInCore *f = F->getSequenceInCore();
+  while (f) {
+    convertACGTtoColor(f->sequence(), f->sequence(), f->sequenceLength());
+    fprintf(CF, "%s\n%s\n", f->header(), f->sequence());
+    delete f;
+    f = F->getSequenceInCore();
+  }
+
+  // A failed flush would leave a truncated file that later runs silently
+  // reuse, so treat it as fatal.
+  errno = 0;
+  if (fclose(CF) != 0)
+    fprintf(stderr, "Failed to write '%s': %s\n", colorname, strerror(errno)), exit(1);
+
+  delete F;
+}
diff --git a/tapper/tapperHit.H b/tapper/tapperHit.H
new file mode 100644
index 0000000..282b7f9
--- /dev/null
+++ b/tapper/tapperHit.H
@@ -0,0 +1,240 @@
+#include "alphabet.h"
+
+class tapperGlobalData;
+
+
+// An internal hit. Tapper uses these for computing scores and what
+// not. It outputs tapperResults, above.
+//
+// The mate tests below (mateTooFarBefore, mateTooFarAfter, happy) all use
+// the same window for the distance between two mated tags: more than
+// mean - 3*stddev and less than mean + 3*stddev.  The window is computed
+// in 64 bits with the lower edge clamped at zero, so a library with
+// 3*stddev > mean no longer wraps around to a huge lower bound that
+// rejected every pair.
+//
+class tapperHit {
+public:
+  uint32 numberOfBaseMismatches(void) { return(_basesMismatch); };
+  uint32 numberOfColorMismatches(void) { return(_colorMismatch); };
+  uint32 numberOfColorInconsistencies(void) { return(_colorInconsistent); };
+
+
+  // Formats the hit into OS as
+  //   tagid <tab> seqIdx:seqPos:orientation <tab> base,color,inconsistent
+  // and returns OS.  The caller supplies a buffer large enough for the text.
+  //
+  // The mismatch counts are narrow bit-fields; they are cast to uint64 so
+  // the arguments actually have the type the format asks for.
+  //
+  char *printHit(char *OS, uint64 tagid) {
+    sprintf(OS, "0x"uint64FMT"\t"uint32FMT":"uint32FMT":%c\t"uint64FMT","uint64FMT","uint64FMT,
+            tagid,
+            _seqIdx, _seqPos, _rev ? '-' : '+',
+            (uint64)_basesMismatch, (uint64)_colorMismatch, (uint64)_colorInconsistent);
+    return(OS);
+  }
+
+
+  // Implemented outside this header.
+  // NOTE(review): presumably aligns tag (len letters) against the reference
+  // at sequence so, position po, and fills in this hit -- confirm against
+  // the definition.
+  //
+  bool alignToReference(tapperGlobalData *g,
+                        uint32 so,
+                        uint32 po,
+                        char *tag, uint32 len);
+
+
+  // Returns true if the tag is near the correct end of the sequence,
+  // so that it could potentially be happily mated to a tag mapping
+  // in a different sequence (or in a gap).  A sequence shorter than
+  // mean + 3*stddev always qualifies.
+  //
+  bool happyNearEnd(bool isFTag, uint32 mean, uint32 stddev, uint32 seqlen) {
+    bool isHappy = false;
+
+    if (seqlen < mean + 3 * stddev)
+      return(true);
+
+    if (isFTag) {
+      if (_rev) {
+        // Near end of sequence
+        isHappy = (seqlen - mean - 3 * stddev < _seqPos);
+      } else {
+        // Near bgn of sequence
+        isHappy = (_seqPos < mean + 3 * stddev);
+      }
+    } else {
+      if (_rev) {
+        // Near bgn of sequence
+        isHappy = (_seqPos < mean + 3 * stddev);
+      } else {
+        // Near end of sequence
+        isHappy = (seqlen - mean - 3 * stddev < _seqPos);
+      }
+    }
+
+    return(isHappy);
+  };
+
+
+  // Returns true if that read is before where this read says it
+  // should be. Returns TRUE for reads of the incorrect orientation.
+  //
+  // ASSUMES it is called on this read being the forward/F3/a read.
+  //
+  bool mateTooFarBefore(tapperHit& that, uint32 mean, uint32 stddev) {
+
+    // that read is on the sequence after us.
+    if (_seqIdx < that._seqIdx) {
+      //fprintf(stderr, "isBefore()- seq after false.\n");
+      return(false);
+    }
+
+    // that read is on the sequence before us.
+    if (that._seqIdx < _seqIdx) {
+      //fprintf(stderr, "isBefore()- seq before true.\n");
+      return(true);
+    }
+
+    // Misoriented, true
+    if (_rev != that._rev) {
+      //fprintf(stderr, "isBefore()- misoriented true.\n");
+      return(true);
+    }
+
+    // Distance window, 64-bit, lower edge clamped at zero.
+    uint64 slop    = (uint64)3 * stddev;
+    uint64 distMin = (mean > slop) ? (mean - slop) : 0;
+    uint64 distMax = mean + slop;
+
+    // FORWARD
+    //
+    // ( -that-> ) -this->
+    // -----------------------------------
+    // TTTffffffffffffffffffffffffffffffff
+    //
+    if ((_rev == false) && (that._seqPos + distMax < _seqPos)) {
+      //fprintf(stderr, "isBefore()- forward true "uint32FMT" + "uint32FMT" + 3 * "uint32FMT" < "uint32FMT"\n",
+      // that._seqPos, mean, stddev, _seqPos);
+      return(true);
+    }
+
+    // REVERSE
+    //
+    // <-this- ( <-that- )
+    // -----------------------------------
+    // TTTTTTTTTTTTTTTTTTTffffffffffffffff
+    //
+    if ((_rev == true) && (that._seqPos < _seqPos + distMin)) {
+      //fprintf(stderr, "isBefore()- forward true "uint32FMT" < "uint32FMT" + "uint32FMT" - 3 * "uint32FMT"\n",
+      // that._seqPos, _seqPos, mean, stddev);
+      return(true);
+    }
+
+    //fprintf(stderr, "isBefore()- false.\n");
+    return(false);
+  }
+
+
+  // Returns true if that read is after where this read says it
+  // should be. Returns FALSE for reads of the incorrect orientation.
+  //
+  // ASSUMES it is called on this read being the forward/F3/a read.
+  //
+  bool mateTooFarAfter(tapperHit& that, uint32 mean, uint32 stddev) {
+
+    // that read is on the sequence after us, true.
+    if (_seqIdx < that._seqIdx)
+      return(true);
+
+    // that read is on the sequence before us, false.
+    if (that._seqIdx < _seqIdx)
+      return(false);
+
+    // Misoriented, false (the scan must keep going past these).
+    if (_rev != that._rev)
+      return(false);
+
+    // Distance window, 64-bit, lower edge clamped at zero.
+    uint64 slop    = (uint64)3 * stddev;
+    uint64 distMin = (mean > slop) ? (mean - slop) : 0;
+    uint64 distMax = mean + slop;
+
+    // FORWARD
+    //
+    // ( -that-> ) -this->
+    // -----------------------------------
+    // ffffffffffffffffTTTTTTTTTTTTTTTTTTT
+    //
+    if ((_rev == false) && (that._seqPos + distMin < _seqPos))
+      return(false);
+
+    // REVERSE
+    //
+    // <-this- ( <-that- )
+    // -----------------------------------
+    // ffffffffffffffffffffffffffffffffTTT
+    //
+    if ((_rev == true) && (that._seqPos < _seqPos + distMax))
+      return(false);
+
+    return(true);
+  }
+
+
+
+  // Returns true if b is a properly placed mate of this read: same
+  // sequence, same orientation flag, distance inside the window, and b on
+  // the expected side (before this read when forward, after it when
+  // reverse).
+  //
+  // ASSUMES it is called on this read being the forward/F3/a read.
+  //
+  bool happy(tapperHit& b, uint32 mean, uint32 stddev) {
+    uint64 dist = ~uint64ZERO;
+    bool isHappy = false;
+    bool isOriented = false;
+
+    if (_seqIdx != b._seqIdx)
+      return(false);
+
+    if (_rev != b._rev)
+      return(false);
+
+    // Distance window, 64-bit, lower edge clamped at zero.
+    uint64 slop    = (uint64)3 * stddev;
+    uint64 distMin = (mean > slop) ? (mean - slop) : 0;
+    uint64 distMax = mean + slop;
+
+    // Check distance apart
+    if (b._seqPos < _seqPos)
+      dist = _seqPos - b._seqPos;
+    else
+      dist = b._seqPos - _seqPos;
+    if ((distMin < dist) && (dist < distMax))
+      isHappy = true;
+
+    // Check orientations
+    if ((_rev == false) && (b._seqPos < _seqPos))
+      isOriented = true;
+    if ((_rev == true) && (_seqPos < b._seqPos))
+      isOriented = true;
+
+    if (!isHappy) {
+      //fprintf(stderr, "GRUMPY DIST "uint32FMT"\n", dist);
+      return(false);
+    }
+
+    if (!isOriented) {
+      //fprintf(stderr, "GRUMPY ORIENT "uint32FMT"\n", dist);
+      return(false);
+    }
+
+    //fprintf(stderr, "HAPPY! "uint32FMT"\n", dist);
+    return(true);
+  };
+
+#if 0
+  bool operator< (tapperHit const &r) const { return(((_basesMismatch < r._basesMismatch)) ||
+                                                     ((_basesMismatch <= r._basesMismatch) && (_colorMismatch < r._colorMismatch)) ||
+                                                     ((_basesMismatch <= r._basesMismatch) && (_colorMismatch <= r._colorMismatch) && (_colorInconsistent < r._colorInconsistent))); };
+#endif
+
+  // Argh, should be private, but tapperWorker copies most of the hit to a result.
+  //private:
+  uint32 _seqIdx;
+  uint32 _seqPos;
+
+  uint64 _tagIdx; // 4e9 tags is only 34x of human
+
+  uint64 _len:6; // Length of tag
+  uint64 _rev:1; // Match is reversecomplement
+
+  uint64 _pad:17; // Nothing
+
+  uint64 _basesMismatch:6; // Number of mismatches in ACGT alignment
+
+  uint64 _colorMismatch:6; // Number of consistent color mismatches
+  uint64 _colorInconsistent:6; // Number of inconsistent color mismatches
+
+  char _tagCOLOR[TAG_LEN_MAX];
+  char _refCOLOR[TAG_LEN_MAX];
+
+  char _tagACGT[TAG_LEN_MAX];
+  char _refACGT[TAG_LEN_MAX];
+
+  uint8 _tagColorDiffs[MAX_COLOR_MISMATCH_MAPPED];
+};
+
+
+// Strict weak ordering on hits: by sequence index, then by position within
+// the sequence.  Used by std::sort in tapperComputation::sortHitsByPosition.
+//
+// The arguments are taken by const reference; a tapperHit carries several
+// TAG_LEN_MAX-sized buffers, and taking it by value copied two of them for
+// every comparison the sort made.
+//
+class
+tapperHitPositionCompare {
+public:
+  bool operator()(const tapperHit &a, const tapperHit &b) const {
+    return((a._seqIdx < b._seqIdx) ||
+           ((a._seqIdx == b._seqIdx) && (a._seqPos < b._seqPos)));
+  };
+};
diff --git a/tapper/tapperResult.H b/tapper/tapperResult.H
new file mode 100644
index 0000000..61b9964
--- /dev/null
+++ b/tapper/tapperResult.H
@@ -0,0 +1,580 @@
+#include "util++.H"
+
+#include <functional>
+
+// Tapper generates four kinds of alignments.
+//
+// 1) An unmated fragment alignment
+// 2) A satisfied mate pair alignment
+// 3) An unsatisfied mate pair alignment
+// 4) A tangle of mated fragments
+//
+// There are SEVEN output files: an index, an alignment quality histogram,
+// and five data files (fragment, singleton, tangled alignment, mated, tangled).
+
+
+#define MAX_FRAGMENT_ALIGNMENTS 65536 // 2^16; tapperResultIndex::_numFrag is 16 bits
+#define MAX_FRAGMENT_ALIGNMENTS_DISCARDED 1048576 // 2^20; _numFragDiscarded is 20 bits
+#define MAX_FRAGMENT_ALIGNMENTS_TANGLED 1048576 // 2^20; _numFragTangled is 20 bits
+#define MAX_SINGLETON_ALIGNMENTS 65536 // 2^16; _numFragSingleton is 16 bits
+#define MAX_MATED_ALIGNMENTS 8192 // 2^13; _numMated is 13 bits
+#define MAX_TANGLED_ALIGNMENTS 8192 // 2^13; _numTangled is 13 bits
+// NOTE(review): an N-bit field holds at most 2^N - 1, so these look like exclusive limits -- confirm at the use sites.
+#define MAX_INSERT_SIZE 262144 // 2^18; _mean is 18 bits
+#define MAX_INSERT_DEVIATION 65536 // 2^16; _stddev is 16 bits
+
+#define MAX_COLOR_MISMATCH_MAPPED 4 // entries in each tapperResultQV colorDiffs array
+
+// Index record: one per mate pair or unmated fragment, giving the number of
+// alignments of each kind stored for it.
+// NOTE(review): the declared field widths total 320 bits, not 256.
+//
+class tapperResultIndex {
+public:
+  void print(FILE *out) {    // Writes this record to 'out' as one tab-separated "R" line.
+    uint16 id1[4];
+    uint16 id2[4];
+
+    decodeTagID(_tag1id, id1);
+    decodeTagID(_tag2id, id2);
+    // Narrow bit-fields are promoted to int in a variadic call, so cast them to match uint64FMT.
+    fprintf(out, "R\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint64FMT"/"uint64FMT"\t"uint64FMT"+-"uint64FMT"\tf:"uint64FMT"\td:"uint64FMT"\ts:"uint64FMT"\tm:"uint64FMT"\tt:"uint64FMT"\n",
+            id1[0], id1[1], id1[2], id1[3],
+            id2[0], id2[1], id2[2], id2[3],
+            (uint64)_maxColrMismatchMapped, (uint64)_maxBaseMismatchMapped,
+            (uint64)_mean, (uint64)_stddev,
+            (uint64)_numFrag, (uint64)_numFragDiscarded, (uint64)_numFragSingleton, (uint64)_numMated, (uint64)_numTangled);
+  };
+
+public:
+  uint64 _tag1id;
+  uint64 _tag2id;
+
+  // Command line, how many color mismatches we looked for, and how
+  // many base mismatches we allowed. These determine the number and
+  // meaning of the alignment quality histogram. This is stored per
+  // result, so multiple runs can be easily combined.
+  //
+  uint64 _maxColrMismatchMapped:4;
+  uint64 _maxBaseMismatchMapped:4;
+
+  uint64 _mean:18; // Expected mean and stddev for this pair.
+  uint64 _stddev:16; // Again, per result so we can combine mappings.
+  uint64 _pad1:22;
+  // NOTE(review): the next five widths total 80 bits, more than one uint64; left as is because sizeof(tapperResultIndex) is passed to recordFile as the record size.
+  uint64 _numFrag:16; // Number of fragment alignments
+  uint64 _numFragDiscarded:20; // Number of fragment alignments found but not reported
+  uint64 _numFragTangled:20; // Number of fragment alignments in tangled mated
+  uint64 _numFragSingleton:16; // Number of fragment alignments potentially linking
+  uint64 _pad2:8;
+
+  uint64 _numMated:13; // Number of mated alignments
+  uint64 _numTangled:13; // Number of tangled alignments
+  uint64 _pad3:22;
+};
+
+
+// Per-tag alignment quality, embedded in the fragment and mated records.
+// 32 bits for quality
+// 64 bits for alignment (= 2 * MAX_COLOR_MISMATCH_MAPPED * 8 bits)
+//
+// The alignments take up a lot of space. We store both the position
+// of the difference, and the color in the read.
+// tapperResultFragment::print() reads each diff byte as: low 6 bits = position, top 2 bits = index into bitsToColor.
+class tapperResultQV {
+public:
+ uint32 _tag1valid:1; // Tag 1 is valid data
+ uint32 _tag1basesMismatch:4; // Number of mismatches in ACGT alignment
+ uint32 _tag1colorMismatch:4; // Number of consistent color mismatches
+ uint32 _tag1colorInconsistent:4; // Number of inconsistent color mismatches
+ uint32 _tag1rev:1; // Is reverse complement
+
+ uint32 _tag2valid:1; // Tag 2 is valid data
+ uint32 _tag2basesMismatch:4; // Number of mismatches in ACGT alignment
+ uint32 _tag2colorMismatch:4; // Number of consistent color mismatches
+ uint32 _tag2colorInconsistent:4; // Number of inconsistent color mismatches
+ uint32 _tag2rev:1; // Is reverse complement
+
+ uint32 _diffSize:4; // Value of MAX_COLOR_MISMATCH_MAPPED.
+ // One byte per recorded color difference, MAX_COLOR_MISMATCH_MAPPED bytes per tag.
+ uint8 _tag1colorDiffs[MAX_COLOR_MISMATCH_MAPPED];
+ uint8 _tag2colorDiffs[MAX_COLOR_MISMATCH_MAPPED];
+};
+
+
+
+// Unmated fragment alignment.
+// _seq and _pos (32 bits each) followed by a tapperResultQV.
+//
+class tapperResultFragment {
+public:
+  // Prints one "F" line to 'out' for each tag that _qual marks valid.  idx
+  // supplies the tag IDs.  Each line holds the decoded tag ID, _seq, _pos,
+  // orientation ('r' or 'f'), base/color/inconsistent mismatch counts and a
+  // map of the color differences.
+  void print(FILE *out, tapperResultIndex *idx) {
+    if (_qual._tag1valid)
+      printTag(out, idx->_tag1id, _qual._tag1rev,
+               _qual._tag1basesMismatch, _qual._tag1colorMismatch, _qual._tag1colorInconsistent,
+               _qual._tag1colorDiffs);
+    if (_qual._tag2valid)
+      printTag(out, idx->_tag2id, _qual._tag2rev,
+               _qual._tag2basesMismatch, _qual._tag2colorMismatch, _qual._tag2colorInconsistent,
+               _qual._tag2colorDiffs);
+  };
+
+private:
+  // Formats one tag.  'diffs' is that tag's colorDiffs array: the first
+  // colorMismatch entries are consistent differences (drawn as '*'), the
+  // next colorInconsistent entries are inconsistent ones (drawn as
+  // bitsToColor[] of the entry's top two bits).  The low six bits of an
+  // entry are its position in the map.
+  //
+  // Every tag gets a fresh map and a fresh diff cursor, and the cursor
+  // never runs past MAX_COLOR_MISMATCH_MAPPED, the size of 'diffs'.
+  void printTag(FILE   *out,
+                uint64  tagID,
+                bool    isReverse,
+                uint32  basesMismatch,
+                uint32  colorMismatch,
+                uint32  colorInconsistent,
+                uint8  *diffs) {
+    uint16 id[4];
+    char   cor[128];
+    uint32 err = 0;
+
+#warning do not know real tag length
+    memset(cor, '.', 128);
+    cor[26] = 0;
+
+    for (uint32 x=0; (x < colorMismatch) && (err < MAX_COLOR_MISMATCH_MAPPED); x++, err++)
+      cor[diffs[err] & 0x3f] = '*';
+
+    for (uint32 x=0; (x < colorInconsistent) && (err < MAX_COLOR_MISMATCH_MAPPED); x++, err++)
+      cor[diffs[err] & 0x3f] = bitsToColor[diffs[err] >> 6];
+
+    decodeTagID(tagID, id);
+    fprintf(out, "F\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint32FMT"\t"uint32FMT"\t%c\t"uint32FMT"/"uint32FMT"/"uint32FMT"\t'%s'\n",
+            id[0], id[1], id[2], id[3],
+            _seq,
+            _pos,
+            isReverse ? 'r' : 'f',
+            basesMismatch,
+            colorMismatch,
+            colorInconsistent,
+            cor);
+  };
+
+public:
+  uint32 _seq;
+  uint32 _pos;
+  tapperResultQV _qual;
+};
+
+
+// Satisfied mate pair alignment.
+// _seq, _pos1 and _pos2 (32 bits each) followed by a tapperResultQV.
+//
+class tapperResultMated {
+public:
+  void print(FILE *out, tapperResultIndex *idx) {   // Writes this pair to 'out' as one "M" line; idx supplies the tag IDs.
+    uint16 id1[4];
+    uint16 id2[4];
+
+    decodeTagID(idx->_tag1id, id1);
+    decodeTagID(idx->_tag2id, id2);
+    // Tag 1 then tag 2, each as: ID, sequence, position, orientation, base/color/inconsistent mismatches.
+    fprintf(out, "M\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint32FMT"\t"uint32FMT"\t%c\t"uint32FMT"/"uint32FMT"/"uint32FMT"\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint32FMT"\t"uint32FMT"\t%c\t"uint32FMT"/"uint32FMT"/"uint32FMT"\n",
+            id1[0], id1[1], id1[2], id1[3],
+            _seq,
+            _pos1,
+            _qual._tag1rev ? 'r' : 'f',
+            _qual._tag1basesMismatch,
+            _qual._tag1colorMismatch,
+            _qual._tag1colorInconsistent,
+            id2[0], id2[1], id2[2], id2[3],
+            _seq,
+            _pos2,
+            _qual._tag2rev ? 'r' : 'f',
+            _qual._tag2basesMismatch,
+            _qual._tag2colorMismatch,
+            _qual._tag2colorInconsistent);
+  };
+
+public:
+  uint32 _seq;
+  uint32 _pos1;
+  uint32 _pos2;
+  tapperResultQV _qual;
+};
+
+
+// Tangled mate pair alignment.
+// 128 bits.
+//
+class tapperResultTangled {
+public:
+  void print(FILE *out, tapperResultIndex *idx) {   // Writes this tangle to 'out' as one "T" line; idx supplies the tag IDs.
+    uint16 id1[4];
+    uint16 id2[4];
+
+    decodeTagID(idx->_tag1id, id1);
+    decodeTagID(idx->_tag2id, id2);
+    // Tag 1 ID and count, tag 2 ID and count, then sequence, begin and end of the tangle.
+    fprintf(out, "T\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint32FMT"\t"uint16FMT"_"uint16FMT"_"uint16FMT"_"uint16FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\t"uint32FMT"\n",
+            id1[0], id1[1], id1[2], id1[3],
+            (uint32)_tag1count,
+            id2[0], id2[1], id2[2], id2[3],
+            (uint32)_tag2count,
+            _seq,
+            _bgn,
+            _end);
+  };
+
+public:
+  uint16 _tag1count; // Number of times tag1 is in here
+  uint16 _tag2count; // Number of times tag2 is in here
+  uint32 _seq; // Sequence we hit
+  uint32 _bgn; // Beginning location
+  uint32 _end; // Ending location
+};
+
+
+
+
+class tapperResult {
+public:
+  // Every buffer starts out empty (capacity zero, null pointer) and the
+  // index record is zero-filled.
+  tapperResult() :
+    fragMax(0), frag(0L),
+    singMax(0), sing(0L),
+    taliMax(0), tali(0L),
+    mateMax(0), mate(0L),
+    tangMax(0), tang(0L),
+    aqltMax(0), aqlt(0L)
+  {
+    memset(&idx, 0, sizeof(tapperResultIndex));
+  };
+
+  // Releases all six buffers.
+  ~tapperResult() {
+    delete [] frag;
+    delete [] sing;
+    delete [] tali;
+    delete [] mate;
+    delete [] tang;
+    delete [] aqlt;
+  };
+
+  tapperResultIndex idx;
+
+  // A single unmated alignment
+  uint32 fragMax;
+  tapperResultFragment *frag;
+
+  // Tag in a mate pair, mapped near the end of a sequence
+  uint32 singMax;
+  tapperResultFragment *sing;
+
+  // Tag in a mate pair, involved in a tangle
+  uint32 taliMax;
+  tapperResultFragment *tali;
+
+  // Happy mated tags
+  uint32 mateMax;
+  tapperResultMated *mate;
+
+  // Location of tangle
+  uint32 tangMax;
+  tapperResultTangled *tang;
+
+  // Alignment quality counts; tapperResultFile::read() sizes this with AQIlength()
+  uint32 aqltMax;
+  uint32 *aqlt;
+};
+
+
+
+
+
+class tapperAlignmentQualityHistogramIndices {
+public:
+  tapperAlignmentQualityHistogramIndices() {
+    for (uint32 i=0; i<16; i++)
+      for (uint32 j=0; j<16; j++) {
+        _indices[i][j] = 0L;
+        _length[i][j] = ~uint32ZERO;
+      }
+  };
+  ~tapperAlignmentQualityHistogramIndices() {
+    for (uint32 i=0; i<16; i++)
+      for (uint32 j=0; j<16; j++)
+        delete [] _indices[i][j];
+  };
+
+  // For a given maxColorError and maxBaseError (mapper parameters),
+  // maps between (numBaseMismatch, numColorMismatch, numColorError)
+  // and an index in an array.  Tables are built on first use and kept.
+  //
+  // All values must be less than 16.
+  // getLength() returns how many indices were assigned for the parameter pair.
+  uint32 getLength(uint32 maxBaseError, uint32 maxColorError) {
+    generate(maxBaseError, maxColorError);
+    assert(_length[maxBaseError][maxColorError] < ~uint32ZERO);
+    return(_length[maxBaseError][maxColorError]);
+  };
+  // getIndex() returns the index of one combination, or ~uint32ZERO if none was assigned to it.
+  uint32 getIndex(uint32 maxBaseError, uint32 maxColorError,
+                  uint32 numBaseMismatch, uint32 numColorMismatch, uint32 numColorError) {
+    generate(maxBaseError, maxColorError);
+    assert(_length[maxBaseError][maxColorError] < ~uint32ZERO);
+    assert(numBaseMismatch * 256 + numColorMismatch * 16 + numColorError < 16 * 16 * 16);
+    return(_indices[maxBaseError][maxColorError][numBaseMismatch * 256 + numColorMismatch * 16 + numColorError]);
+  };
+
+private:
+  void generate(uint32 maxBaseError, uint32 maxColorError) {
+    assert((maxBaseError < 16) && (maxColorError < 16));  // both index _indices[16][16]; maxColorError also bounds bmmin[16]
+    if (_indices[maxBaseError][maxColorError] != 0L)
+      return;
+
+    // min base mismatches for i color mismatches - the min is (I
+    // think always) the sum of the mins for the prime decomposition.
+    // 9 - 3,3,3 -> min 6 base mismatches
+    // 9 - 2,3,4 -> min 5 base mismatches
+    // 9 - 2,2,5 -> min 5 base mismatches
+    // 9 - 2,2,2,3 -> min 5 base mismatches
+    //
+    // max base mismatches is, for the most part, user defined, but 0
+    // color mismatches is only indexed with 0 base mismatches (see below).
+    //
+    // finally, it is impossible to have just one color mismatch.
+
+    // cm 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+    uint32 bmmin[16] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+    uint32 maxc = maxColorError; // max color errors given to the mapper
+    uint32 maxb = maxBaseError; // max base errors given to the mapper
+
+    uint32 index = 0;
+
+    uint32 *histogramIndices = _indices[maxBaseError][maxColorError] = new uint32 [16 * 16 * 16];
+
+    for (uint32 ii=0; ii < 16 * 16 * 16; ii++)
+      histogramIndices[ii] = ~uint32ZERO;
+
+    // A special case for 0.
+    for (uint32 ce=0; ce <= maxc; ce++) {
+      //fprintf(stderr, "histogramIndices["uint32FMT"/"uint32FMT"/"uint32FMT"] = "uint32FMT"\n", 0, 0, ce, index);
+      assert(0 * 256 + 0 * 16 + ce < 16 * 16 * 16);
+      histogramIndices[0 * 256 + 0 * 16 + ce] = index++;
+    }
+
+    for (uint32 cm=2; cm <= maxc; cm++)
+      for (uint32 ce=0; ce <= maxc - cm; ce++)
+        for (uint32 bm=bmmin[cm]; bm <= maxb; bm++) {
+          //fprintf(stderr, "histogramIndices["uint32FMT"/"uint32FMT"/"uint32FMT"] = "uint32FMT"\n", bm, cm, ce, index);
+          assert(bm * 256 + cm * 16 + ce < 16 * 16 * 16);
+          histogramIndices[bm * 256 + cm * 16 + ce] = index++;
+        }
+
+    _length[maxBaseError][maxColorError] = index;
+  };
+
+  // Per (maxBaseError, maxColorError): the 16*16*16 lookup table (0L until generated) and the number of indices assigned in it.
+
+  uint32 *_indices[16][16];
+  uint32 _length[16][16];
+};
+
+
+
+
+
+class tapperResultFile {
+public:
+  tapperResultFile(char *prefix, char mode) {   // 'r' opens an existing result directory, 'w' creates it if missing.
+    char filename[FILENAME_MAX];
+    if ((mode != 'r') && (mode != 'w'))
+      fprintf(stderr, "tapperResultFile()-- mode must be either 'r' or 'w'; you wanted '%c'\n", mode), exit(1);
+    if ((mode == 'r') && (!fileExists(prefix)))
+      fprintf(stderr, "tapperResultFile()-- result directory '%s' doesn't exist.\n", prefix), exit(1);
+    if (strlen(prefix) + sizeof("/tapperMappedTangledAlignment") > FILENAME_MAX)   // longest name built below
+      fprintf(stderr, "tapperResultFile()-- result directory name '%s' is too long.\n", prefix), exit(1);
+
+    if ((mode == 'w') && (!fileExists(prefix))) {
+      // Test mkdir()'s return value; errno is only meaningful once a call
+      // has reported failure.
+      if (mkdir(prefix, S_IRWXU | S_IRWXG | S_IRWXO) != 0)
+        fprintf(stderr, "tapperResultFile()-- failed to make result directory '%s': %s\n",
+                prefix, strerror(errno)), exit(1);
+    }
+    // Bounded formatting, so filename[] cannot be overrun.
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedIndex", prefix);
+    IDX = new recordFile(filename, 0, sizeof(tapperResultIndex), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedFragment", prefix);
+    FRAG = new recordFile(filename, 0, sizeof(tapperResultFragment), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedSingleton", prefix);
+    SING = new recordFile(filename, 0, sizeof(tapperResultFragment), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedTangledAlignment", prefix);
+    TALI = new recordFile(filename, 0, sizeof(tapperResultFragment), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedMated", prefix);
+    MATE = new recordFile(filename, 0, sizeof(tapperResultMated), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedTangled", prefix);
+    TANG = new recordFile(filename, 0, sizeof(tapperResultTangled), mode);
+
+    snprintf(filename, FILENAME_MAX, "%s/tapperMappedAlignQual", prefix);
+    AQLT = new recordFile(filename, 0, sizeof(uint32), mode);
+  };
+
+  ~tapperResultFile() {
+    delete IDX;
+    delete FRAG;
+    delete SING;
+    delete TALI;
+    delete MATE;
+    delete TANG;
+    delete AQLT;
+  };
+  // True if 'prefix' exists; nothing inside it is examined.
+  static
+  bool validResultFile(char *prefix) {
+    return(fileExists(prefix));
+  };
+  // Number of histogram entries stored per result for these mapper parameters.
+  uint32 AQIlength(uint32 maxBaseErrors, uint32 maxColorErrors) {
+    //fprintf(stderr, "AQIlength("uint32FMT","uint32FMT") -> "uint32FMT"\n",
+    //        maxBaseErrors, maxColorErrors, AQI.getLength(maxBaseErrors, maxColorErrors));
+    return(AQI.getLength(maxBaseErrors, maxColorErrors));
+  };
+  // Histogram slot for one (base mismatch, color mismatch, color error) combination.
+  uint32 AQIindex(uint32 maxBaseErrors, uint32 maxColorErrors,
+                  uint32 numBaseMismatch, uint32 numColorMismatch, uint32 numColorError) {
+    //fprintf(stderr, "AQIindex("uint32FMT","uint32FMT","uint32FMT","uint32FMT","uint32FMT") -> "uint32FMT"\n",
+    //        maxBaseErrors, maxColorErrors,
+    //        numBaseMismatch, numColorMismatch, numColorError,
+    //        AQI.getIndex(maxBaseErrors, maxColorErrors, numBaseMismatch, numColorMismatch, numColorError));
+    return(AQI.getIndex(maxBaseErrors, maxColorErrors, numBaseMismatch, numColorMismatch, numColorError));
+  };
+  // Reads the next index record and every record it counts into 'align', growing align's buffers as needed; false when no index record is left or a data file comes up short.
+  bool read(tapperResult *align) {
+    bool success = true;
+
+    if (IDX->getRecord(&align->idx) == 0)
+      return(false);
+
+    uint32 aqilen = AQIlength(align->idx._maxBaseMismatchMapped, align->idx._maxColrMismatchMapped);
+    // No histogram is stored for a result without alignments; write() applies the same test.
+    if (align->idx._numFrag +
+        align->idx._numFragDiscarded +
+        align->idx._numFragSingleton +
+        align->idx._numMated +
+        align->idx._numTangled == 0)
+      aqilen = 0;
+
+    if (align->idx._numFrag > align->fragMax) {
+      delete [] align->frag;
+      align->fragMax = align->idx._numFrag;
+      align->frag = new tapperResultFragment [align->fragMax];
+    }
+
+    if (align->idx._numFragSingleton > align->singMax) {
+      delete [] align->sing;
+      align->singMax = align->idx._numFragSingleton;
+      align->sing = new tapperResultFragment [align->singMax];
+    }
+
+    if (align->idx._numFragTangled > align->taliMax) {
+      delete [] align->tali;
+      align->taliMax = align->idx._numFragTangled;
+      align->tali = new tapperResultFragment [align->taliMax];
+    }
+
+    if (align->idx._numMated > align->mateMax) {
+      delete [] align->mate;
+      align->mateMax = align->idx._numMated;
+      align->mate = new tapperResultMated [align->mateMax];
+    }
+
+    if (align->idx._numTangled > align->tangMax) {
+      delete [] align->tang;
+      align->tangMax = align->idx._numTangled;
+      align->tang = new tapperResultTangled [align->tangMax];
+    }
+
+    if (aqilen > align->aqltMax) {
+      delete [] align->aqlt;
+      align->aqltMax = aqilen;
+      align->aqlt = new uint32 [align->aqltMax];
+    }
+
+#if 0
+    fprintf(stderr, "reading: "uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT" "uint32FMT"\n",
+            align->idx._numFrag,
+            align->idx._numFragSingleton,
+            align->idx._numFragTangled,
+            align->idx._numMated,
+            align->idx._numTangled);
+#endif
+
+    if (FRAG->getRecord(align->frag, align->idx._numFrag) != align->idx._numFrag)
+      success = false;
+    if (SING->getRecord(align->sing, align->idx._numFragSingleton) != align->idx._numFragSingleton)
+      success = false;
+    if (TALI->getRecord(align->tali, align->idx._numFragTangled) != align->idx._numFragTangled)
+      success = false;
+    if (MATE->getRecord(align->mate, align->idx._numMated) != align->idx._numMated)
+      success = false;
+    if (TANG->getRecord(align->tang, align->idx._numTangled) != align->idx._numTangled)
+      success = false;
+
+    if (AQLT->getRecord(align->aqlt, aqilen) != aqilen)
+      success = false;
+
+    return(success);
+  };
+  // Writes one result; the counts in the index record say how many entries of each array go out.
+  void write(tapperResult *align) {
+    write(&align->idx, align->frag, align->sing, align->tali, align->mate, align->tang, align->aqlt);
+  };
+
+  void write(tapperResultIndex *idx,
+             tapperResultFragment *frag,
+             tapperResultFragment *sing,
+             tapperResultFragment *tali,
+             tapperResultMated *mate,
+             tapperResultTangled *tang,
+             uint32 *aqlt) {
+    IDX->putRecord(idx);
+
+    FRAG->putRecord(frag, idx->_numFrag);
+    SING->putRecord(sing, idx->_numFragSingleton);
+    TALI->putRecord(tali, idx->_numFragTangled);
+    MATE->putRecord(mate, idx->_numMated);
+    TANG->putRecord(tang, idx->_numTangled);
+
+    if (idx->_numFrag +
+        idx->_numFragDiscarded +
+        idx->_numFragSingleton +
+        idx->_numMated +
+        idx->_numTangled > 0)
+      AQLT->putRecord(aqlt, AQIlength(idx->_maxBaseMismatchMapped, idx->_maxColrMismatchMapped));
+  };
+
+private:
+  tapperAlignmentQualityHistogramIndices AQI;
+
+  recordFile *IDX;
+  recordFile *FRAG;
+  recordFile *SING;
+  recordFile *TALI;
+  recordFile *MATE;
+  recordFile *TANG;
+  recordFile *AQLT;
+};
+
diff --git a/tapper/tapperTag.H b/tapper/tapperTag.H
new file mode 100644
index 0000000..6a91704
--- /dev/null
+++ b/tapper/tapperTag.H
@@ -0,0 +1,302 @@
+#include "bio++.H"
+
+
+// A single tag, binary encoded.
+//
+// The current ascii encoding for a 25bp tag needs 44 bytes for
+// sequence and 110 bytes for QVs -> 154 bytes per tag.
+//
+// Without QVs, we can fit up to a 60bp tag into 24 bytes, using a
+// int64 global id instead of the sequence name. At 24B per tag, 10x
+// human is 27GB.
+//
+// Including QVs, we now need to use 7 bits per bp, but we then
+// truncate QV's to a maximum of 32. Not really a problem, since all
+// the files I've seen have a QV from 4 to 32 inclusive.
+//
+// The infrastructure of the bitPackedFile is used, so all we need to
+// define is the number of words in our tapperTag (which, since we
+// already do a similar hack for a kMer, isn't so terrible).
+//
+// WORDS 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// BYTES 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128
+// TAG SIZE 0 7 17 26 35 44 53 62 71 81 90 99 108 117 126 135 (if 5 bits for qv)
+// TAG SIZE 0 6 14 22 30 38 46 54 62 70 78 86 94 102 110 118 (if 6 bits for qv)
+//
+
+#define TAPPER_TAG_WORDS 4 // uint64 words in tapperTag::_w; word 0 holds the tag ID
+#define TAG_LEN_MAX 32 // size of the tapperHit tag/ref COLOR and ACGT arrays
+
+
+// Packs the four 16-bit fields of a tag ID into one 64-bit word: id[0] lands
+// in the most significant 16 bits and id[3] in the least significant.
+// decodeTagID() is the inverse.
+//
+inline
+uint64
+encodeTagID(uint16 id[4]) {
+  uint64 packed = 0;
+
+  // Shift what we have up by one field, then drop the next field in below it.
+  for (uint32 field=0; field<4; field++)
+    packed = (packed << 16) | id[field];
+
+  return(packed);
+}
+
+
+// Splits a packed tag ID into its four 16-bit fields: id[0] receives the top 16 bits of tid, id[3] the bottom 16.
+inline
+void
+decodeTagID(uint64 tid, uint16 id[4]) {
+  // Peel fields off the low end, filling id[] from the back.
+  for (uint32 field=4; field-- > 0; tid >>= 16)
+    id[field] = tid & uint64MASK(16);
+}
+
+
+class tapperTag {
+public:
+  tapperTag() {};
+
+  // Expects seq to be "T01023030122303" and qlt to be 0 through 31;
+  // qlt[i-1] is the quality packed with seq[i].  Larger values are clamped to
+  // 31 in the caller's array.  Exits if seq is empty or does not fit in _w.
+  void encode(uint16 id[4], char *seq, uint64 *qlt) {
+    uint64 pos = 64;
+    uint32 len = strlen(seq);
+    uint32 i = 0;
+    if ((len == 0) || (2 + 7 + 7 * (uint64)(len - 1) > 64 * (TAPPER_TAG_WORDS - 1)))   // bits needed vs. bits after the ID word
+      fprintf(stderr, "tapperTag::encode()-- ERROR! Tag '%s' is empty or too long to encode.\n", seq), exit(1);
+    for (i=1; i<TAPPER_TAG_WORDS; i++)
+      _w[i] = uint64ZERO;
+    _w[0] = encodeTagID(id);
+
+    setDecodedValue(_w, pos, 2, letterToBits[seq[0]]);
+    pos += 2;
+
+    setDecodedValue(_w, pos, 7, len-1);
+    pos += 7;
+
+#undef DEBUG_TAG_ENCODING
+#ifdef DEBUG_TAG_ENCODING
+    fprintf(stderr, "tapperTag::encode()-- seq: %s id "uint64HEX" %c/%d len "uint32FMT" "uint64FMT" "uint64FMT"\n",
+            seq, _w[0], seq[0], seq[0], len, pos-7, getDecodedValue(_w, pos-7, 7));
+#endif
+
+    for (i=1; i<len; i++) {
+      if (qlt[i-1] > 31) {   // clamp the value that is actually packed below
+        //fprintf(stderr, "tapperTag::encode()-- WARNING! QV too big; "uint64FMT" truncated to 31.\n", qlt[i-1]);
+        qlt[i-1] = 31;
+      }
+      setDecodedValue(_w, pos, 7, (letterToBits[seq[i]] << 5) | (qlt[i-1]));
+      pos += 7;
+    }
+  };
+
+  // Decodes a tag into seq and qlt (as for encode()) and id, and returns the
+  // packed tag ID (_w[0]).  Writes length()+2 chars to seq (NUL included)
+  // and length()+1 values to qlt (the last one is 0).
+  uint64 decode(uint16 id[4], char *seq, uint64 *qlt) {
+    uint64 pos = 64;
+    uint32 len = 0;
+    uint32 i = 0;
+
+    seq[0] = bitsToLetter[getDecodedValue(_w, pos, 2)];
+    qlt[0] = 0;
+    pos += 2;
+
+    len = getDecodedValue(_w, pos, 7) + 1;
+    pos += 7;
+
+    for (i=1; i<len; i++) {
+      uint64 x = getDecodedValue(_w, pos, 7);
+      seq[i] = bitsToColor[(x >> 5) & 0x03];
+      qlt[i-1] = x & uint64MASK(5);
+      pos += 7;
+    }
+
+    seq[len] = 0;
+    qlt[len-1] = 0;
+
+#ifdef DEBUG_TAG_ENCODING
+    fprintf(stderr, "tapperTag::decode()-- seq: %s id "uint64HEX" %c/%d len "uint32FMT"\n",
+            seq, _w[0], seq[0], seq[0], len);
+#endif
+
+    decodeTagID(_w[0], id);
+
+    return(_w[0]);
+  };
+  // tagID() is the packed ID word; length() is the stored count, which encode() sets to strlen(seq)-1.
+  uint64 tagID(void) const { return(_w[0]); };
+  uint64 length(void) { return(getDecodedValue(_w, 66, 7)); };
+
+private:
+  friend class tapperTagFile;
+
+  uint64 _w[TAPPER_TAG_WORDS];
+};
+
+
+
+
+
+// Metadata that tapperTagFile keeps in the recordFile header of a tag file.
+class tapperTagFileMetaData {
+public:
+  uint32 tagSize(void)         { return _minTagLen; }   // the shortest tag length recorded
+  uint32 isPairedTagFile(void) { return _isPaired;  }
+  uint32 mean(void)            { return _mean;      }
+  uint32 stddev(void)          { return _stddev;    }
+
+  void setMeanStdDev(uint32 mean_, uint32 stddev_) {
+    _stddev = stddev_;  _mean = mean_;
+  }
+
+private:
+  friend class tapperTagFile;
+  uint32 _minTagLen;
+  uint32 _maxTagLen;
+  uint32 _tagWords;
+  uint32 _isPaired;
+  uint32 _isFragment;
+  uint32 _mean;
+  uint32 _stddev;
+};
+
+
+
+
+// Notes:
+//
+// 1 Stores EITHER mated tags or fragment tags, NEVER both in the same file.
+// 2 Variable tag size for every tag (even mated tags)
+// 3 QVs too.
+// 4 Random access
+// 5 Reads are assigned a 64-bit UID
+
+
+
+class tapperTagFile {
+public:
+  // Opens tag file 'name'.  A missing file in 'r' mode, or a header written with a different TAPPER_TAG_WORDS, is fatal.
+  tapperTagFile(char *name, char mode) {
+    if ((mode == 'r') && (fileExists(name) == false)) {
+      fprintf(stderr, "tapperTagFile()-- ERROR! Tag file '%s' doesn't exist.\n", name);
+      exit(1);
+    }
+
+    _tagFile = new recordFile(name, sizeof(tapperTagFileMetaData), TAPPER_TAG_WORDS * sizeof(uint64), mode);
+    _metaData = (tapperTagFileMetaData *)_tagFile->header();
+
+    if (_metaData->_tagWords == 0) {   // presumably a new file: header not yet initialized
+      _metaData->_minTagLen = ~uint32ZERO;
+      _metaData->_maxTagLen = 0;
+      _metaData->_tagWords = TAPPER_TAG_WORDS;
+      _metaData->_isPaired = 0;
+      _metaData->_isFragment = 0;
+      _metaData->_mean = 0;
+      _metaData->_stddev = 0;
+    }
+
+    if (_metaData->_tagWords != TAPPER_TAG_WORDS) {
+      fprintf(stderr, "tapperTagFile()-- ERROR! Tag file was built with TAPPER_TAG_WORDS="uint32FMT", but code has %d.\n",
+              _metaData->_tagWords, TAPPER_TAG_WORDS);
+      exit(1);
+    }
+  };
+
+
+  ~tapperTagFile() {
+    // Metadata is updated automagically when tagFile is deleted.
+    delete _tagFile;
+  };
+
+
+  tapperTagFileMetaData *metaData(void) {
+    return(_metaData);
+  };
+  // Record count if this is a fragment file, zero if it is a paired file.
+  uint64 numberOfFragmentTags(void) {
+    if (metaData()->isPairedTagFile())
+      return(0);
+    else
+      return(_tagFile->numRecords());
+  };
+  // Half the record count if this is a paired file (two records per pair), zero otherwise.
+  uint64 numberOfMatePairs(void) {
+    if (metaData()->isPairedTagFile())
+      return(_tagFile->numRecords() / 2);
+    else
+      return(0);
+  };
+  // bgn counts pairs in a paired file and tags otherwise; doubled in 64 bits so it cannot wrap.
+  void setBegin(uint32 bgn) {
+    if (metaData()->isPairedTagFile())
+      _tagFile->seek((uint64)bgn * 2);
+    else
+      _tagFile->seek(bgn);
+  };
+
+  // end is in the same units as setBegin()'s bgn.
+  void setEnd(uint32 end) {
+    if (metaData()->isPairedTagFile())
+      _tagFile->limit((uint64)end * 2);
+    else
+      _tagFile->limit(end);
+  };
+
+  // Appends one fragment tag and folds its length into the min/max in the header.
+  void put(tapperTag *tag) {
+    uint64 len = tag->length();
+
+    _metaData->_isFragment = 1;
+    if (_metaData->_isPaired)
+      fprintf(stderr, "tapperTagFile()-- ERROR: file contains mated tags, tried to put a fragment tag.\n"), exit(1);
+
+    if (len < _metaData->_minTagLen) _metaData->_minTagLen = len;
+    if (_metaData->_maxTagLen < len) _metaData->_maxTagLen = len;
+
+    _tagFile->putRecord(tag->_w);
+  };
+
+  // Appends a mate pair as two consecutive records and folds both lengths into the min/max in the header.
+  void put(tapperTag *ta1, tapperTag *ta2) {
+    uint64 len1 = ta1->length();
+    uint64 len2 = ta2->length();
+
+    _metaData->_isPaired = 1;
+    if (_metaData->_isFragment)
+      fprintf(stderr, "tapperTagFile()-- ERROR: file contains fragment tags, tried to put a mated tag.\n"), exit(1);
+
+    if (len1 < _metaData->_minTagLen) _metaData->_minTagLen = len1;
+    if (_metaData->_maxTagLen < len1) _metaData->_maxTagLen = len1;
+
+    if (len2 < _metaData->_minTagLen) _metaData->_minTagLen = len2;
+    if (_metaData->_maxTagLen < len2) _metaData->_maxTagLen = len2;
+
+    _tagFile->putRecord(ta1->_w);
+    _tagFile->putRecord(ta2->_w);
+  };
+
+  // Reads the next fragment tag; false if no record was read.
+  bool get(tapperTag *tag) {
+    if (_metaData->_isPaired == 1)
+      fprintf(stderr, "tapperTagFile()-- ERROR: file contains mated tags, tried to get a fragment tag.\n"), exit(1);
+    return(_tagFile->getRecord(tag->_w) == 1);
+  };
+
+  // Reads the next mate pair; false unless both records were read.
+  bool get(tapperTag *ta1, tapperTag *ta2) {
+    if (_metaData->_isFragment == 1)
+      fprintf(stderr, "tapperTagFile()-- ERROR: file contains fragment tags, tried to get a mated tag.\n"), exit(1);
+    return((_tagFile->getRecord(ta1->_w) == 1) &&
+           (_tagFile->getRecord(ta2->_w) == 1));
+  };
+
+private:
+  tapperTagFileMetaData *_metaData;
+  recordFile *_tagFile;
+};
+
diff --git a/tapper/tapperThreadData.H b/tapper/tapperThreadData.H
new file mode 100644
index 0000000..6940139
--- /dev/null
+++ b/tapper/tapperThreadData.H
@@ -0,0 +1,73 @@
+class tapperThreadData {
+public:
+  // Allocates the four position lists and the six per-tag arrays, each with
+  // room for 256 * 1024 entries.  g is not used here.
+  tapperThreadData(tapperGlobalData *g) {
+    const uint64 initialPositions = 256 * 1024;
+
+    posn1fLen = posn1rLen = posn2fLen = posn2rLen = 0;
+    posn1fMax = posn1rMax = posn2fMax = posn2rMax = initialPositions;
+
+    posn1f = new uint64 [initialPositions];
+    posn1r = new uint64 [initialPositions];
+    posn2f = new uint64 [initialPositions];
+    posn2r = new uint64 [initialPositions];
+
+    numHappiesMax = 256 * 1024;
+
+    tag1happies = new uint32 [numHappiesMax];
+    tag1mate = new uint32 [numHappiesMax];
+    tag1tangled = new uint32 [numHappiesMax];
+    tag2happies = new uint32 [numHappiesMax];
+    tag2mate = new uint32 [numHappiesMax];
+    tag2tangled = new uint32 [numHappiesMax];
+
+    tangle = 0L;
+  };
+
+  // Frees every array, including 'tangle', which is assigned outside this class.
+  ~tapperThreadData() {
+    delete [] posn1f;
+    delete [] posn1r;
+    delete [] posn2f;
+    delete [] posn2r;
+    delete [] tag1happies;
+    delete [] tag1mate;
+    delete [] tag1tangled;
+    delete [] tag2happies;
+    delete [] tag2mate;
+    delete [] tag2tangled;
+    delete [] tangle;
+  };
+
+public:
+  uint64 posn1fMax;
+  uint64 posn1fLen;
+  uint64 *posn1f;
+
+  uint64 posn1rMax;
+  uint64 posn1rLen;
+  uint64 *posn1r;
+
+  uint64 posn2fMax;
+  uint64 posn2fLen;
+  uint64 *posn2f;
+
+  uint64 posn2rMax;
+  uint64 posn2rLen;
+  uint64 *posn2r;
+
+  // Allocated size of each of the six tag arrays below.
+  uint32 numHappiesMax;
+
+  uint32 *tag1happies;
+  uint32 *tag1mate;
+  uint32 *tag1tangled;
+
+  uint32 *tag2happies;
+  uint32 *tag2mate;
+  uint32 *tag2tangled;
+
+  intervalList<uint64> *tangle;
+};
+
diff --git a/tapper/tapperconvert.C b/tapper/tapperconvert.C
new file mode 100644
index 0000000..eee580b
--- /dev/null
+++ b/tapper/tapperconvert.C
@@ -0,0 +1,101 @@
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+#include "tapperGlobalData.H"
+#include "tapperThreadData.H"
+#include "tapperComputation.H"
+
+int
+main(int argc, char **argv) {
+  char *resultName = 0L;
+
+  bool dumpIndex = false;
+  bool dumpFrag = false;
+  bool dumpSing = false;
+  bool dumpMate = false;
+  bool dumpTang = false;
+
+  bool allIndex = false;
+  // Options are matched on their first six characters only.
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-dumpindex", 6) == 0) {
+      dumpIndex = true;
+
+    } else if (strncmp(argv[arg], "-dumpfragments", 6) == 0) {
+      dumpFrag = true;
+
+    } else if (strncmp(argv[arg], "-dumpsingleton", 6) == 0) {
+      dumpSing = true;
+
+    } else if (strncmp(argv[arg], "-dumpmated", 6) == 0) {
+      dumpMate = true;
+
+    } else if (strncmp(argv[arg], "-dumptangled", 6) == 0) {
+      dumpTang = true;
+      // The usage text spells the next option "-allIndex", so take either capitalization.
+    } else if ((strncmp(argv[arg], "-allindex", 6) == 0) || (strncmp(argv[arg], "-allIndex", 6) == 0)) {
+      allIndex = true;
+
+    } else if (resultName == 0L) {
+      resultName = argv[arg];
+
+    } else {
+      err++;
+    }
+
+    arg++;
+  }
+  if ((err) || (resultName == 0L)) {
+    fprintf(stderr, "usage: %s [-dumpindex] [-dumpfragments] [-dumpsingletons] [-dumpmated] [-dumptangled] prefix\n", argv[0]);
+    fprintf(stderr, " -allIndex -- also dump index for unmapped fragments\n");
+    exit(1);
+  }
+
+  tapperResultFile *inp = new tapperResultFile(resultName, 'r');
+  tapperResult *res = new tapperResult;
+  // For each result: the index line if requested and relevant, then each requested kind of alignment.
+  while (inp->read(res)) {
+    if ((dumpIndex) &&
+        ((allIndex) ||
+         ((dumpFrag) && (res->idx._numFrag > 0)) ||
+         ((dumpFrag) && (res->idx._numFragDiscarded > 0)) ||
+         ((dumpSing) && (res->idx._numFragSingleton > 0)) ||
+         ((dumpMate) && (res->idx._numMated > 0)) ||
+         ((dumpTang) && (res->idx._numTangled > 0))))
+      res->idx.print(stdout);
+
+    if (dumpFrag)
+      for (uint32 i=0; i<res->idx._numFrag; i++)
+        res->frag[i].print(stdout, &res->idx);
+
+    if (dumpSing)
+      for (uint32 i=0; i<res->idx._numFragSingleton; i++)
+        res->sing[i].print(stdout, &res->idx);
+
+    if (dumpMate)
+      for (uint32 i=0; i<res->idx._numMated; i++)
+        res->mate[i].print(stdout, &res->idx);
+
+    if (dumpTang)
+      for (uint32 i=0; i<res->idx._numTangled; i++) {
+        res->tang[i].print(stdout, &res->idx);
+        // Follow each tangle with the tangled fragment alignments that fall inside it.
+        for (uint32 j=0; j<res->idx._numFragTangled; j++) {
+          if ((res->tang[i]._seq == res->tali[j]._seq) &&
+              (res->tang[i]._bgn <= res->tali[j]._pos) && (res->tali[j]._pos <= res->tang[i]._end)) {
+            res->tali[j].print(stdout, &res->idx);
+          }
+        }
+
+
+      }
+  }
+
+  delete inp;
+  delete res;
+
+  exit(0);
+}
diff --git a/tapper/tappererrorcorrect.C b/tapper/tappererrorcorrect.C
new file mode 100644
index 0000000..ad3bce7
--- /dev/null
+++ b/tapper/tappererrorcorrect.C
@@ -0,0 +1,277 @@
+#include "util++.H"
+
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+#include "tapperGlobalData.H"
+#include "tapperThreadData.H"
+#include "tapperComputation.H"
+
+
+
+// A window onto a recordFile of tapperAlignments.  Records are held
+// in alignsMax blocks of up to alignsPerBlock records each.  Blocks
+// are filled from the input in order; a block of length zero means
+// the input had nothing left when that block was filled.
+//
+class alignmentList {
+public:
+
+  // Allocate every block and fill it from 'inp'.  The file is not
+  // owned by this object: the caller keeps it open while the list is
+  // in use, and deletes it afterwards.
+  //
+  alignmentList(recordFile *inp) {
+    alignsMax      = 16;
+    aligns         = new tapperAlignment * [alignsMax];
+    alignsLen      = new uint32 [alignsMax];
+    alignsPerBlock = 16384;
+    alignsInp      = inp;
+
+    for (uint32 i=0; i<alignsMax; i++) {
+      aligns[i]    = new tapperAlignment [alignsPerBlock];
+      alignsLen[i] = alignsInp->getRecord(aligns[i], alignsPerBlock);
+      fprintf(stderr, "block " uint32FMT " has " uint32FMT " things.\n", i, alignsLen[i]);
+    }
+  };
+
+  // Release the blocks, the block table, and the block lengths (the
+  // lengths were previously leaked).
+  //
+  ~alignmentList() {
+    for (uint32 i=0; i<alignsMax; i++)
+      delete [] aligns[i];
+
+    delete [] aligns;
+    delete [] alignsLen;
+  };
+
+  // While the last element in the first block is below the specified
+  // seq,pos, dump all the alignments in that block, shift the
+  // remaining blocks down by one, and refill the freed block from
+  // the input.  Stops when the first block is empty.
+  //
+  void  trimBeforeSeqPos(uint32 seq, uint32 pos) {
+    while (alignsLen[0] > 0) {
+      tapperAlignment *last = aligns[0] + alignsLen[0] - 1;
+
+      // Order by sequence first, then by position.  Testing
+      // (_seq <= seq) && (_pos < pos) would keep a block that lies
+      // entirely on an earlier sequence but ends at a larger position.
+      bool  before = ((last->_seq <  seq) ||
+                      ((last->_seq == seq) && (last->_pos < pos)));
+
+      if (before == false)
+        return;
+
+      fprintf(stderr, "block[0] - seq " uint32FMT " pos " uint32FMT "\n",
+              last->_seq,
+              last->_pos);
+
+      // Recycle the storage of the first block as the new last block.
+      tapperAlignment *save = aligns[0];
+
+      for (uint32 i=1; i<alignsMax; i++) {
+        aligns[i-1]    = aligns[i];
+        alignsLen[i-1] = alignsLen[i];
+      }
+
+      aligns[alignsMax-1]    = save;
+      alignsLen[alignsMax-1] = alignsInp->getRecord(save, alignsPerBlock);
+
+      fprintf(stderr, "block " uint32FMT " has " uint32FMT " things.\n", alignsMax-1, alignsLen[alignsMax-1]);
+    }
+  };
+
+  // Return the x'th alignment currently loaded, or 0L if x is past
+  // the data in its block or past the last block.
+  //
+  tapperAlignment *operator[](uint32 x) {
+    uint32 block = x / alignsPerBlock;
+    uint32 piece = x % alignsPerBlock;
+
+    // Without this, indices beyond the window read past the tables.
+    if (block >= alignsMax)
+      return(0L);
+
+    if (piece < alignsLen[block])
+      return(aligns[block] + piece);
+
+    return(0L);
+  };
+
+  // True when the first block holds no alignments.
+  //
+  bool  empty(void) {
+    return(alignsLen[0] == 0);
+  };
+
+private:
+  uint32             alignsMax;       // number of blocks
+
+  tapperAlignment  **aligns;          // the blocks
+  uint32            *alignsLen;       // number of records in each block
+
+  uint32             alignsPerBlock;  // capacity of each block
+
+  recordFile        *alignsInp;       // where more records come from; not owned
+};
+
+
+
+
+
+// Draws a text picture of the alignments in a position-sorted
+// tapperAlignment file, one fixed-size window of the reference at a
+// time.  Each read is a run of '.', with '*' at color mismatches and
+// the color letter at inconsistent colors.
+//
+// Only positions are examined; the sequence of each alignment is not
+// checked, and trimming is always done against sequence 0.
+//
+int
+main(int argc, char **argv) {
+  char    *outputName  = 0L;
+  char    *inputName   = 0L;
+
+  // Parsed so the command line matches the other tools; not used below.
+  uint64   memoryLimit = 1024 * 1024 * 1024;
+
+  {
+    int arg=1;
+    int err=0;
+    while (arg < argc) {
+
+      // Every option takes a value.  An option given as the last word
+      // is an error, instead of reading past the end of argv.
+      bool  hasValue = (arg + 1 < argc);
+
+      if        ((hasValue) && (strncmp(argv[arg], "-memory", 2) == 0)) {
+        memoryLimit = strtouint64(argv[++arg], 0L) * 1024 * 1024;
+
+      } else if ((hasValue) && (strncmp(argv[arg], "-output", 2) == 0)) {
+        outputName = argv[++arg];
+
+      } else if ((hasValue) && (strncmp(argv[arg], "-input", 2) == 0)) {
+        inputName = argv[++arg];
+
+      } else {
+        err++;
+      }
+
+      arg++;
+    }
+    if ((err) || (inputName == 0) || (outputName == 0L)) {
+      fprintf(stderr, "usage: %s [-memory X (MB)] -output prefix -input inp.tapperAlignment\n", argv[0]);
+      exit(1);
+    }
+  }
+
+  recordFile     *inp = new recordFile(inputName, 0, sizeof(tapperAlignment), 'r');
+  alignmentList   all(inp);
+
+  uint32  winSz = 200;
+  uint32  winLo = 0;
+  uint32  winHi = winLo + winSz;
+
+  const uint32  linesMax  = 1024;
+  const uint32  lineWidth = 256;
+  const uint32  readLen   = 25;
+
+  char    lines[linesMax][lineWidth];
+  uint32  lineLen[linesMax];
+
+  uint16  id[4];  // only used when WITH_IDS is defined
+
+  while (all.empty() == false) {
+    memset(lines, ' ', sizeof(char) * linesMax * lineWidth);
+
+    for (uint32 i=0; i<linesMax; i++)
+      lineLen[i] = 0;
+
+    for (uint32 a=0; (all[a] != 0L) && (all[a]->_pos < winHi); a++) {
+      tapperAlignment *rec = all[a];
+
+      // XXX we lose reads that wrap into our region
+
+      // Reads starting exactly at winLo belong to this window.  They
+      // used to be skipped here and, being at winHi of the previous
+      // window, were never drawn at all.
+      if (rec->_pos < winLo)
+        continue;
+
+      uint32  off = rec->_pos - winLo;
+
+      // Put the read on the first line that is empty, or whose
+      // contents end before this read starts.
+      for (uint32 l=0; l<linesMax; l++) {
+        if ((lineLen[l] != 0) && (lineLen[l] >= off))
+          continue;
+
+#warning need the real read size here
+
+        for (uint32 x=off; x<off + readLen; x++)
+          lines[l][x] = '.';
+
+        // Needed so we can disable ID printing.
+        lines[l][off + readLen] = 0;
+
+#undef WITH_IDS
+#ifdef WITH_IDS
+        decodeTagID(rec->_tagid, id);
+
+        sprintf(lines[l] + off + readLen, " %c " uint16FMTW(05) "-" uint16FMTW(05) "-" uint16FMTW(05) "-" uint16FMTW(05) " ",
+                (rec->_rev) ? '<' : '>',
+                id[0], id[1], id[2], id[3]);
+#endif
+
+        lineLen[l] = strlen(lines[l]);  // Convert that trailing nul into a whitespace.
+        lines[l][lineLen[l]] = ' ';
+
+        // The color differences are stored mismatches first, then
+        // inconsistencies; the low six bits are the offset in the
+        // read, the high bits the color.  Offsets come from the data,
+        // so keep them on the line.
+        uint32 err = 0;
+
+        for (uint32 x=0; x<rec->_colorMismatch; x++, err++) {
+          uint32 col = off + (rec->_colorDiffs[err] & 0x3f);
+
+          if (col < lineWidth)
+            lines[l][col] = '*';
+        }
+
+        for (uint32 x=0; x<rec->_colorInconsistent; x++, err++) {
+          uint32 col = off + (rec->_colorDiffs[err] & 0x3f);
+
+          if (col < lineWidth)
+            lines[l][col] = bitsToColor[rec->_colorDiffs[err] >> 6];
+        }
+
+        break;
+      }
+    }
+
+    bool stuff = false;
+
+    for (uint32 i=0; i<linesMax; i++)
+      if (lineLen[i] > 0)
+        stuff = true;
+
+    if (stuff) {
+      fprintf(stdout, "\nALIGN " uint32FMT "-" uint32FMT "\n", winLo, winHi);
+      fprintf(stdout, " 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0\n");
+      fprintf(stdout, " 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890\n");
+
+      for (uint32 i=0; i<linesMax; i++) {
+        if (lineLen[i] > 0) {
+          lines[i][lineLen[i]] = 0;
+          fprintf(stdout, uint32FMTW(03) "] %s\n", i, lines[i]);
+        }
+      }
+    }
+
+    winLo = winHi;
+    winHi = winLo + winSz;
+
+    all.trimBeforeSeqPos(0, winLo);
+  }
+
+  delete inp;
+
+  exit(0);
+}
+
+
+
+
+
+
+
+
+
+
+
+#if 0
+ sprintf(linp, "rec "uint64HEX" "uint32FMT":"uint32FMT,
+ rec->_tagid,
+ rec->_seq,
+ rec->_pos);
+ while (*linp)
+ linp++;
+
+ uint32 err = 0;
+
+ for (uint32 x=0; x<rec->_colorMismatch; x++) {
+ sprintf(linp, " M:%c@%02d(%07d)",
+ bitsToColor[rec->_colorDiffs[err] >> 6],
+ (rec->_colorDiffs[err] & 0x3f),
+ (rec->_colorDiffs[err] & 0x3f) + rec->_pos);
+ while (*linp)
+ linp++;
+ err++;
+ }
+
+ for (uint32 x=0; x<rec->_colorInconsistent; x++) {
+ sprintf(linp, " E:%c@%02d(%07d)",
+ bitsToColor[rec->_colorDiffs[err] >> 6],
+ (rec->_colorDiffs[err] & 0x3f),
+ (rec->_colorDiffs[err] & 0x3f) + rec->_pos);
+ while (*linp)
+ linp++;
+ err++;
+ }
+
+ fprintf(stdout, "%s\n", line);
+#endif
diff --git a/tapper/tappermerge.C b/tapper/tappermerge.C
new file mode 100644
index 0000000..3c11d25
--- /dev/null
+++ b/tapper/tappermerge.C
@@ -0,0 +1,60 @@
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+#include "tapperGlobalData.H"
+#include "tapperThreadData.H"
+#include "tapperComputation.H"
+
+// Concatenates the results in any number of tapperResultFiles into a
+// single new tapperResultFile.
+//
+//   -output out-directory   where to write the merged results (required)
+//   in-directory ...        the results to merge, copied in the order given
+//
+int
+main(int argc, char **argv) {
+  const uint32  inputsMax = 8192;
+
+  char         *outName   = 0L;
+  uint32        inputsLen = 0;
+  char         *inputs[inputsMax];
+
+  //  Parse and check the inputs.
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+    if (strncmp(argv[arg], "-output", 2) == 0) {
+      //  The option needs a value; do not step past the end of argv.
+      if (arg + 1 < argc)
+        outName = argv[++arg];
+      else
+        err++;
+
+    } else if (tapperResultFile::validResultFile(argv[arg]) == false) {
+      fprintf(stderr, "Didn't find tapperResultFile '%s'\n", argv[arg]);
+      err++;
+
+    } else if (inputsLen >= inputsMax) {
+      //  The list of inputs is a fixed size array.
+      fprintf(stderr, "Too many inputs; at most " uint32FMT " are supported.\n", inputsMax);
+      err++;
+
+    } else {
+      inputs[inputsLen++] = argv[arg];
+    }
+
+    arg++;
+  }
+  //  Without an output name there is nothing to open below.
+  if ((err) || (inputsLen == 0) || (outName == 0L)) {
+    fprintf(stderr, "usage: %s -output out-directory in-directory [in-directory ...]\n", argv[0]);
+    exit(1);
+  }
+
+  //  Open the output file
+
+  tapperResultFile   *out = new tapperResultFile(outName, 'w');
+
+  //  Loop over the inputs, copying to the output.  We could be much
+  //  looser here, just blindly copying all records in each file, but
+  //  we'll be a little more careful, and copy frag by frag.
+
+  for (uint32 inputsIdx=0; inputsIdx<inputsLen; inputsIdx++) {
+    tapperResultFile  *inp = new tapperResultFile(inputs[inputsIdx], 'r');
+    tapperResult      *res = new tapperResult;
+
+    while (inp->read(res))
+      out->write(res);
+
+    delete inp;
+    delete res;
+  }
+
+  delete out;
+
+  exit(0);
+}
diff --git a/tapper/tappersort.C b/tapper/tappersort.C
new file mode 100644
index 0000000..fa032f2
--- /dev/null
+++ b/tapper/tappersort.C
@@ -0,0 +1,308 @@
+#include "util++.H"
+
+#include "tapperTag.H"
+#include "tapperResult.H"
+#include "tapperAlignment.H"
+#include "tapperHit.H"
+#include "tapperGlobalData.H"
+#include "tapperThreadData.H"
+#include "tapperComputation.H"
+
+
+
+// Reads one or more tapperResultFiles, converts all the alignments to
+// tapperAlignments (loses mate pair information), and sorts by
+// position on the reference.
+
+// There are (at least) two ways to sort. Merge sort or bucket sort.
+//
+// Bucket sort is a little easier, but, without knowing the length of
+// the reference sequences, we cannot map seq,pos to a bucket. We
+// also have no memory guarantee; it is possible to have a bucket get
+// too big.
+//
+// Merge sort is more difficult, because of the merge. We have a
+// memory size guarantee though.
+
+
+// Appends one tapperAlignment to 'ali' (starting at index aliLen)
+// for every valid tag in the fragLen fragments at 'frag'.  The tag
+// ids come from the index of 'res'.  Returns the new number of
+// alignments in 'ali'.
+//
+// The caller must have room in 'ali' for the alignments added.
+//
+uint32
+saveFrag(tapperAlignment *ali, uint32 aliLen, tapperResult *res, uint32 fragLen, tapperResultFragment *frag) {
+  uint32  fi = 0;
+
+  while (fi < fragLen) {
+    tapperResultFragment  *cur = frag + fi;
+
+    fi++;
+
+    //  A fragment describes exactly one tag: at least one is valid,
+    //  and at least one is not.
+
+    if ((cur->_qual._tag1valid == 0) && (cur->_qual._tag2valid == 0))
+      fprintf(stderr, "error\n");
+
+    assert((cur->_qual._tag1valid == 1) || (cur->_qual._tag2valid == 1));
+    assert((cur->_qual._tag1valid == 0) || (cur->_qual._tag2valid == 0));
+
+    if (cur->_qual._tag1valid) {
+      tapperAlignment  *dst = ali + aliLen;
+
+      memset(dst, 0, sizeof(tapperAlignment));
+
+      dst->_tagid             = res->idx._tag1id;
+      dst->_seq               = cur->_seq;
+      dst->_pos               = cur->_pos;
+      dst->_basesMismatch     = cur->_qual._tag1basesMismatch;
+      dst->_colorMismatch     = cur->_qual._tag1colorMismatch;
+      dst->_colorInconsistent = cur->_qual._tag1colorInconsistent;
+      dst->_rev               = cur->_qual._tag1rev;
+      dst->_diffSize          = cur->_qual._diffSize;
+
+      memcpy(dst->_colorDiffs,
+             cur->_qual._tag1colorDiffs,
+             sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+
+      aliLen++;
+    }
+
+    if (cur->_qual._tag2valid) {
+      tapperAlignment  *dst = ali + aliLen;
+
+      memset(dst, 0, sizeof(tapperAlignment));
+
+      dst->_tagid             = res->idx._tag2id;
+      dst->_seq               = cur->_seq;
+      dst->_pos               = cur->_pos;
+      dst->_basesMismatch     = cur->_qual._tag2basesMismatch;
+      dst->_colorMismatch     = cur->_qual._tag2colorMismatch;
+      dst->_colorInconsistent = cur->_qual._tag2colorInconsistent;
+      dst->_rev               = cur->_qual._tag2rev;
+      dst->_diffSize          = cur->_qual._diffSize;
+
+      memcpy(dst->_colorDiffs,
+             cur->_qual._tag2colorDiffs,
+             sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+
+      aliLen++;
+    }
+  }
+
+  return(aliLen);
+}
+
+
+
+// Appends two tapperAlignments to 'ali' (starting at index aliLen)
+// for every mated result in 'res': the first tag at _pos1, then the
+// second tag at _pos2, both on the sequence of the mate.  Returns
+// the new number of alignments in 'ali'.
+//
+// The caller must have room in 'ali' for two alignments per mate.
+//
+uint32
+saveMate(tapperAlignment *ali, uint32 aliLen, tapperResult *res) {
+  uint32  mi = 0;
+
+  while (mi < res->idx._numMated) {
+    tapperResultMated  *cur = res->mate + mi;
+    tapperAlignment    *dst = 0L;
+
+    mi++;
+
+    //  The first tag.
+
+    dst = ali + aliLen;
+
+    memset(dst, 0, sizeof(tapperAlignment));
+
+    dst->_tagid             = res->idx._tag1id;
+    dst->_seq               = cur->_seq;
+    dst->_pos               = cur->_pos1;
+    dst->_basesMismatch     = cur->_qual._tag1basesMismatch;
+    dst->_colorMismatch     = cur->_qual._tag1colorMismatch;
+    dst->_colorInconsistent = cur->_qual._tag1colorInconsistent;
+    dst->_rev               = cur->_qual._tag1rev;
+    dst->_diffSize          = cur->_qual._diffSize;
+
+    memcpy(dst->_colorDiffs,
+           cur->_qual._tag1colorDiffs,
+           sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+
+    aliLen++;
+
+    //  The second tag.
+
+    dst = ali + aliLen;
+
+    memset(dst, 0, sizeof(tapperAlignment));
+
+    dst->_tagid             = res->idx._tag2id;
+    dst->_seq               = cur->_seq;
+    dst->_pos               = cur->_pos2;
+    dst->_basesMismatch     = cur->_qual._tag2basesMismatch;
+    dst->_colorMismatch     = cur->_qual._tag2colorMismatch;
+    dst->_colorInconsistent = cur->_qual._tag2colorInconsistent;
+    dst->_rev               = cur->_qual._tag2rev;
+    dst->_diffSize          = cur->_qual._diffSize;
+
+    memcpy(dst->_colorDiffs,
+           cur->_qual._tag2colorDiffs,
+           sizeof(uint8) * MAX_COLOR_MISMATCH_MAPPED);
+
+    aliLen++;
+  }
+
+  return(aliLen);
+}
+
+
+
+
+// Sorts the aliLen alignments in 'ali' by position and writes them
+// to the file 'outputName.NNN.tapperAlignment', where NNN is
+// outputIndex; outputIndex is then incremented.  Does nothing if
+// there are no alignments.
+//
+// Always returns zero, the number of alignments left in 'ali', so
+// the caller can write "aliLen = sortAndDump(...)".
+//
+uint32
+sortAndDump(tapperAlignment *ali, uint32 aliLen, char *outputName, uint32 &outputIndex) {
+  char    filename[FILENAME_MAX];
+
+  if (aliLen == 0)
+    return(0);
+
+  tapperAlignmentPositionCompare pc;
+  std::sort(ali, ali + aliLen, pc);
+
+  //  The prefix comes from the command line; do not let it run off
+  //  the end of the buffer.
+  int  len = snprintf(filename, FILENAME_MAX, "%s." uint32FMTW(03) ".tapperAlignment", outputName, outputIndex);
+
+  if ((len < 0) || (len >= FILENAME_MAX))
+    fprintf(stderr, "ERROR: output name '%s' is too long.\n", outputName), exit(1);
+
+  fprintf(stderr, "Writing " uint32FMT " sorted alignments to '%s'\n", aliLen, filename);
+
+  recordFile  *out = new recordFile(filename, 0, sizeof(tapperAlignment), 'w');
+  out->putRecord(ali, aliLen);
+  delete out;
+
+  outputIndex++;
+
+  return(0);
+}
+
+
+
+// Converts every alignment in the input tapperResultFiles to a
+// tapperAlignment and writes them, sorted by position, to
+// 'prefix.tapperAlignment'.
+//
+// The work is done in two passes.  First, alignments are collected
+// until the memory limit is reached, then sorted and written to a
+// numbered partial file.  Second, the partial files are merged into
+// the final output and removed.
+//
+//   -memory X     memory to use for sorting, in MB
+//   -output P     prefix of the output (required)
+//   input ...     the tapperResultFiles to sort (at least one)
+//
+int
+main(int argc, char **argv) {
+  const uint32  inputsMax   = 8192;
+
+  char         *outputName  = 0L;
+  uint32        outputIndex = 0;
+  uint32        inputsLen   = 0;
+  char         *inputs[inputsMax];
+  uint64        memoryLimit = 1024 * 1024 * 1024;
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+
+    //  Options take a value; one given as the last word is an error,
+    //  instead of reading past the end of argv.
+    bool  hasValue = (arg + 1 < argc);
+
+    if        (strncmp(argv[arg], "-memory", 2) == 0) {
+      if (hasValue)
+        memoryLimit = strtouint64(argv[++arg], 0L) * 1024 * 1024;
+      else
+        err++;
+
+    } else if (strncmp(argv[arg], "-output", 2) == 0) {
+      if (hasValue)
+        outputName = argv[++arg];
+      else
+        err++;
+
+    } else if (tapperResultFile::validResultFile(argv[arg]) == false) {
+      fprintf(stderr, "Didn't find tapperResultFile '%s'\n", argv[arg]);
+      err++;
+
+    } else if (inputsLen >= inputsMax) {
+      //  The list of inputs is a fixed size array.
+      fprintf(stderr, "Too many inputs; at most " uint32FMT " are supported.\n", inputsMax);
+      err++;
+
+    } else {
+      inputs[inputsLen++] = argv[arg];
+    }
+
+    arg++;
+  }
+  if ((err) || (inputsLen == 0) || (outputName == 0L)) {
+    fprintf(stderr, "usage: %s [-memory X (MB)] -output prefix input ....\n", argv[0]);
+    exit(1);
+  }
+
+
+  //
+  //  Collect, sort, and write partial files.
+  //
+
+  {
+    //  Alignments are counted with 32 bits; do not let a large memory
+    //  limit wrap around to a small buffer.
+    uint64            aliFit = memoryLimit / sizeof(tapperAlignment);
+
+    if (aliFit > (uint64)0xffffffffu)
+      aliFit = (uint64)0xffffffffu;
+
+    uint32            aliMax = (uint32)aliFit;
+    uint32            aliLen = 0;
+    tapperAlignment  *ali    = new tapperAlignment [aliMax];
+
+    fprintf(stderr, "Can fit " uint32FMT " alignments into " uint64FMT " bytes memory; " uint32FMT " bytes each.\n",
+            aliMax, memoryLimit, (uint32)sizeof(tapperAlignment));
+
+    speedCounter  S(" %10.0f results (%8.0f results/sec)\r", 1, 100000, true);
+
+    for (uint32 inputsIdx=0; inputsIdx<inputsLen; inputsIdx++) {
+      tapperResultFile  *inp = new tapperResultFile(inputs[inputsIdx], 'r');
+      tapperResult      *res = new tapperResult;
+
+      while (inp->read(res)) {
+
+        //  Fragments give one alignment each, mates give two.
+        uint64  needed = ((uint64)res->idx._numFrag +
+                          (uint64)res->idx._numFragSingleton +
+                          (uint64)res->idx._numFragTangled +
+                          (uint64)res->idx._numMated * 2);
+
+        //  A result that cannot fit even in an empty buffer would
+        //  be written past the end of it.
+        if (needed > aliMax) {
+          fprintf(stderr, "\nERROR: a result has " uint64FMT " alignments, but only " uint32FMT " fit in memory; increase -memory.\n",
+                  needed, aliMax);
+          exit(1);
+        }
+
+        //  Sort and dump if the next result has too many alignments.
+        //
+        if (aliMax < aliLen + needed)
+          aliLen = sortAndDump(ali, aliLen, outputName, outputIndex);
+
+        aliLen = saveFrag(ali, aliLen, res, res->idx._numFrag,          res->frag);
+        aliLen = saveFrag(ali, aliLen, res, res->idx._numFragSingleton, res->sing);
+        aliLen = saveFrag(ali, aliLen, res, res->idx._numFragTangled,   res->tali);
+        aliLen = saveMate(ali, aliLen, res);
+
+        S.tick();
+      }
+      S.finish();
+
+      delete inp;
+      delete res;
+    }
+
+    aliLen = sortAndDump(ali, aliLen, outputName, outputIndex);
+
+    delete [] ali;
+  }
+
+  //
+  //  Now the merge.
+  //
+
+  {
+    char    filename[FILENAME_MAX];
+
+    //  One loaded alignment, and one open file, per partial file.  A
+    //  file pointer of 0L means that file is finished.
+    tapperAlignment  *ali = new tapperAlignment [outputIndex];
+    recordFile      **inp = new recordFile *    [outputIndex];
+    recordFile       *out = 0L;
+
+    bool              stillMore = false;
+    uint32            minidx    = 0;
+
+    tapperAlignmentPositionCompare  lessthan;
+
+    for (uint32 x=0; x<outputIndex; x++) {
+      snprintf(filename, FILENAME_MAX, "%s." uint32FMTW(03) ".tapperAlignment", outputName, x);
+      inp[x] = new recordFile(filename, 0, sizeof(tapperAlignment), 'r');
+
+      //  A file with nothing in it is finished already.
+      if (inp[x]->getRecord(ali + x) == 0) {
+        delete inp[x];
+        inp[x] = 0L;
+      }
+    }
+
+    //  Start from any file that has data.  With no partial files at
+    //  all (the inputs had no alignments) there is nothing to merge,
+    //  and the output is left empty.
+    for (uint32 x=0; x<outputIndex; x++)
+      if (inp[x] != 0L) {
+        minidx    = x;
+        stillMore = true;
+      }
+
+    snprintf(filename, FILENAME_MAX, "%s.tapperAlignment", outputName);
+    out = new recordFile(filename, 0, sizeof(tapperAlignment), 'w');
+
+
+    while (stillMore) {
+
+      //  Compare all against the current default minidx, pick the
+      //  smallest alignment currently loaded.
+      for (uint32 x=0; x<outputIndex; x++)
+        if ((x != minidx) && (inp[x] != 0L) && (lessthan(ali[x], ali[minidx])))
+          minidx = x;
+
+      //  Dump it.
+      out->putRecord(ali + minidx);
+
+      //  Read the next record.  If no next record, close the file,
+      //  and pick a new default minidx
+      if (inp[minidx]->getRecord(ali + minidx) == 0) {
+        delete inp[minidx];
+        inp[minidx] = 0L;
+
+        stillMore = false;
+
+        for (uint32 x=0; x<outputIndex; x++)
+          if (inp[x] != 0L) {
+            minidx    = x;
+            stillMore = true;
+          }
+      }
+    }
+
+    delete out;
+
+    //  Remove the partial files.
+    for (uint32 x=0; x<outputIndex; x++) {
+      assert(inp[x] == 0L);
+
+      snprintf(filename, FILENAME_MAX, "%s." uint32FMTW(03) ".tapperAlignment", outputName, x);
+      unlink(filename);
+    }
+
+    delete [] inp;
+    delete [] ali;
+  }
+
+  exit(0);
+}
diff --git a/trie/Make.include b/trie/Make.include
new file mode 100644
index 0000000..5ba462f
--- /dev/null
+++ b/trie/Make.include
@@ -0,0 +1,21 @@
+# -*- makefile -*-
+# Resolved paths of the sibling library directories, relative to $/.
+LIBUTL/ :=$(realpath $/../libutil/)/
+LIBBIO/ :=$(realpath $/../libbio/)/
+LIBSEQ/ :=$(realpath $/../libseq/)/
+LIBMERYL/ :=$(realpath $/../libmeryl/)/
+LIBKMER/ :=$(realpath $/../libkmer/)/
+# Source files in this directory.
+src := $/trie.C
+# The .C sources and the executable; presumably consumed by the shared make rules -- confirm.
+$/.CXX_SRCS := $(filter %.C,${src})
+$/.CXX_EXES := $/trie
+# Object files listed for cleaning; presumably read by a shared clean rule -- confirm.
+$/.CLEAN :=$/*.o
+# Add the library directories to the include path for dependency and object files here.
+$(eval $/%.d $/%.o: CXXFLAGS+= -I${LIBUTL/} -I${LIBBIO/} -I${LIBSEQ/} -I${LIBMERYL/} -I${LIBKMER/})
+# trie needs its objects and three libraries. NOTE(review): libmeryl and libkmer are on the include path but not listed here -- confirm intended.
+$/trie: ${$/.CXX_SRCS:.C=.o} \
+ ${LIBSEQ/}libseq.a \
+ ${LIBBIO/}libbio.a \
+ ${LIBUTL/}libutil.a
diff --git a/trie/trie.C b/trie/trie.C
new file mode 100644
index 0000000..a1cc1a5
--- /dev/null
+++ b/trie/trie.C
@@ -0,0 +1,330 @@
+#include "util++.H"
+#include "bio++.H"
+
+//#define ALPHALEN 4
+#define ALPHALEN 20
+
+// NOTE that our list of letters is not alphabetic. The DNA letters
+// are first, then the rest of the protein letters.
+//
+const char *trieAlpha = "acgtdefhiklmnpqrsvwy";
+uint32 trieAlphaMap[256] = {0};
+
+// One node of the trie.  A freshly made node has no children and no
+// sequences; ~uint32ZERO marks "nothing here" for both kinds of link.
+//
+class trieNode {
+public:
+  trieNode(void) : numseq(0), seqptr(~uint32ZERO) {
+    uint32  letter = ALPHALEN;
+
+    while (letter > 0)
+      next[--letter] = ~uint32ZERO;
+  };
+
+  uint32  next[ALPHALEN];  //  index of the child for each letter, ~uint32ZERO if none
+  uint32  numseq;          //  how many sequences end at this node
+  uint32  seqptr;          //  index of the first of those sequences
+};
+
+
+// Ties one sequence to the trie node where it ends.  Starts out
+// pointing at nothing, with no defline, in the forward orientation.
+//
+class trieSeqPtr {
+public:
+  trieSeqPtr(void) :
+    seqiid(~uint32ZERO),
+    nodeiid(~uint32ZERO),
+    defline(0L),
+    reversed(false) {
+  };
+
+  uint32   seqiid;    //  IID of the sequence
+  uint32   nodeiid;   //  index of the trie node the sequence ends at
+  char    *defline;   //  header of the sequence
+  bool     reversed;  //  true if this entry is for the reversed sequence
+};
+
+
+
+
+// qsort() comparison for trieSeqPtr: orders by increasing nodeiid.
+// Returns -1, 0 or 1.
+//
+int
+trieSeqPtrCompare(const void *a, const void *b) {
+  const trieSeqPtr  *lhs = static_cast<const trieSeqPtr *>(a);
+  const trieSeqPtr  *rhs = static_cast<const trieSeqPtr *>(b);
+
+  if (lhs->nodeiid == rhs->nodeiid)
+    return(0);
+
+  return((lhs->nodeiid < rhs->nodeiid) ? -1 : 1);
+}
+
+
+// Adds the sequence S to the trie, making new nodes as needed, and
+// records in seqptr[] which node the sequence ends at.
+//
+//   nodes, nodesLen     the trie, and the number of nodes in use; node 0 is the root
+//   seqptr, seqptrLen   the list of added sequences, and its length
+//   S                   the sequence to add
+//   isReverse           stored with the sequence, not otherwise used here
+//
+// Returns 1 if the sequence was added.  Returns 0, changing nothing,
+// if it is shorter than 12 letters or has a letter not in the
+// alphabet.
+//
+// No space checks are done here.  The caller must have room for one
+// more seqptr entry and for one new node per letter of S.
+//
+uint32
+addSequence(trieNode   *nodes,  uint32  &nodesLen,
+            trieSeqPtr *seqptr, uint32  &seqptrLen,
+            seqInCore  *S,
+            bool        isReverse) {
+  char   *s = 0L;
+  uint32  n = 0;
+
+  if (S->sequenceLength() < 12)
+    return(0);
+
+  //  Check every letter before touching the trie.  The map has 256
+  //  entries, one per byte value; a plain char can be negative, so
+  //  it must go through unsigned char to be a valid index.
+
+  for (s = S->sequence(); *s; s++)
+    if (trieAlphaMap[(unsigned char)*s] == 0)
+      return(0);
+
+  for (s = S->sequence(); *s; s++) {
+    uint32  v = trieAlphaMap[(unsigned char)*s] - 1;
+
+    //  add a new pointer if needed
+    if (nodes[n].next[v] == ~uint32ZERO)
+      nodes[n].next[v] = nodesLen++;
+
+    //  Go there
+    n = nodes[n].next[v];
+  }
+
+  //  add this sequence to node n -- after all sequences have been
+  //  added, we'll sort this list and build pointers.
+
+  seqptr[seqptrLen].seqiid   = S->getIID();
+  seqptr[seqptrLen].nodeiid  = n;
+  seqptr[seqptrLen].defline  = strdup(S->header());
+  seqptr[seqptrLen].reversed = isReverse;
+  seqptrLen++;
+
+  return(1);
+}
+
+
+
+// Finds every exact occurrence of the query sequences in the genome
+// sequences.  The queries are loaded into a trie; each genome
+// sequence is then scanned once, following in the trie every match
+// that is still possible at the current letter.  Matches are written
+// to stdout in sim4db format, at most 999 per query.
+//
+//   -q queries.fasta   the (short) sequences to look for
+//   -g genome.fasta    the sequences to search
+//   -l logfile         optional; queries that could not be added, and
+//                      the number of matches of each query, go here
+//
+int
+main(int argc, char **argv) {
+  char   *queries    = 0L;
+  char   *genome     = 0L;
+  FILE   *logfile    = 0L;
+
+  int arg=1;
+  int err=0;
+  while (arg < argc) {
+
+    //  Every option takes a value; one given as the last word is an
+    //  error, instead of reading past the end of argv.
+    bool  hasValue = (arg + 1 < argc);
+
+    if        ((hasValue) && (strcmp(argv[arg], "-q") == 0)) {
+      queries = argv[++arg];
+    } else if ((hasValue) && (strcmp(argv[arg], "-g") == 0)) {
+      genome = argv[++arg];
+    } else if ((hasValue) && (strcmp(argv[arg], "-l") == 0)) {
+      //  Failure is reported by the return value; errno is only
+      //  meaningful after that.
+      logfile = fopen(argv[++arg], "w");
+      if (logfile == 0L)
+        fprintf(stderr, "Failed to open logfile '%s': %s\n", argv[arg], strerror(errno)), exit(1);
+    } else {
+      err++;
+    }
+    arg++;
+  }
+
+  if (queries == 0L)
+    err = 1;
+  if (genome == 0L)
+    err = 1;
+
+  if (err) {
+    fprintf(stderr, "usage: %s -q queries.fasta -g genome.fasta [-l logfile]\n", argv[0]);
+    fprintf(stderr, "  -q queries.fasta  -- the input with the short stuff\n");
+    fprintf(stderr, "  -g genome.fasta   -- the sequences to search for the short stuff\n");
+    fprintf(stderr, "  -l logfile        -- report failed queries and match counts here\n");
+    exit(1);
+  }
+
+  //  Map each letter, in either case, to its index in trieAlpha plus
+  //  one.  Zero is left to mean "not a letter we know".
+  //
+  for (uint32 i=0; i<ALPHALEN; i++) {
+    trieAlphaMap[(unsigned char)trieAlpha[i]]             = i+1;
+    trieAlphaMap[(unsigned char)(trieAlpha[i] + 'A' - 'a')] = i+1;
+  }
+
+
+  //  Build the trie from the queries.  We just allocate a large
+  //  number of nodes so we don't need to deal with reallocation.
+  //
+  uint32      nodesLen = 1;
+  uint32      nodesMax = 32 * 1024 * 1024;
+  trieNode   *nodes    = new trieNode [nodesMax];
+
+  seqCache   *F = new seqCache(queries);
+  seqInCore  *S = 0L;
+
+  uint32      seqptrLen = 0;
+  uint32      seqptrMax = 2 * 1024 * 1024;
+  trieSeqPtr *seqptr    = new trieSeqPtr [seqptrMax];
+
+  //  Number of matches per IID, not seqptr (which has two entries for
+  //  each iid, one forward, one reverse).
+  //
+  uint32     *nummatches = new uint32 [seqptrMax];
+  for (uint32 i=0; i<seqptrMax; i++)
+    nummatches[i] = 0;
+
+  //  The longest query added; this is the depth of the trie.
+  uint32      maxQueryLen = 0;
+
+  while ((S = F->getSequenceInCore()) != 0L) {
+    uint32  success = 0;
+    uint64  qlen    = strlen(S->sequence());
+
+    //  addSequence() does not check for space, so make sure there is
+    //  room before calling it: at most one node per letter and one
+    //  seqptr entry, for each of (at most) two orientations.  The
+    //  IID is used to index nummatches[].
+
+    if ((uint64)nodesLen + 2 * qlen > nodesMax)
+      fprintf(stderr, "ERROR: out of node space.\n"), exit(1);
+    if (seqptrLen + 2 > seqptrMax)
+      fprintf(stderr, "ERROR: out of seqptr space.\n"), exit(1);
+    if (S->getIID() >= seqptrMax)
+      fprintf(stderr, "ERROR: too many query sequences.\n"), exit(1);
+
+    success += addSequence(nodes, nodesLen, seqptr, seqptrLen, S, false);
+
+    if ((success > 0) && (maxQueryLen < qlen))
+      maxQueryLen = (uint32)qlen;
+
+#if ALPHALEN == 4
+    reverseComplementSequence(S->sequence(), S->sequenceLength());
+    success += addSequence(nodes, nodesLen, seqptr, seqptrLen, S, true);
+#else
+    success++;
+#endif
+
+    if (success != 2)
+      if (logfile)
+        fprintf(logfile, "Failed to add sequence '%s' ('%s').\n", S->header(), S->sequence());
+
+    delete S;
+  }
+
+  delete F;
+
+  fprintf(stderr, "Used " uint32FMT " trie nodes.  \n", nodesLen);
+
+  //  Fix up sequence pointers - we could probably do this inplace
+  //  with some trickery, but why?
+
+  qsort(seqptr, seqptrLen, sizeof(trieSeqPtr), trieSeqPtrCompare);
+
+  //  Now sorted by node iid, so run through both arrays and set
+  //  pointers.  We point to the first thing found, and remember
+  //  the number of things found.
+
+  for (uint32 i=0; i<seqptrLen; i++) {
+    uint32  ni = seqptr[i].nodeiid;
+
+    if (nodes[ni].seqptr == ~uint32ZERO)
+      nodes[ni].seqptr = i;
+
+    nodes[ni].numseq++;
+  }
+
+  //  The partial matches being followed.  Each one started at a
+  //  different letter, so each is at a different depth, and no depth
+  //  exceeds that of the trie: there are never more than maxQueryLen
+  //  of them.  (A fixed size of 256 overflowed on longer queries.)
+  //
+  uint32  *n    = new uint32 [maxQueryLen + 1];  //  Pointer into the trie
+  uint32  *d    = new uint32 [maxQueryLen + 1];  //  Depth this pointer is at (== sequence length - 1)
+
+  F = new seqCache(genome);
+  S = 0L;
+  while ((S = F->getSequenceInCore()) != 0L) {
+    char   *s    = S->sequence();
+    uint32  siid = S->getIID();
+    uint32  spos = 0;
+    uint32  nLen = 0;
+
+    while (*s) {
+      uint32  letter = trieAlphaMap[(unsigned char)*s];
+
+      if (letter == 0) {
+
+        //  Not a valid symbol, all node pointers are killed, no exact matches
+        //  possible!
+        nLen = 0;
+
+      } else {
+
+        //  Valid symbol.  Advance all pointers, kill any that are
+        //  now dead, print out any matches, and then finally add a
+        //  new one.
+
+        uint32  v = letter - 1;
+        uint32  ni;
+        uint32  nj;
+
+        //  Advance pointers.
+        //
+        for (ni=0; ni<nLen; ni++) {
+          n[ni] = nodes[n[ni]].next[v];
+          d[ni]++;
+        }
+
+        //  Kill any thing that is now dead - copy nj into ni
+        //
+        for (ni=0, nj=0; nj<nLen; nj++)
+          if (n[nj] != ~uint32ZERO) {
+            if (ni != nj) {
+              n[ni] = n[nj];
+              d[ni] = d[nj];
+            }
+            ni++;
+          }
+        nLen = ni;
+
+        //  Print any matches
+        //
+        for (ni=0; ni<nLen; ni++) {
+          for (nj=0; nj<nodes[n[ni]].numseq; nj++) {
+            uint32  p = nodes[n[ni]].seqptr + nj;
+
+            nummatches[seqptr[p].seqiid]++;
+
+            if (nummatches[seqptr[p].seqiid] == 1000) {
+              if (logfile)
+                fprintf(logfile, "sequence " uint32FMT " '%s' has too many matches, not reporting any more.\n",
+                        seqptr[p].seqiid,
+                        seqptr[p].defline);
+            } else if (nummatches[seqptr[p].seqiid] < 1000) {
+              fprintf(stdout, "sim4begin\n");
+              fprintf(stdout, uint32FMT "[" uint32FMT "-0-0] " uint32FMT "[0-0] <" uint32FMT "-0-100-%s-unknown>\n",
+                      seqptr[p].seqiid,
+                      d[ni] + 1,
+                      siid,
+                      d[ni] + 1,
+                      seqptr[p].reversed ? "complement" : "forward");
+              fprintf(stdout, "edef=%s\n", seqptr[p].defline);
+              fprintf(stdout, "ddef=%s\n", S->header());
+              fprintf(stdout, "1-" uint32FMT " (" uint32FMT "-" uint32FMT ") <" uint32FMT "-0-100>\n",
+                      d[ni] + 1,
+                      spos - d[ni] + 1,
+                      spos + 1,
+                      d[ni] + 1);
+              fprintf(stdout, "sim4end\n");
+            }
+          }
+        }
+
+        //  Add a new pointer for the just seen letter
+        //
+        if (nodes[0].next[v] != ~uint32ZERO) {
+          d[nLen]   = 0;
+          n[nLen++] = nodes[0].next[v];
+        }
+      }
+
+      s++;
+      spos++;
+    }
+
+    delete S;
+  }
+
+  //  We should print out the total number of matches for each
+  //  sequence....  Report those with matches first.
+  //
+  if (logfile) {
+    for (uint32 i=0; i<seqptrLen; i++)
+      if ((seqptr[i].reversed == false) && (nummatches[seqptr[i].seqiid] > 0))
+        fprintf(logfile, "sequence " uint32FMT " '%s' has " uint32FMT " matches.\n",
+                seqptr[i].seqiid,
+                seqptr[i].defline,
+                nummatches[seqptr[i].seqiid]);
+
+    for (uint32 i=0; i<seqptrLen; i++)
+      if ((seqptr[i].reversed == false) && (nummatches[seqptr[i].seqiid] == 0))
+        fprintf(logfile, "sequence " uint32FMT " '%s' has no matches.\n",
+                seqptr[i].seqiid,
+                seqptr[i].defline);
+
+    fclose(logfile);
+  }
+
+  delete    F;
+
+  delete [] n;
+  delete [] d;
+  delete [] nummatches;
+  delete [] seqptr;
+  delete [] nodes;
+
+  return(0);
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/kmer-tools.git
More information about the debian-med-commit
mailing list